
Commit 35153e2

fix: time update and upstream reference
1 parent e6c7ec9 commit 35153e2

5 files changed: +77 −26 lines changed


src/orcapod/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@
 from .pipeline import Pipeline


+
 no_tracking = DEFAULT_TRACKER_MANAGER.no_tracking

 __all__ = [

src/orcapod/core/streams/pod_node_stream.py

Lines changed: 42 additions & 13 deletions
@@ -38,10 +38,10 @@ def __init__(self, pod_node: pp.PodNode, input_stream: cp.Stream, **kwargs):
         super().__init__(source=pod_node, upstreams=(input_stream,), **kwargs)
         self.pod_node = pod_node
         self.input_stream = input_stream
-        self._set_modified_time()  # set modified time to when we obtain the iterator
-        # capture the immutable iterator from the input stream

+        # capture the immutable iterator from the input stream
         self._prepared_stream_iterator = input_stream.iter_packets()
+        self._set_modified_time()  # set modified time to when we obtain the iterator

         # Packet-level caching (from your PodStream)
         self._cached_output_packets: list[tuple[cp.Tag, cp.Packet | None]] | None = None
@@ -134,7 +134,7 @@ def run(
         cached_results = []

         # identify all entries in the input stream for which we still have not computed packets
-        if filter is not None:
+        if len(args) > 0 or len(kwargs) > 0:
             input_stream_used = self.input_stream.polars_filter(*args, **kwargs)
         else:
             input_stream_used = self.input_stream
@@ -194,6 +194,7 @@ def run(

         if existing is not None and existing.num_rows > 0:
             # If there are existing entries, we can cache them
+            # TODO: cache them based on the record ID
             existing_stream = TableStream(existing, tag_columns=tag_keys)
             for tag, packet in existing_stream.iter_packets():
                 cached_results.append((tag, packet))
@@ -232,6 +233,14 @@ def run(

         self._cached_output_packets = cached_results
         self._set_modified_time()
+        self.pod_node.flush()
+        # TODO: evaluate proper handling of cache here
+        self.clear_cache()
+
+    def clear_cache(self) -> None:
+        self._cached_output_packets = None
+        self._cached_output_table = None
+        self._cached_content_hash_column = None

     def iter_packets(
         self, execution_engine: cp.ExecutionEngine | None = None
@@ -423,21 +432,41 @@ def as_table(

         converter = self.data_context.type_converter

-        struct_packets = converter.python_dicts_to_struct_dicts(all_packets)
-        all_tags_as_tables: pa.Table = pa.Table.from_pylist(
-            all_tags, schema=tag_schema
-        )
-        all_packets_as_tables: pa.Table = pa.Table.from_pylist(
-            struct_packets, schema=packet_schema
-        )
+        if len(all_tags) == 0:
+            tag_types, packet_types = self.pod_node.output_types(
+                include_system_tags=True
+            )
+            tag_schema = converter.python_schema_to_arrow_schema(tag_types)
+            source_entries = {
+                f"{constants.SOURCE_PREFIX}{c}": str for c in packet_types.keys()
+            }
+            packet_types.update(source_entries)
+            packet_types[constants.CONTEXT_KEY] = str
+            packet_schema = converter.python_schema_to_arrow_schema(packet_types)
+            total_schema = arrow_utils.join_arrow_schemas(tag_schema, packet_schema)
+            # return an empty table with the right schema
+            self._cached_output_table = pa.Table.from_pylist(
+                [], schema=total_schema
+            )
+        else:
+            struct_packets = converter.python_dicts_to_struct_dicts(all_packets)

-        self._cached_output_table = arrow_utils.hstack_tables(
-            all_tags_as_tables, all_packets_as_tables
-        )
+            all_tags_as_tables: pa.Table = pa.Table.from_pylist(
+                all_tags, schema=tag_schema
+            )
+            all_packets_as_tables: pa.Table = pa.Table.from_pylist(
+                struct_packets, schema=packet_schema
+            )
+
+            self._cached_output_table = arrow_utils.hstack_tables(
+                all_tags_as_tables, all_packets_as_tables
+            )
         assert self._cached_output_table is not None, (
             "_cached_output_table should not be None here."
         )

+        if self._cached_output_table.num_rows == 0:
+            return self._cached_output_table
         drop_columns = []
         if not include_source:
             drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1])
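
The new empty-result branch in as_table matters because pa.Table.from_pylist([]) without an explicit schema yields a zero-column table, which would break the column selection and drops that follow. A minimal standalone sketch of the idea in plain pyarrow; the column names are hypothetical and pa.unify_schemas stands in for arrow_utils.join_arrow_schemas:

import pyarrow as pa

# hypothetical tag and packet schemas for a pod with one tag and one output column
tag_schema = pa.schema([("sample_id", pa.string())])
packet_schema = pa.schema([
    ("result_path", pa.string()),
    ("_source_result_path", pa.string()),  # source column, akin to SOURCE_PREFIX-prefixed columns
    ("_context", pa.string()),             # context column, akin to CONTEXT_KEY
])

# join the schemas, then build an empty table that still carries every column
total_schema = pa.unify_schemas([tag_schema, packet_schema])
empty = pa.Table.from_pylist([], schema=total_schema)

assert empty.num_rows == 0
assert empty.column_names == [f.name for f in total_schema]
# downstream code can still select or drop columns without special-casing
print(empty.select(["sample_id", "result_path"]))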

src/orcapod/pipeline/graph.py

Lines changed: 5 additions & 4 deletions
@@ -192,7 +192,8 @@ def run(
         Current implementation uses a simple traversal through all nodes. Future versions
         may implement more efficient graph traversal algorithms.
         """
-        for node in self.nodes.values():
+        import networkx as nx
+        for node in nx.topological_sort(self.graph):
             if run_async:
                 synchronous_run(node.run_async, execution_engine=execution_engine)
             else:
@@ -215,7 +216,7 @@ def wrap_invocation(
                 pipeline_database=self.pipeline_database,
                 pipeline_path_prefix=self.pipeline_store_path_prefix,
                 label=invocation.label,
-                kernel_type="pod",
+                kernel_type="function",
             )
         elif invocation in self.invocation_to_source_lut:
             source = self.invocation_to_source_lut[invocation]
@@ -306,7 +307,7 @@ class GraphRenderer:
             "style": "filled",
             "typefontcolor": "#3A3737",  # dark gray
         },
-        "pod": {
+        "function": {
             "fillcolor": "#f5f5f5",  # off white
             "shape": "cylinder",
             "fontcolor": "#090271",  # darker navy blue
@@ -633,7 +634,7 @@ def create_custom_rules(
             "style": "filled",
             "type_font_color": operator_type_fcolor,
         },
-        "pod": {
+        "function": {
             "fillcolor": pod_bg,
             "shape": "box",
             "fontcolor": pod_main_fcolor,

src/orcapod/pipeline/nodes.py

Lines changed: 16 additions & 9 deletions
@@ -91,16 +91,15 @@ def pipeline_path(self) -> tuple[str, ...]:
         ...

     def validate_inputs(self, *streams: cp.Stream) -> None:
-        """Sources take no input streams."""
-        if len(streams) > 0:
-            raise NotImplementedError(
-                "At this moment, Node does not yet support handling additional input streams."
-            )
+        return

-    def forward(self, *streams: cp.Stream) -> cp.Stream:
-        # TODO: re-evaluate the use here -- consider semi joining with input streams
-        # super().validate_inputs(*self.input_streams)
-        return super().forward(*self.upstreams)  # type: ignore[return-value]
+    # def forward(self, *streams: cp.Stream) -> cp.Stream:
+    #     # TODO: re-evaluate the use here -- consider semi joining with input streams
+    #     # super().validate_inputs(*self.input_streams)
+    #     return super().forward(*self.upstreams)  # type: ignore[return-value]
+
+    def pre_kernel_processing(self, *streams: cp.Stream) -> tuple[cp.Stream, ...]:
+        return self.upstreams

     def kernel_output_types(
         self, *streams: cp.Stream, include_system_tags: bool = False
@@ -128,6 +127,9 @@ def get_all_records(
         """
         raise NotImplementedError("This method should be implemented by subclasses.")

+    def flush(self):
+        self.pipeline_database.flush()
+

 class KernelNode(NodeBase, WrappedKernel):
     """
@@ -264,6 +266,11 @@ def __init__(
             **kwargs,
         )

+    def flush(self):
+        self.pipeline_database.flush()
+        if self.result_database is not None:
+            self.result_database.flush()
+
     @property
     def contained_kernel(self) -> cp.Kernel:
         return self.pod

src/orcapod/protocols/pipeline_protocols.py

Lines changed: 13 additions & 0 deletions
@@ -35,6 +35,19 @@ def get_all_records(
         """
         ...

+    def flush(self):
+        """
+        Flush any in-memory data to persistent storage.
+
+        This method ensures that all buffered data is written to the underlying
+        storage system, making it durable and consistent. It is useful for:
+        - Ensuring data integrity before shutdown or restart
+        - Committing changes after a batch of operations
+        - Reducing memory usage by clearing buffers
+
+        """
+        ...
+
     def add_pipeline_record(
         self,
         tag: cp.Tag,
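
For context, a sketch of what an object satisfying this flush() protocol might look like; BufferedRecordStore and its JSON-lines backing file are hypothetical and only illustrate the buffer-then-persist contract the docstring describes:

import json
from pathlib import Path

class BufferedRecordStore:
    """Hypothetical store that buffers records in memory until flush()."""

    def __init__(self, path: str) -> None:
        self._path = Path(path)
        self._buffer: list[dict] = []

    def add_record(self, record: dict) -> None:
        # records stay in memory until flush() is called
        self._buffer.append(record)

    def flush(self) -> None:
        # write buffered records to persistent storage, then clear the buffer
        if not self._buffer:
            return
        with self._path.open("a") as f:
            for record in self._buffer:
                f.write(json.dumps(record) + "\n")
        self._buffer.clear()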
