Commits
47 commits
ab15b1b
Add CUDA stream and event API for concurrent kernel execution
hughperkins Mar 11, 2026
b856b33
Address review feedback for CUDA streams PR
hughperkins Mar 12, 2026
a3c682b
Merge remote-tracking branch 'origin/main' into hp/streams-quadrantsi…
hughperkins Apr 19, 2026
9be110d
Apply clang-format
hughperkins Apr 20, 2026
14c3c22
Merge remote-tracking branch 'origin/main' into hp/streams-quadrantsi…
hughperkins Apr 24, 2026
d3cae3c
[Test] Exclude flaky test_perf_dispatch_python from Vulkan
hughperkins Apr 24, 2026
cd5b486
[Doc] Add user guide for streams API
hughperkins Apr 28, 2026
f036b46
Merge remote-tracking branch 'origin/main' into hp/streams-quadrantsi…
hughperkins Apr 28, 2026
59c2627
Merge remote-tracking branch 'origin/main' into hp/streams-quadrantsi…
hughperkins Apr 28, 2026
f2a2596
Reflow stream.py docstrings to 120c line width
hughperkins Apr 28, 2026
de99f3e
Unwrap prose lines in streams.md to match repo doc style
hughperkins Apr 28, 2026
d6876da
Merge branch 'main' into hp/streams-quadrantsic-1-cuda-streams
hughperkins May 1, 2026
401d6f8
Use CU_STREAM_NON_BLOCKING for user-created streams
hughperkins May 1, 2026
a3c98f8
Use async DtoH memcpy on active_stream for external array readback
hughperkins May 1, 2026
ca14f67
Guard destroy()/__exit__ against destroying externally-owned handles
hughperkins May 1, 2026
b46de06
Fix clang-format indentation for memcpy_device_to_host_async
hughperkins May 1, 2026
b9eef6e
Use async DtoH on active_stream for do-while loop counter readback
hughperkins May 1, 2026
f0dd7d6
Use active_stream for sizer device context staging
hughperkins May 1, 2026
8b3d4ed
Add make_current() to stream/event Program methods
hughperkins May 1, 2026
aa4a70f
Use async DtoH on active_stream for resolve_num_threads readback
hughperkins May 1, 2026
5901a7f
Sync active_stream at end of launch_llvm_kernel unconditionally
hughperkins May 1, 2026
8550aa0
Fix end-of-launcher sync: conditional + dealloc race
hughperkins May 1, 2026
6374cf3
Reject qd_stream inside autograd Tape context
hughperkins May 1, 2026
ca8ace3
Fix linter formatting; guard graph+stream; sync has_print on stream
hughperkins May 1, 2026
b1c6eea
Sync active_stream before adstack sizer stride readback
hughperkins May 1, 2026
3c6b24e
Add tests for stream/event context managers, event.synchronize, error…
hughperkins May 1, 2026
c549e07
Fix graph+stream error guard and test
hughperkins May 1, 2026
5d284ac
Update qd.sync() docstring and streams doc to reflect default-stream-…
hughperkins May 1, 2026
ff8056d
Reflow sync() docstring to 120-char line width
hughperkins May 1, 2026
c9c75bd
Merge remote-tracking branch 'origin/main' into hp/streams-quadrantsi…
hughperkins May 2, 2026
360adc8
Reject qd_stream on autodiff kernels
hughperkins May 2, 2026
e20fe99
Revert adstack sizer stream_synchronize
hughperkins May 2, 2026
e3c5f6f
Reset llvm_runtime_executor.cpp to upstream
hughperkins May 2, 2026
f6fee4f
Add test for qd_stream + autodiff kernel error guard
hughperkins May 2, 2026
de4d99d
Merge branch 'main' into hp/streams-quadrantsic-1-cuda-streams
hughperkins May 2, 2026
9fd8b7b
Extract stream/event methods from program.cpp into program_stream.cpp
hughperkins May 2, 2026
9e6f865
Introduce StreamManager delegate class for stream/event ops
hughperkins May 2, 2026
84ba5b0
Fix clang-format in program_stream.h
hughperkins May 2, 2026
b1b4ee6
Remove Program wrapper methods, bind StreamManager directly via pybind
hughperkins May 2, 2026
7e10267
Reflow comment in program_stream.h to 120-char width
hughperkins May 2, 2026
614c742
Use captured prog_ref for all Stream/Event operations
hughperkins May 2, 2026
3dad35a
Fix stale handle safety in Stream/Event after qd.reset()
hughperkins May 2, 2026
bebc904
Extract stream/event pybind bindings into export_stream.cpp
hughperkins May 2, 2026
3b09331
Fix clang-format in export_stream.cpp
hughperkins May 2, 2026
48c3922
Fall back to current runtime for Stream/Event destroy after reset
hughperkins May 2, 2026
44ee707
Reflow _destroy_prog docstrings to 120-char width
hughperkins May 2, 2026
c6278ff
Merge branch 'main' into hp/streams-quadrantsic-1-cuda-streams
hughperkins May 3, 2026
1 change: 1 addition & 0 deletions docs/source/user_guide/index.md
@@ -57,6 +57,7 @@ tile16

fastcache
graph
streams
perf_dispatch
init_options
```
138 changes: 138 additions & 0 deletions docs/source/user_guide/streams.md
@@ -0,0 +1,138 @@
# Streams

Streams allow concurrent execution of GPU operations. By default, all Quadrants kernels launch on the default stream, which serializes everything. By creating explicit streams, you can run independent kernels concurrently and control synchronization with events.

## Supported platforms

| Backend | Streams | Events | Notes |
|---------|---------|--------|-------|
| CUDA | Yes | Yes | Full concurrent execution |
| CPU | No-op | No-op | `qd_stream` is silently ignored, kernels run serially |
| Metal | No-op | No-op | `qd_stream` is silently ignored, kernels run serially |
| Vulkan | No-op | No-op | `qd_stream` is silently ignored, kernels run serially |

Comment on lines +5 to +13

🟡 The streams.md compatibility table at lines 7-12 lists CUDA, CPU, Metal, and Vulkan but omits AMDGPU, even though it is a public arch (qd.amdgpu, exported via python/quadrants/lang/misc.py and asserted in tests/python/test_api.py). The new Program::stream_create / event_create methods at program.cpp:497-541 only special-case Arch::cuda, so AMDGPU users fall through to the return 0 no-op path just like CPU/Metal/Vulkan — without any documentation telling them qd_stream is silently a no-op on their backend. Fix is to add an AMDGPU row mirroring the existing no-op rows.

Extended reasoning...

What the bug is

The new docs page docs/source/user_guide/streams.md introduces a backend compatibility table:

| Backend | Streams | Events | Notes |
|---------|---------|--------|-------|
| CUDA    | Yes     | Yes    | Full concurrent execution |
| CPU     | No-op   | No-op  | qd_stream is silently ignored, kernels run serially |
| Metal   | No-op   | No-op  | qd_stream is silently ignored, kernels run serially |
| Vulkan  | No-op   | No-op  | qd_stream is silently ignored, kernels run serially |

AMDGPU is missing entirely. It is a documented, public-facing arch in quadrants — python/quadrants/lang/misc.py:121 exposes it as qd.amdgpu, line 139 includes it in qd.gpu = [cuda, metal, vulkan, amdgpu], tests/python/test_api.py asserts 'amdgpu' is a public symbol, and Arch::amdgpu is referenced across many C++ files (e.g. program.cpp:394 and program.cpp:431 for ndarray creation/data-ptr handling).

Why it manifests as a no-op for AMDGPU users

Each of the new Program stream/event methods at program.cpp:497-541 (stream_create, stream_destroy, stream_synchronize, set_current_cuda_stream, event_create, event_destroy, event_record, event_synchronize, stream_wait_event) gates its body on compile_config().arch == Arch::cuda:

uint64 Program::stream_create() {
#ifdef QD_WITH_CUDA
  if (compile_config().arch == Arch::cuda) {
    CUDAContext::get_instance().make_current();
    void *stream = nullptr;
    CUDADriver::get_instance().stream_create(&stream, 0x1 /*CU_STREAM_NON_BLOCKING*/);
    return reinterpret_cast<uint64>(stream);
  }
#endif
  return 0;
}

For Arch::amdgpu the function falls through to return 0. The Python wrapper in stream.py then sees handle == 0 and treats the stream as a no-op: destroy() skips the C++ call (if self._handle != 0 and self._prog_ref is not None), synchronize() calls prog.stream_synchronize(0) which is also a no-op under the same gate, and the kernel-launch path in kernel.py skips set_current_cuda_stream because stream_handle is 0.

This is the same behavior as CPU/Metal/Vulkan — but only those three are documented in the table. The PR description even says "Non-CUDA backends return no-op handles (0)" confirming this is intended. AMDGPU just got left out of the table.

Step-by-step proof

  1. User runs on AMDGPU: qd.init(arch=qd.amdgpu).
  2. User reads streams.md, sees no AMDGPU row, and (reasonably) assumes either streams are supported (since AMDGPU is a GPU backend) or that it might error — neither is true.
  3. User writes s = qd.create_stream(); kernel(qd_stream=s) and expects either GPU concurrency or an error.
  4. `create_stream()` → `Program::stream_create()` → `compile_config().arch == Arch::cuda` is false → `return 0`.
  5. kernel(qd_stream=s) enters launch_kernel with stream_handle = s.handle = 0. The if stream_handle: guard at kernel.py:572 is false, so set_current_cuda_stream is never called.
  6. The kernel runs on the AMDGPU default queue, exactly as if no stream had been requested. Two kernels on different qd-streams are still serialized.
  7. The user has no way to discover this from the docs short of reading the C++ source.

Impact and fix

Documentation-only completeness gap with no runtime correctness impact. Severity is nit because the runtime behavior matches the other no-op backends — users just have no signpost telling them so for AMDGPU. The fix is one line in the table:

| AMDGPU  | No-op   | No-op  | qd_stream is silently ignored, kernels run serially |


Still unaddressed in the latest commit. The compatibility table at streams.md:7-12 lists only CUDA, CPU, Metal, and Vulkan; no AMDGPU row has been added. AMDGPU users still have no docs signpost telling them qd_stream is silently a no-op on their backend.


Still unaddressed in the latest commit (c549e07). The streams.md compatibility table (lines 7-12) still lists only CUDA, CPU, Metal, and Vulkan; no AMDGPU row has been added, so AMDGPU users still have no docs signpost telling them qd_stream is silently a no-op on their backend.


Still unaddressed in the latest commit (ff8056d). The compatibility table at streams.md:7-12 lists only CUDA, CPU, Metal, and Vulkan; no AMDGPU row has been added, so AMDGPU users still have no docs signpost telling them qd_stream is silently a no-op on their backend.

On backends without native stream support, `create_stream()` and `create_event()` return objects with handle `0`. All stream/event operations become no-ops and kernels run serially. Stream-based code is therefore portable across all backends: it runs unmodified everywhere, just without concurrency.
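
This fallback can be sketched as a minimal plain-Python model (a hypothetical stand-in, not the actual Quadrants class) to make the zero-handle semantics concrete:

```python
class NoopStream:
    """Models the documented fallback: handle 0 means every operation is a no-op."""

    def __init__(self, handle=0):
        self.handle = handle

    def synchronize(self):
        if self.handle == 0:
            return  # no native stream to wait on
        # a real backend would call stream_synchronize(self.handle) here

    def destroy(self):
        if self.handle == 0:
            return  # nothing to release
        self.handle = 0

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        self.destroy()


s = NoopStream()   # what create_stream() returns on CPU/Metal/Vulkan
s.synchronize()    # returns immediately
s.destroy()        # safe, repeatable
```

The same calling code thus runs on every backend; only the concurrency disappears.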

## Creating and using streams

```python
import quadrants as qd

qd.init(arch=qd.cuda)

N = 1024
a = qd.field(qd.f32, shape=(N,))
b = qd.field(qd.f32, shape=(N,))

@qd.kernel
def fill_a():
    for i in range(N):
        a[i] = 1.0

@qd.kernel
def fill_b():
    for i in range(N):
        b[i] = 2.0

s1 = qd.create_stream()
s2 = qd.create_stream()

fill_a(qd_stream=s1)
fill_b(qd_stream=s2)

s1.synchronize()
s2.synchronize()

s1.destroy()
s2.destroy()
```

Pass `qd_stream=` to any kernel call to launch it on that stream. Kernels on different streams may execute concurrently. Call `synchronize()` to block until all work on a stream completes.

## Events

Events let you express dependencies between streams without full synchronization.

```python
s1 = qd.create_stream()
s2 = qd.create_stream()

@qd.kernel
def produce():
    for i in range(N):
        a[i] = 10.0

@qd.kernel
def consume():
    for i in range(N):
        b[i] = a[i]

produce(qd_stream=s1)

e = qd.create_event()
e.record(s1) # record when s1 finishes produce()
e.wait(qd_stream=s2) # s2 waits for that event before proceeding

consume(qd_stream=s2) # safe to read a[] — produce() is guaranteed complete
s2.synchronize()

e.destroy()
s1.destroy()
s2.destroy()
```

`e.record(stream)` captures the point in `stream`'s execution. `e.wait(qd_stream=stream)` makes `stream` wait until the recorded point is reached. If `qd_stream` is omitted, the default stream waits.
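
The record/wait ordering is the same dependency pattern as a `threading.Event` in ordinary Python. The sketch below is only an analogy for the semantics, not the GPU mechanism:

```python
import threading

produced = threading.Event()
log = []

def produce():
    log.append("a filled")
    produced.set()    # ~ e.record(s1): mark this point in the producer's work

def consume():
    produced.wait()   # ~ e.wait(qd_stream=s2): block until the recorded point
    log.append("b copied from a")

t_consume = threading.Thread(target=consume)
t_produce = threading.Thread(target=produce)
t_consume.start()
t_produce.start()
t_produce.join()
t_consume.join()
print(log)  # ['a filled', 'b copied from a'] (ordering guaranteed by the event)
```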

## Context managers

Streams and events support `with` blocks for automatic cleanup:

```python
with qd.create_stream() as s:
    fill_a(qd_stream=s)
    s.synchronize()
# s.destroy() called automatically
```

## PyTorch interop (CUDA)

When mixing Quadrants kernels with PyTorch operations on CUDA, both frameworks must use the same stream to avoid race conditions. Without explicit stream management, Quadrants and PyTorch may launch work on different streams with no ordering guarantees, leading to intermittent data corruption.

### Running Quadrants kernels on PyTorch's stream

```python
import torch
from quadrants.lang.stream import Stream

torch_stream_ptr = torch.cuda.current_stream().cuda_stream
stream = Stream(torch_stream_ptr)
Comment on lines +105 to +108

🟡 The PyTorch interop example in streams.md uses from quadrants.lang.stream import Stream (private implementation path) instead of the public API qd.Stream. Since Stream is exported via qd.Stream (per __all__ in stream.py and re-exported by quadrants.lang.__init__), the example should be stream = qd.Stream(torch_stream_ptr). Reaching into the impl module is brittle to future internal refactors that would silently break the docs without breaking the public API.

Extended reasoning...

What the bug is

In docs/source/user_guide/streams.md:103-113, the PyTorch interop section shows:

import torch
from quadrants.lang.stream import Stream

torch_stream_ptr = torch.cuda.current_stream().cuda_stream
stream = Stream(torch_stream_ptr)

This imports Stream directly from the implementation module quadrants.lang.stream rather than using the public qd.Stream namespace.

Why qd.Stream is the right reference

Stream is genuinely part of the public API:

  • python/quadrants/lang/stream.py:130 declares __all__ = ["Stream", "Event", "create_stream", "create_event"].
  • python/quadrants/lang/__init__.py:19 does from quadrants.lang.stream import *, picking up that __all__.
  • quadrants/__init__.py further re-exports via from quadrants.lang import *, so qd.Stream resolves.
  • This PR explicitly adds Stream to the asserted public-API surface in tests/python/test_api.py:65 (and Event, create_stream, create_event alongside).

The rest of the same docs file consistently uses the qd.-prefixed public API (qd.create_stream(), qd.create_event(), qd.field, qd.kernel, qd.sync(), etc.). This single PyTorch-interop example is the only place that reaches into the impl module path.

Why this matters

It is functional today, so this is a documentation nit rather than a runtime bug. The cost is forward-compatibility: any future refactor of python/quadrants/lang/stream.py (e.g. moving Stream to a different submodule, splitting the file, or renaming it) would silently break this docs example without breaking the public API contract that the PR establishes via test_api.py. Consumers who copy-paste the docs example would get an ImportError at a path the project never promised to keep stable, and the test suite would not catch it.

How to fix

Drop the import line and use the qd-prefixed name in the assignment, matching every other example on this page:

import torch

torch_stream_ptr = torch.cuda.current_stream().cuda_stream
stream = qd.Stream(torch_stream_ptr)

physics_kernel(qd_stream=stream)
observations = compute_obs_tensor()
apply_actions_kernel(qd_stream=stream)

This is a one-line edit in docs/source/user_guide/streams.md.


physics_kernel(qd_stream=stream)
observations = compute_obs_tensor() # PyTorch op on the same stream
apply_actions_kernel(qd_stream=stream)
```

Wrap PyTorch's raw `CUstream` pointer in a Quadrants `Stream` object. Do **not** call `destroy()` on this wrapper — PyTorch owns the underlying stream.
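
The ownership rule can be modeled in plain Python (a hypothetical stand-in for the wrapper's guard, not the actual Quadrants implementation): a stream constructed from an existing pointer marks itself externally owned, so `destroy()` degrades to a no-op.

```python
class WrappedStream:
    """Models the guard: externally-owned handles are never destroyed by us."""

    def __init__(self, handle, external=True):
        self.handle = handle
        self._external = external  # True when wrapping e.g. PyTorch's stream

    def destroy(self):
        if self._external:
            return  # the real owner (PyTorch) manages the lifetime
        # a real backend would release the native stream here
        self.handle = 0


s = WrappedStream(0xDEADBEEF)  # wrapping a pointer we do not own
s.destroy()                    # no-op: the handle survives
```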

### Running PyTorch operations on a Quadrants stream

```python
qd_stream = qd.create_stream()
torch_stream = torch.cuda.ExternalStream(qd_stream.handle)

with torch.cuda.stream(torch_stream):
    physics_kernel(qd_stream=qd_stream)
    observations = compute_obs_tensor()
    apply_actions_kernel(qd_stream=qd_stream)

qd_stream.destroy()
```

`Stream.handle` is the raw `CUstream` pointer, which `torch.cuda.ExternalStream` accepts directly.

## Limitations

- **Not compatible with graphs.** Do not pass `qd_stream` to a kernel decorated with `graph=True`.
- **Not compatible with autodiff.** Do not pass `qd_stream` to a kernel that uses reverse-mode or forward-mode differentiation, or inside a `qd.ad.Tape` context.
- **`qd.sync()` only waits on the default stream.** It does not drain explicit streams. Call `stream.synchronize()` on each stream you need to wait for.
- **No automatic synchronization.** You are responsible for inserting events or `synchronize()` calls when one stream's output is another stream's input.
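
Because `qd.sync()` does not drain explicit streams, one workable pattern is to keep the streams you create in a list and synchronize them all before reading results. The helper below is hypothetical, sketched against a stand-in stream object rather than the real API:

```python
class RecordingStream:
    """Stand-in for a created stream; records that synchronize() was called."""

    def __init__(self):
        self.synced = False

    def synchronize(self):
        self.synced = True


def sync_all(streams):
    # qd.sync() would only cover the default stream; explicit streams
    # must each be drained individually.
    for s in streams:
        s.synchronize()


streams = [RecordingStream() for _ in range(3)]
sync_all(streams)
print(all(s.synced for s in streams))  # True
```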
2 changes: 2 additions & 0 deletions python/quadrants/lang/__init__.py
@@ -16,6 +16,7 @@
from quadrants.lang.runtime_ops import *
from quadrants.lang.snode import *
from quadrants.lang.source_builder import *
from quadrants.lang.stream import *
from quadrants.lang.struct import *
from quadrants.types.enums import DeviceCapability, Format, Layout # noqa: F401

@@ -47,6 +48,7 @@
    "shell",
    "snode",
    "source_builder",
    "stream",
    "struct",
    "util",
]
33 changes: 30 additions & 3 deletions python/quadrants/lang/kernel.py
@@ -453,7 +453,9 @@ def materialize(self, key: "CompiledKernelKeyType | None", py_args: tuple[Any, ...
        ]
        runtime._current_global_context = None

    def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: CompiledKernelData | None, *args) -> Any:
    def launch_kernel(
        self, key, t_kernel: KernelCxx, compiled_kernel_data: CompiledKernelData | None, *args, qd_stream=None
    ) -> Any:
        assert len(args) == len(self.arg_metas), f"{len(self.arg_metas)} arguments needed but {len(args)} provided"

        callbacks: list[Callable[[], None]] = []
@@ -567,9 +569,21 @@ def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: Compiled
            self.src_ll_cache_observations.cache_stored = True
            self._last_compiled_kernel_data = compiled_kernel_data
            launch_ctx.use_graph = self.use_graph and _GRAPH_ENABLED
            if self.use_graph and qd_stream is not None:
                raise RuntimeError(
                    "qd_stream is not compatible with graph=True kernels. "
                    "See docs/source/user_guide/streams.md for details."
                )
claude[bot] marked this conversation as resolved.
            if self.graph_do_while_arg is not None and hasattr(self, "_graph_do_while_cpp_arg_id"):
                launch_ctx.graph_do_while_arg_id = self._graph_do_while_cpp_arg_id
            prog.launch_kernel(compiled_kernel_data, launch_ctx)
            stream_handle = qd_stream.handle if qd_stream is not None else 0
            if stream_handle:
                prog.set_current_cuda_stream(stream_handle)
            try:
                prog.launch_kernel(compiled_kernel_data, launch_ctx)
            finally:
                if stream_handle:
                    prog.set_current_cuda_stream(0)
        except Exception as e:
            e = handle_exception_from_cpp(e)
            if impl.get_runtime().print_full_traceback:
@@ -581,6 +595,8 @@ def launch_kernel(self, key, t_kernel: KernelCxx, compiled_kernel_data: Compiled

        return_type = self.return_type
        if return_type or self.has_print:
            if qd_stream is not None and self.has_print and not return_type:
                qd_stream.synchronize()
            runtime_ops.sync()

        if not return_type:
@@ -647,6 +663,17 @@ def ensure_compiled(self, *py_args: tuple[Any, ...]) -> tuple[Callable, int, Aut
    # Thus this part needs to be fast. (i.e. < 3us on a 4 GHz x64 CPU)
    @_shell_pop_print
    def __call__(self, *py_args, **kwargs) -> Any:
        qd_stream = kwargs.pop("qd_stream", None)
        if qd_stream is not None and self.autodiff_mode != _NONE:
            raise RuntimeError(
                "qd_stream is not compatible with autodiff kernels. Streams cannot be used with "
                "reverse-mode or forward-mode differentiation."
            )
        if qd_stream is not None and self.runtime.target_tape:
            raise RuntimeError(
                "qd_stream is not compatible with autograd Tape. Launch the kernel outside the Tape "
                "context, or omit qd_stream."
            )
Comment on lines +666 to +676

🔴 FwdMode + qd_stream is not rejected at kernel.py:658-663 (only target_tape is checked), but FwdMode.__enter__ writes the autodiff seed via param.dual[None] = ... on the NULL stream with no post-sync. The user's subsequent forward kernel queues on a CU_STREAM_NON_BLOCKING qd_stream with no implicit ordering against NULL — the forward reads param.dual before the seed write retires, producing silently-wrong dual outputs. Fix is the obvious symmetric extension of the Tape rejection: also check self.runtime.fwd_mode_manager at line 659, and update streams.md Limitations to mention FwdMode.

Extended reasoning...

What the bug is

Kernel.__call__ at python/quadrants/lang/kernel.py:658-663 rejects qd_stream when self.runtime.target_tape is set, but does not perform an analogous check for self.runtime.fwd_mode_manager. FwdMode exhibits the same kind of seed-write/user-kernel race that the Tape rejection was designed to prevent.

The exact code path that triggers it

  1. FwdMode.__enter__ (quadrants/ad/_ad.py:478-486) writes the autodiff seed via self.param.dual[None] = 1.0 * self.seed[0] (or self.param.dual[0] = ... / self.param.dual.from_numpy(...) for non-scalar seeds). The single-element write goes through ScalarField.__setitem__SNodeHostAccessor.settersnode.write_float, pybind11-bound to SNodeRwAccessorsBank::Accessors::write_float (quadrants/program/snode_rw_accessors_bank.cpp:31-39).

  2. write_float calls prog_->synchronize() BEFORE the writer kernel launch, then calls prog_->launch_kernel(...) which descends through KernelLauncher::launch_llvm_kernel (this PR's file). The writer kernel is a single-element field write: transfers.size() == 0, result_buffer_size == 0 (no insert_ret in Program::get_snode_writer), and has_print is false. Neither end-of-launcher branch at kernel_launcher.cpp:255-268 fires, so the writer kernel is left in flight on the NULL stream (active_stream is still nullptr at this point — no qd_stream is active during __enter__).

  3. After __enter__ returns, self.runtime.fwd_mode_manager is set (_ad.py:489) but target_tape is not.

  4. The user calls f(qd_stream=s). kernel.py:658-663 only checks target_tapefwd_mode_manager is missed. FwdModeManager.insert at line 709 transforms the kernel to autodiff_mode=FORWARD; the forward-transformed kernel is launched at line 726 with qd_stream=s.

  5. s is a CU_STREAM_NON_BLOCKING stream (program.cpp:500). Per CUDA semantics, non-blocking streams have no implicit ordering with the legacy NULL stream — that is the entire point of the non-blocking flag this PR introduces. The forward kernel can start reading param.dual before the writer kernel on NULL retires.

Step-by-step proof

import quadrants as qd
qd.init(arch=qd.cuda)

x = qd.field(qd.f32, shape=(), needs_grad=True)
loss = qd.field(qd.f32, shape=(), needs_grad=True)

@qd.kernel
def f():
    loss[None] = x[None] * x[None]

x[None] = 3.0
s = qd.create_stream()  # CU_STREAM_NON_BLOCKING
with qd.ad.FwdMode(loss=loss, param=x, seed=[1.0]):
    # __enter__ ran:
    #   - clear_all_gradients(DUAL) → x.dual[None] = 0.0 written on NULL stream
    #   - then x.dual[None] = 1.0 written on NULL stream via write_float
    #   - write_float pre-syncs but does NOT post-sync; writer is in flight
    f(qd_stream=s)
    # FORWARD-transformed kernel queues on s (non-blocking).
    # No ordering with NULL — forward reads x.dual before writer retires.
    # Reads stale 0.0 instead of 1.0 → JVP computed against zero seed.
s.synchronize()
print(loss.grad[None])  # Expected ~6.0 (= 2*x*1.0). Race → may be 0.0.

The trailing kernel_launcher sync at lines 266-268 is the user's forward kernel's own sync (it drains s, not the unrelated NULL-stream writer that ran during __enter__). The qd.sync() path (which lowers to cuStreamSynchronize(NULL)) is also not invoked between __enter__ and f(qd_stream=s).

Mirror race in __exit__'s clear_seed: writes dual=0 on NULL stream while the user's forward kernel may still be in flight on s; can overwrite the seed before the user-launched forward kernel has read it.

Why existing safeguards do not cover it

  • docs/source/user_guide/streams.md Limitations section lists Tape as incompatible (added by commit 6374cf3) but says nothing about FwdMode. A user reading the docs reasonably assumes FwdMode + qd_stream is supported, especially since FwdMode is exported in qd.ad and exercised by multiple tests (test_ad_basics_fwd, test_ad_atomic_fwd, test_ad_for_fwd, test_ad_if_fwd, test_adstack).
  • The "No automatic synchronization" doc note covers user-authored writes (field[i] = v from user code), but the seed write here is an INTERNAL operation done by FwdMode.__enter__ that the user did not author and cannot easily sync against.
  • No test exercises FwdMode + qd_stream.
  • The qd.sync() races bug is related but distinct: even after fixing qd.sync() to use cuCtxSynchronize, this race still occurs because there is no qd.sync() between __enter__ and the user's kernel call.

Why pre-PR was correct

Before this PR, all kernel launches went on the NULL stream, so the seed writer (on NULL) and the forward kernel (also on NULL) were sequentially ordered through the same stream. No race was possible. The PR introduces CU_STREAM_NON_BLOCKING user streams and the qd_stream API — for Tape it explicitly rejects the combination at kernel.py:659-663 (commit 6374cf3), but the analogous protection for FwdMode was missed.

How to fix

Mirror the Tape rejection at kernel.py:659:

if qd_stream is not None and (self.runtime.target_tape or self.runtime.fwd_mode_manager):
    raise RuntimeError(
        "qd_stream is not compatible with autograd Tape / FwdMode. ..."
    )

And add FwdMode to the streams.md Limitations section (which currently only mentions Tape).

Alternatively, synchronize the seed write inside FwdMode.__enter__ before returning, but that's more invasive and still leaves the user responsible for syncing dual fields after the forward kernel — option (1) is consistent with what was done for Tape and avoids any surprising behavior.

        if impl.get_runtime()._arch == _ARCH_PYTHON:
            return self.func(*py_args, **kwargs)
        config = impl.current_cfg()
claude[bot] marked this conversation as resolved.
@@ -709,7 +736,7 @@ def __call__(self, *py_args, **kwargs) -> Any:
        kernel_cpp = self.materialized_kernels[key]
        compiled_kernel_data = self.compiled_kernel_data_by_key.get(key, None)
        self.launch_observations.found_kernel_in_materialize_cache = compiled_kernel_data is not None
        ret = self.launch_kernel(key, kernel_cpp, compiled_kernel_data, *py_args)
        ret = self.launch_kernel(key, kernel_cpp, compiled_kernel_data, *py_args, qd_stream=qd_stream)
        if compiled_kernel_data is None:
            assert self._last_compiled_kernel_data is not None
            self.compiled_kernel_data_by_key[key] = self._last_compiled_kernel_data
6 changes: 4 additions & 2 deletions python/quadrants/lang/runtime_ops.py
@@ -4,8 +4,10 @@


def sync():
    """Blocks the calling thread until all the previously
    launched Quadrants kernels have completed.
    """Synchronizes the default stream.

    Blocks the calling thread until all work on the default GPU stream has completed. Kernels launched on explicit
    streams created via :func:`quadrants.create_stream` are **not** waited on — call ``stream.synchronize()`` for those.
    """
    impl.get_runtime().sync()
