Pytato exec: don't wait on the kernel event after enqueueing

inducer · inducer · commit 4ab1514101e4 · 2025-03-28T11:57:56.000-05:00
diff --git a/arraycontext/impl/pytato/__init__.py b/arraycontext/impl/pytato/__init__.py
@@ -546,10 +546,9 @@ def _to_frozen(key: tuple[Any, ...], ary) -> TaggableCLArray:
                     self._dag_transform_cache[normalized_expr])
 
         assert len(pt_prg.bound_arguments) == 0
-        evt, out_dict = pt_prg(self.queue,
+        _evt, out_dict = pt_prg(self.queue,
                 allocator=self.allocator,
                 **bound_arguments)
-        evt.wait()
         assert len(set(out_dict) & set(key_to_frozen_subary)) == 0
 
         key_to_frozen_subary = {
diff --git a/arraycontext/impl/pytato/compile.py b/arraycontext/impl/pytato/compile.py
@@ -636,15 +636,10 @@ def __call__(self, arg_id_to_arg) -> ArrayContainer:
         input_kwargs_for_loopy = _args_to_device_buffers(
                 self.actx, self.input_id_to_name_in_program, arg_id_to_arg, fn_name)
 
-        evt, out_dict = self.pytato_program(queue=self.actx.queue,
+        _evt, out_dict = self.pytato_program(queue=self.actx.queue,
                                             allocator=self.actx.allocator,
                                             **input_kwargs_for_loopy)
 
-        # FIXME Kernels (for now) allocate tons of memory in temporaries. If we
-        # race too far ahead with enqueuing, there is a distinct risk of
-        # running out of memory. This mitigates that risk a bit, for now.
-        evt.wait()
-
         def to_output_template(keys, _):
             name_in_program = self.output_id_to_name_in_program[keys]
             return self.actx.thaw(to_tagged_cl_array(
@@ -680,15 +675,10 @@ def __call__(self, arg_id_to_arg) -> ArrayContainer:
         input_kwargs_for_loopy = _args_to_device_buffers(
                 self.actx, self.input_id_to_name_in_program, arg_id_to_arg, fn_name)
 
-        evt, out_dict = self.pytato_program(queue=self.actx.queue,
+        _evt, out_dict = self.pytato_program(queue=self.actx.queue,
                                             allocator=self.actx.allocator,
                                             **input_kwargs_for_loopy)
 
-        # FIXME Kernels (for now) allocate tons of memory in temporaries. If we
-        # race too far ahead with enqueuing, there is a distinct risk of
-        # running out of memory. This mitigates that risk a bit, for now.
-        evt.wait()
-
         return self.actx.thaw(to_tagged_cl_array(out_dict[self.output_name],
                                                  axes=get_cl_axes_from_pt_axes(
                                                      self.output_axes),