Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion csrc/dispatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,8 @@ class Val;
f(StartCoalescing); \
f(EndCoalescing); \
f(ShareMemHandles); \
f(HirAliasSelect);
f(HirAliasSelect); \
f(Deallocate);

// Forward declarations for all Val and Expr types

Expand Down
48 changes: 36 additions & 12 deletions csrc/host_ir/executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,20 +330,35 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) {
for (auto& input : launch_kernel->inputs()) {
args.push(getKnownConcreteValue(input));
}

// If all output buffers are known already, pass them to the executor
KernelArgumentHolder outputs;
bool preallocated_outputs = false;
for (Val* output : launch_kernel->outputs()) {
if (isKnown(output)) {
preallocated_outputs = true;
outputs.push(getKnownConcreteValue(output));
}
}

NVF_ERROR(
outputs.empty() || outputs.size() == launch_kernel->outputs().size());

args.setDeviceIndex();

// run the compiled kernel
KernelArgumentHolder outputs =
container_->getKernelExecutor(launch_kernel->getIndex())
->run(
args,
{},
launch_kernel->launch_params(),
launch_kernel->compile_params());

// Store the outputs in the context
for (auto output_idx : arange(outputs.size())) {
bind(launch_kernel->outputs().at(output_idx), outputs[output_idx]);
outputs = container_->getKernelExecutor(launch_kernel->getIndex())
->run(
args,
outputs,
launch_kernel->launch_params(),
launch_kernel->compile_params());

if (!preallocated_outputs) {
// Store the outputs in the context
for (auto output_idx : arange(outputs.size())) {
bind(launch_kernel->outputs().at(output_idx), outputs[output_idx]);
}
}
}

Expand Down Expand Up @@ -688,7 +703,7 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) {
"Allocation must be on a TensorView but got ",
allocate->buffer());
TensorView* tv = allocate->buffer()->as<TensorView>();
if (expr_evaluator_.isKnown(tv)) {
if (isKnown(tv)) {
return;
}
GlobalBufferInfo info =
Expand Down Expand Up @@ -786,6 +801,15 @@ void HostIrEvaluator::handle(ReductionOp* reduction_op) {
}
}

// Frees the buffer bound by the corresponding kir::Allocate: the TensorView
// must currently be known to the evaluator; invalidating it drops the binding
// so the backing storage can be released.
void HostIrEvaluator::handle(Deallocate* deallocate) {
  auto* tv = deallocate->allocation()->buffer()->as<TensorView>();
  // Separator added so the printed TensorView does not run into the message
  // text ("...unknown TensorView: <tv>").
  NVF_ERROR(
      isKnown(tv),
      "Tried to free buffer associated with unknown TensorView: ",
      tv);
  invalidate(tv);
}

void HostIrEvaluator::unhandled(Statement* stmt) {
NVF_ERROR(stmt->isA<Expr>(), stmt, " must be an Expr");
auto* expr = stmt->as<Expr>();
Expand Down
1 change: 1 addition & 0 deletions csrc/host_ir/executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ class HostIrEvaluator final : public OptOutDispatch {
void handle(ReductionOp* reduction_op) override;
void handle(ShareMemHandles* share_mem_handles) override;
void handle(HirAliasSelect* hir_alias_select) override;
void handle(Deallocate* deallocate) override;
void unhandled(Statement* stmt) override;

c10::cuda::CUDAStream getCUDAStream(Stream* stream);
Expand Down
23 changes: 23 additions & 0 deletions csrc/host_ir/host_ir.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,29 @@ std::string LaunchKernel::toInlineString(int indent_size) const {
NVF_CHECK(false, "Can not be printed inline");
}

// Constructs a Deallocate expression that releases the buffer owned by
// `allocate`. The kir::Allocate is stored as attribute 0 (not as an input),
// which is how allocation() later retrieves it.
Deallocate::Deallocate(IrBuilderPasskey passkey, kir::Allocate* allocate)
    : Expr(passkey) {
  addAttribute(allocate);
}

NVFUSER_DEFINE_CLONE_AND_CREATE(Deallocate)

// Returns the kir::Allocate whose buffer this expression frees; it was stored
// as attribute 0 by the constructor.
const kir::Allocate* Deallocate::allocation() const {
  return attributes_.at(0)->as<kir::Allocate>();
}

// Multi-line rendering: "Deallocate {", the nested Allocate one level deeper,
// then a closing "}".
std::string Deallocate::toString(int indent_size) const {
  std::stringstream out;
  indent(out, indent_size) << "Deallocate {\n";
  out << allocation()->toString(indent_size + 1);
  indent(out, indent_size) << "}\n";
  return out.str();
}

// One-line rendering: "Deallocate " followed by the buffer's inline form.
// `indent_size` is unused for the inline form.
std::string Deallocate::toInlineString(int indent_size) const {
  std::stringstream ss;
  ss << "Deallocate " << allocation()->buffer()->toInlineString();
  return ss.str();
}

// Constructs a Stream Val, storing `index` as-is; see the class-level comment
// in the header for how a provided index is interpreted.
Stream::Stream(IrBuilderPasskey passkey, Val* index)
    : Val(passkey, ValType::Stream), index_(index) {}

Expand Down
21 changes: 21 additions & 0 deletions csrc/host_ir/host_ir.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,27 @@ class LaunchKernel : public Expr {
}
};

// Host IR expression marking the point at which the buffer created by a
// kir::Allocate may be freed. Like other IR nodes, instances are owned by the
// IR container, so copying and moving are disabled.
class Deallocate : public Expr {
 public:
  using Expr::Expr;
  // `allocate` identifies the buffer to free; it is stored as an attribute of
  // this expression.
  Deallocate(IrBuilderPasskey passkey, kir::Allocate* allocate);

  Deallocate(const Deallocate& other) = delete;
  Deallocate& operator=(const Deallocate& other) = delete;
  Deallocate(Deallocate&& other) = delete;
  Deallocate& operator=(Deallocate&& other) = delete;

  NVFUSER_DECLARE_CLONE_AND_CREATE

  std::string toString(int indent_size = 0) const override;
  std::string toInlineString(int indent_size = 0) const override;
  const char* getOpString() const override {
    return "hir::Deallocate";
  }

  // The kir::Allocate whose buffer this expression frees.
  const kir::Allocate* allocation() const;
};

class Stream : public Val {
public:
// if index is provided, the IR represents the streams whose index is the
Expand Down
31 changes: 31 additions & 0 deletions tests/cpp/test_host_ir_integration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
*/
// clang-format on
#include <fusion.h>
#include <global_allocator.h>
#include <host_ir/container.h>
#include <host_ir/executor.h>
#include <ir/all_nodes.h>
Expand Down Expand Up @@ -146,6 +147,36 @@ TEST_F(HostIrIntegrationTest, ViewPermute_ExprEval) {
executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__);
}

// Verifies that evaluating hir::Deallocate actually releases device memory:
// after many allocate/deallocate pairs, no memory remains allocated.
TEST_F(HostIrIntegrationTest, Deallocate) {
  const std::vector<int64_t> sizes = {8, 64};
  c10::DeviceIndex device_index = 0;

  // Start from a clean allocator state so the final memoryAllocated() check
  // is not polluted by caches left behind by earlier tests; clearing the
  // cuBLAS workspace and nvFuser's zeroed-memory pool is required for this
  // to hold in CI.
  resetPeakMemoryStats(device_index);
  at::cuda::clearCublasWorkspaces();
  nvfuser::releaseZeroedMemory();
  ASSERT_EQ(memoryAllocated(device_index), 0)
      << "Previous tests leaked memory.";

  auto hic = std::make_unique<HostIrContainer>();
  FusionGuard fg(hic.get());

  // Build ten allocate/deallocate pairs; if Deallocate did not free the
  // buffer, allocations would accumulate across iterations.
  for (int i = 0; i < 10; i++) {
    TensorView* tv = makeConcreteTensor(sizes);
    tv->setMemoryType(MemoryType::Global);
    auto* allocate = IrBuilder::create<kir::Allocate>(tv, MemoryType::Global);
    auto* deallocate = IrBuilder::create<Deallocate>(allocate);

    hic->pushBackTopLevelExprs(allocate);
    hic->pushBackTopLevelExprs(deallocate);
  }

  HostIrEvaluator hie(std::move(hic));

  hie.runWithInput({});

  EXPECT_EQ(memoryAllocated(device_index), 0);
}

} // namespace hir

} // namespace nvfuser
Loading