From a9d5624d2f77f09ff86b72617b14d36d2332adf9 Mon Sep 17 00:00:00 2001
From: Nick Sarkauskas <nsarkauskas@nvidia.com>
Date: Tue, 29 Apr 2025 16:13:28 +0000
Subject: [PATCH 1/3] Revert "Revert "Deallocate HostIr Op and Test" (#4303)"

This reverts commit 85f98948a0e858fb78f05e9e61511056dfcc9661.
---
 csrc/dispatch.h                        |  3 +-
 csrc/host_ir/executor.cpp              | 48 +++++++++++++++++++-------
 csrc/host_ir/executor.h                |  1 +
 csrc/host_ir/host_ir.cpp               | 23 ++++++++++++
 csrc/host_ir/host_ir.h                 | 21 +++++++++++
 tests/cpp/test_host_ir_integration.cpp | 26 ++++++++++++++
 6 files changed, 109 insertions(+), 13 deletions(-)
diff --git a/csrc/dispatch.h b/csrc/dispatch.h
index 007287e49f7..4e09ef6fc02 100644
--- a/csrc/dispatch.h
+++ b/csrc/dispatch.h
@@ -161,7 +161,8 @@ class Val;
   f(StartCoalescing);                 \
   f(EndCoalescing);                   \
   f(ShareMemHandles);                 \
-  f(HirAliasSelect);
+  f(HirAliasSelect);                  \
+  f(Deallocate);
 
 // Forward declarations for all Val and Expr types
 
diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index 2f2cf9e7b92..2cfa242fab5 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -330,20 +330,35 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) {
   for (auto& input : launch_kernel->inputs()) {
     args.push(getKnownConcreteValue(input));
   }
+
+  // If all output buffers are known already, pass them to the executor
+  KernelArgumentHolder outputs;
+  bool preallocated_outputs = false;
+  for (Val* output : launch_kernel->outputs()) {
+    if (isKnown(output)) {
+      preallocated_outputs = true;
+      outputs.push(getKnownConcreteValue(output));
+    }
+  }
+
+  NVF_ERROR(
+      outputs.empty() || outputs.size() == launch_kernel->outputs().size());
+
   args.setDeviceIndex();
 
   // run the compiled kernel
-  KernelArgumentHolder outputs =
-      container_->getKernelExecutor(launch_kernel->getIndex())
-          ->run(
-              args,
-              {},
-              launch_kernel->launch_params(),
-              launch_kernel->compile_params());
-
-  // Store the outputs in the context
-  for (auto output_idx : arange(outputs.size())) {
-    bind(launch_kernel->outputs().at(output_idx), outputs[output_idx]);
+  outputs = container_->getKernelExecutor(launch_kernel->getIndex())
+                ->run(
+                    args,
+                    outputs,
+                    launch_kernel->launch_params(),
+                    launch_kernel->compile_params());
+
+  if (!preallocated_outputs) {
+    // Store the outputs in the context
+    for (auto output_idx : arange(outputs.size())) {
+      bind(launch_kernel->outputs().at(output_idx), outputs[output_idx]);
+    }
   }
 }
 
@@ -688,7 +703,7 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) {
       "Allocation must be on a TensorView but got ",
       allocate->buffer());
   TensorView* tv = allocate->buffer()->as<TensorView>();
-  if (expr_evaluator_.isKnown(tv)) {
+  if (isKnown(tv)) {
     return;
   }
   GlobalBufferInfo info =
@@ -786,6 +801,15 @@ void HostIrEvaluator::handle(ReductionOp* reduction_op) {
   }
 }
 
+void HostIrEvaluator::handle(Deallocate* deallocate) { // Invalidates its tv
+  auto* tv = deallocate->allocation()->buffer()->as<TensorView>();
+  NVF_ERROR(
+      isKnown(tv), // must be bound before it can be freed
+      "Tried to free buffer associated with unknown TensorView ",
+      tv);
+  invalidate(tv); // NOTE(review): presumably unbinds tv's tensor; confirm
+}
+
 void HostIrEvaluator::unhandled(Statement* stmt) {
   NVF_ERROR(stmt->isA<Expr>(), stmt, " must be an Expr");
   auto* expr = stmt->as<Expr>();
diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h
index 3f147b7801b..82a8010abbc 100644
--- a/csrc/host_ir/executor.h
+++ b/csrc/host_ir/executor.h
@@ -143,6 +143,7 @@ class HostIrEvaluator final : public OptOutDispatch {
   void handle(ReductionOp* reduction_op) override;
   void handle(ShareMemHandles* share_mem_handles) override;
   void handle(HirAliasSelect* hir_alias_select) override;
+  void handle(Deallocate* deallocate) override;
   void unhandled(Statement* stmt) override;
 
   c10::cuda::CUDAStream getCUDAStream(Stream* stream);
diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp
index bf3d5cef9eb..ec49a76bc11 100644
--- a/csrc/host_ir/host_ir.cpp
+++ b/csrc/host_ir/host_ir.cpp
@@ -153,6 +153,29 @@ std::string LaunchKernel::toInlineString(int indent_size) const {
   NVF_CHECK(false, "Can not be printed inline");
 }
 
+Deallocate::Deallocate(IrBuilderPasskey passkey, kir::Allocate* allocate)
+    : Expr(passkey) {
+  addAttribute(allocate); // read back by allocation() via attributes_.at(0)
+}
+
+NVFUSER_DEFINE_CLONE_AND_CREATE(Deallocate)
+
+const kir::Allocate* Deallocate::allocation() const {
+  return attributes_.at(0)->as<kir::Allocate>(); // the ctor's sole attribute
+}
+
+std::string Deallocate::toString(int indent_size) const {
+  std::stringstream out; // text form: header, nested Allocate, closing brace
+  indent(out, indent_size) << "Deallocate {" << '\n';
+  out << allocation()->toString(indent_size + 1);
+  indent(out, indent_size) << "}" << '\n';
+  return out.str();
+}
+
+std::string Deallocate::toInlineString(int indent_size) const { // arg unused
+  return "Deallocate " + allocation()->buffer()->toInlineString();
+}
+
 Stream::Stream(IrBuilderPasskey passkey, Val* index)
     : Val(passkey, ValType::Stream), index_(index) {}
 
diff --git a/csrc/host_ir/host_ir.h b/csrc/host_ir/host_ir.h
index d267d23ab1f..f8fd02e64a4 100644
--- a/csrc/host_ir/host_ir.h
+++ b/csrc/host_ir/host_ir.h
@@ -155,6 +155,27 @@ class LaunchKernel : public Expr {
   }
 };
 
+class Deallocate : public Expr { // Requests freeing a kir::Allocate's buffer
+ public:
+  using Expr::Expr;
+  Deallocate(IrBuilderPasskey passkey, kir::Allocate* allocate);
+
+  Deallocate(const Deallocate& other) = delete; // no copy, no move
+  Deallocate& operator=(const Deallocate& other) = delete;
+  Deallocate(Deallocate&& other) = delete;
+  Deallocate& operator=(Deallocate&& other) = delete;
+
+  NVFUSER_DECLARE_CLONE_AND_CREATE
+
+  std::string toString(int indent_size = 0) const override;
+  std::string toInlineString(int indent_size = 0) const override;
+  const char* getOpString() const override {
+    return "hir::Deallocate";
+  }
+
+  const kir::Allocate* allocation() const; // attribute 0, as kir::Allocate
+};
+
 class Stream : public Val {
  public:
   // if index is provided, the IR represents the streams whose index is the
diff --git a/tests/cpp/test_host_ir_integration.cpp b/tests/cpp/test_host_ir_integration.cpp
index 98a919727bd..f412cd6c03f 100644
--- a/tests/cpp/test_host_ir_integration.cpp
+++ b/tests/cpp/test_host_ir_integration.cpp
@@ -146,6 +146,32 @@ TEST_F(HostIrIntegrationTest, ViewPermute_ExprEval) {
       executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__);
 }
 
+TEST_F(HostIrIntegrationTest, Deallocate) {
+  const std::vector<int64_t> sizes = {8, 64};
+  c10::DeviceIndex device_index = 0;
+
+  resetPeakMemoryStats(device_index);
+
+  auto hic = std::make_unique<HostIrContainer>();
+  FusionGuard fg(hic.get());
+
+  for (int i = 0; i < 10; i++) { // ten Allocate/Deallocate pairs, in order
+    TensorView* tv = makeConcreteTensor(sizes);
+    tv->setMemoryType(MemoryType::Global);
+    auto* allocate = IrBuilder::create<kir::Allocate>(tv, MemoryType::Global);
+    auto* deallocate = IrBuilder::create<Deallocate>(allocate);
+
+    hic->pushBackTopLevelExprs(allocate);
+    hic->pushBackTopLevelExprs(deallocate); // queued right after its Allocate
+  }
+
+  HostIrEvaluator hie(std::move(hic));
+
+  hie.runWithInput({}); // run with an empty input list
+
+  EXPECT_EQ(memoryAllocated(device_index), 0); // expect nothing left allocated
+}
+
 } // namespace hir
 
 } // namespace nvfuser

From c1dc3a79e071c0d46573717b9814fcecad567728 Mon Sep 17 00:00:00 2001
From: Nick Sarkauskas <nsarkauskas@nvidia.com>
Date: Tue, 29 Apr 2025 16:34:14 +0000
Subject: [PATCH 2/3] Fix CI issue

---
 tests/cpp/test_host_ir_integration.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/cpp/test_host_ir_integration.cpp b/tests/cpp/test_host_ir_integration.cpp
index f412cd6c03f..a6addf68535 100644
--- a/tests/cpp/test_host_ir_integration.cpp
+++ b/tests/cpp/test_host_ir_integration.cpp
@@ -12,6 +12,7 @@
 #include <ops/all_ops.h>
 #include <tests/cpp/utils.h>
 #include <tests/cpp/validator.h>
+#include <global_allocator.h>
 
 namespace nvfuser {
 
@@ -151,6 +152,9 @@ TEST_F(HostIrIntegrationTest, Deallocate) {
   c10::DeviceIndex device_index = 0;
 
   resetPeakMemoryStats(device_index);
+  at::cuda::clearCublasWorkspaces();
+  nvfuser::releaseZeroedMemory();
+  ASSERT_EQ(memoryAllocated(device_index), 0) << "Previous tests leaked memory.";
 
   auto hic = std::make_unique<HostIrContainer>();
   FusionGuard fg(hic.get());

From c0a0e9cacd947aa7261f5a1989ae838ab579879a Mon Sep 17 00:00:00 2001
From: Nick Sarkauskas <nsarkauskas@nvidia.com>
Date: Tue, 29 Apr 2025 16:46:31 +0000
Subject: [PATCH 3/3] Lint fix

---
 tests/cpp/test_host_ir_integration.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/cpp/test_host_ir_integration.cpp b/tests/cpp/test_host_ir_integration.cpp
index a6addf68535..7f7506d3e3a 100644
--- a/tests/cpp/test_host_ir_integration.cpp
+++ b/tests/cpp/test_host_ir_integration.cpp
@@ -6,13 +6,13 @@
 */
 // clang-format on
 #include <fusion.h>
+#include <global_allocator.h>
 #include <host_ir/container.h>
 #include <host_ir/executor.h>
 #include <ir/all_nodes.h>
 #include <ops/all_ops.h>
 #include <tests/cpp/utils.h>
 #include <tests/cpp/validator.h>
-#include <global_allocator.h>
 
 namespace nvfuser {
 
@@ -154,7 +154,8 @@ TEST_F(HostIrIntegrationTest, Deallocate) {
   resetPeakMemoryStats(device_index);
   at::cuda::clearCublasWorkspaces();
   nvfuser::releaseZeroedMemory();
-  ASSERT_EQ(memoryAllocated(device_index), 0) << "Previous tests leaked memory.";
+  ASSERT_EQ(memoryAllocated(device_index), 0)
+      << "Previous tests leaked memory.";
 
   auto hic = std::make_unique<HostIrContainer>();
   FusionGuard fg(hic.get());