From a9d5624d2f77f09ff86b72617b14d36d2332adf9 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Tue, 29 Apr 2025 16:13:28 +0000 Subject: [PATCH 1/3] Revert "Revert "Deallocate HostIr Op and Test" (#4303)" This reverts commit 85f98948a0e858fb78f05e9e61511056dfcc9661. --- csrc/dispatch.h | 3 +- csrc/host_ir/executor.cpp | 48 +++++++++++++++++++------- csrc/host_ir/executor.h | 1 + csrc/host_ir/host_ir.cpp | 23 ++++++++++++ csrc/host_ir/host_ir.h | 21 +++++++++++ tests/cpp/test_host_ir_integration.cpp | 26 ++++++++++++++ 6 files changed, 109 insertions(+), 13 deletions(-) diff --git a/csrc/dispatch.h b/csrc/dispatch.h index 007287e49f7..4e09ef6fc02 100644 --- a/csrc/dispatch.h +++ b/csrc/dispatch.h @@ -161,7 +161,8 @@ class Val; f(StartCoalescing); \ f(EndCoalescing); \ f(ShareMemHandles); \ - f(HirAliasSelect); + f(HirAliasSelect); \ + f(Deallocate); // Forward declarations for all Val and Expr types diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 2f2cf9e7b92..2cfa242fab5 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -330,20 +330,35 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { for (auto& input : launch_kernel->inputs()) { args.push(getKnownConcreteValue(input)); } + + // If all output buffers are known already, pass them to the executor + KernelArgumentHolder outputs; + bool preallocated_outputs = false; + for (Val* output : launch_kernel->outputs()) { + if (isKnown(output)) { + preallocated_outputs = true; + outputs.push(getKnownConcreteValue(output)); + } + } + + NVF_ERROR( + outputs.empty() || outputs.size() == launch_kernel->outputs().size()); + args.setDeviceIndex(); // run the compiled kernel - KernelArgumentHolder outputs = - container_->getKernelExecutor(launch_kernel->getIndex()) - ->run( - args, - {}, - launch_kernel->launch_params(), - launch_kernel->compile_params()); - - // Store the outputs in the context - for (auto output_idx : arange(outputs.size())) { - bind(launch_kernel->outputs().at(output_idx), outputs[output_idx]); + outputs = container_->getKernelExecutor(launch_kernel->getIndex()) + ->run( + args, + outputs, + launch_kernel->launch_params(), + launch_kernel->compile_params()); + + if (!preallocated_outputs) { + // Store the outputs in the context + for (auto output_idx : arange(outputs.size())) { + bind(launch_kernel->outputs().at(output_idx), outputs[output_idx]); + } } } @@ -688,7 +703,7 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { "Allocation must be on a TensorView but got ", allocate->buffer()); TensorView* tv = allocate->buffer()->as(); - if (expr_evaluator_.isKnown(tv)) { + if (isKnown(tv)) { return; } GlobalBufferInfo info = @@ -786,6 +801,15 @@ void HostIrEvaluator::handle(ReductionOp* reduction_op) { } } +void HostIrEvaluator::handle(Deallocate* deallocate) { + auto* tv = deallocate->allocation()->buffer()->as(); + NVF_ERROR( + isKnown(tv), + "Tried to free buffer associated with unknown TensorView", + tv); + invalidate(tv); +} + void HostIrEvaluator::unhandled(Statement* stmt) { NVF_ERROR(stmt->isA(), stmt, " must be an Expr"); auto* expr = stmt->as(); diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index 3f147b7801b..82a8010abbc 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -143,6 +143,7 @@ class HostIrEvaluator final : public OptOutDispatch { void handle(ReductionOp* reduction_op) override; void handle(ShareMemHandles* share_mem_handles) override; void handle(HirAliasSelect* hir_alias_select) override; + void handle(Deallocate* deallocate) override; void unhandled(Statement* stmt) override; c10::cuda::CUDAStream getCUDAStream(Stream* stream); diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp index bf3d5cef9eb..ec49a76bc11 100644 --- a/csrc/host_ir/host_ir.cpp +++ b/csrc/host_ir/host_ir.cpp @@ -153,6 +153,29 @@ std::string LaunchKernel::toInlineString(int indent_size) const { NVF_CHECK(false, "Can not be printed inline"); } +Deallocate::Deallocate(IrBuilderPasskey passkey, kir::Allocate* allocate) + : Expr(passkey) { + addAttribute(allocate); +} + +NVFUSER_DEFINE_CLONE_AND_CREATE(Deallocate) + +const kir::Allocate* Deallocate::allocation() const { + return attributes_.at(0)->as(); +} + +std::string Deallocate::toString(int indent_size) const { + std::stringstream ss; + indent(ss, indent_size) << "Deallocate {" << std::endl; + ss << allocation()->toString(indent_size + 1); + indent(ss, indent_size) << "}" << std::endl; + return ss.str(); +} + +std::string Deallocate::toInlineString(int indent_size) const { + return std::string("Deallocate ") + allocation()->buffer()->toInlineString(); +} + Stream::Stream(IrBuilderPasskey passkey, Val* index) : Val(passkey, ValType::Stream), index_(index) {} diff --git a/csrc/host_ir/host_ir.h b/csrc/host_ir/host_ir.h index d267d23ab1f..f8fd02e64a4 100644 --- a/csrc/host_ir/host_ir.h +++ b/csrc/host_ir/host_ir.h @@ -155,6 +155,27 @@ class LaunchKernel : public Expr { } }; +class Deallocate : public Expr { + public: + using Expr::Expr; + Deallocate(IrBuilderPasskey passkey, kir::Allocate* allocate); + + Deallocate(const Deallocate& other) = delete; + Deallocate& operator=(const Deallocate& other) = delete; + Deallocate(Deallocate&& other) = delete; + Deallocate& operator=(Deallocate&& other) = delete; + + NVFUSER_DECLARE_CLONE_AND_CREATE + + std::string toString(int indent_size = 0) const override; + std::string toInlineString(int indent_size = 0) const override; + const char* getOpString() const override { + return "hir::Deallocate"; + } + + const kir::Allocate* allocation() const; +}; + class Stream : public Val { public: // if index is provided, the IR represents the streams whose index is the diff --git a/tests/cpp/test_host_ir_integration.cpp b/tests/cpp/test_host_ir_integration.cpp index 98a919727bd..f412cd6c03f 100644 --- a/tests/cpp/test_host_ir_integration.cpp +++ b/tests/cpp/test_host_ir_integration.cpp @@ -146,6 +146,32 @@ TEST_F(HostIrIntegrationTest, ViewPermute_ExprEval) { executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } +TEST_F(HostIrIntegrationTest, Deallocate) { + const std::vector sizes = {8, 64}; + c10::DeviceIndex device_index = 0; + + resetPeakMemoryStats(device_index); + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + for (int i = 0; i < 10; i++) { + TensorView* tv = makeConcreteTensor(sizes); + tv->setMemoryType(MemoryType::Global); + auto* allocate = IrBuilder::create(tv, MemoryType::Global); + auto* deallocate = IrBuilder::create(allocate); + + hic->pushBackTopLevelExprs(allocate); + hic->pushBackTopLevelExprs(deallocate); + } + + HostIrEvaluator hie(std::move(hic)); + + hie.runWithInput({}); + + EXPECT_EQ(memoryAllocated(device_index), 0); +} + } // namespace hir } // namespace nvfuser From c1dc3a79e071c0d46573717b9814fcecad567728 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Tue, 29 Apr 2025 16:34:14 +0000 Subject: [PATCH 2/3] Fix CI issue --- tests/cpp/test_host_ir_integration.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/cpp/test_host_ir_integration.cpp b/tests/cpp/test_host_ir_integration.cpp index f412cd6c03f..a6addf68535 100644 --- a/tests/cpp/test_host_ir_integration.cpp +++ b/tests/cpp/test_host_ir_integration.cpp @@ -12,6 +12,7 @@ #include #include #include +#include namespace nvfuser { @@ -151,6 +152,9 @@ TEST_F(HostIrIntegrationTest, Deallocate) { c10::DeviceIndex device_index = 0; resetPeakMemoryStats(device_index); + at::cuda::clearCublasWorkspaces(); + nvfuser::releaseZeroedMemory(); + ASSERT_EQ(memoryAllocated(device_index), 0) << "Previous tests leaked memory."; auto hic = std::make_unique(); FusionGuard fg(hic.get()); From c0a0e9cacd947aa7261f5a1989ae838ab579879a Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Tue, 29 Apr 2025 16:46:31 +0000 Subject: [PATCH 3/3] Lint fix --- tests/cpp/test_host_ir_integration.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/cpp/test_host_ir_integration.cpp b/tests/cpp/test_host_ir_integration.cpp index a6addf68535..7f7506d3e3a 100644 --- a/tests/cpp/test_host_ir_integration.cpp +++ b/tests/cpp/test_host_ir_integration.cpp @@ -6,13 +6,13 @@ */ // clang-format on #include +#include #include #include #include #include #include #include -#include namespace nvfuser { @@ -154,7 +154,8 @@ TEST_F(HostIrIntegrationTest, Deallocate) { resetPeakMemoryStats(device_index); at::cuda::clearCublasWorkspaces(); nvfuser::releaseZeroedMemory(); - ASSERT_EQ(memoryAllocated(device_index), 0) << "Previous tests leaked memory."; + ASSERT_EQ(memoryAllocated(device_index), 0) + << "Previous tests leaked memory."; auto hic = std::make_unique(); FusionGuard fg(hic.get());