From 17aee500a9c0825487a6c35dbecd0a25a37a1799 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Mon, 24 Feb 2025 23:00:33 +0000 Subject: [PATCH 01/11] Deallocate HostIr and test --- csrc/dispatch.h | 3 +- csrc/host_ir/executor.cpp | 44 +++++++++++++++++++------- csrc/host_ir/executor.h | 1 + csrc/host_ir/host_ir.cpp | 21 ++++++++++++ csrc/host_ir/host_ir.h | 25 +++++++++++++++ tests/cpp/test_host_ir_integration.cpp | 26 +++++++++++++++ 6 files changed, 107 insertions(+), 13 deletions(-) diff --git a/csrc/dispatch.h b/csrc/dispatch.h index f1f4153d1d2..fed2b39511c 100644 --- a/csrc/dispatch.h +++ b/csrc/dispatch.h @@ -160,7 +160,8 @@ class Val; f(Synchronize); \ f(StartCoalescing); \ f(EndCoalescing); \ - f(ShareMemHandles); + f(ShareMemHandles); \ + f(Deallocate); // Forward declarations for all Val and Expr types diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 89710eaae4b..0c47e37b9c6 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -325,24 +325,38 @@ void HostIrEvaluator::handle(Synchronize* synchronize) { } void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { - KernelArgumentHolder args; + KernelArgumentHolder args, outputs; + bool preallocated_outputs = true; for (auto& input : launch_kernel->inputs()) { args.push(getKnownConcreteValue(input)); } + + // If all output buffers are known already, pass them to the executor + for (auto& output : launch_kernel->outputs()) { + if (expr_evaluator_.isKnown(output)) { + outputs.push(expr_evaluator_.evaluate(output)); + } else { + outputs = {}; + preallocated_outputs = false; + break; + } + } + args.setDeviceIndex(); // run the compiled kernel - KernelArgumentHolder outputs = - container_->getKernelExecutor(launch_kernel->getIndex()) - ->run( - args, - {}, - launch_kernel->launch_params(), - launch_kernel->compile_params()); - - // Store the outputs in the context - for (auto output_idx : arange(outputs.size())) { - bind(launch_kernel->outputs().at(output_idx), outputs[output_idx]); + outputs = container_->getKernelExecutor(launch_kernel->getIndex()) + ->run( + args, + outputs, + launch_kernel->launch_params(), + launch_kernel->compile_params()); + + if (!preallocated_outputs) { + // Store the outputs in the context + for (auto output_idx : arange(outputs.size())) { + bind(launch_kernel->outputs().at(output_idx), outputs[output_idx]); + } } } @@ -654,6 +668,12 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { bind(tv, tensor); } +void HostIrEvaluator::handle(Deallocate* deallocate) { + TensorView* tv = deallocate->allocation()->buffer()->as(); + NVF_ERROR(expr_evaluator_.isKnown(tv), "Tried to free buffer associated with unknown TensorView", tv); + expr_evaluator_.invalidate(tv); +} + void HostIrEvaluator::unhandled(Statement* stmt) { NVF_ERROR(stmt->isA(), stmt, " must be an Expr"); auto* expr = stmt->as(); diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index d71b74e0dda..c854d2312fc 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -135,6 +135,7 @@ class HostIrEvaluator final : public OptOutDispatch { void handle(LinearOp* linear) override; void handle(kir::Allocate* allocate) override; void handle(ShareMemHandles* share_mem_handles) override; + void handle(Deallocate* deallocate) override; void unhandled(Statement* stmt) override; c10::cuda::CUDAStream getCUDAStream(Stream* stream); diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp index 9e1386d0d3d..83bb2e17193 100644 --- a/csrc/host_ir/host_ir.cpp +++ b/csrc/host_ir/host_ir.cpp @@ -153,6 +153,27 @@ std::string LaunchKernel::toInlineString(int indent_size) const { NVF_CHECK(false, "Can not be printed inline"); } +Deallocate::Deallocate( + IrBuilderPasskey passkey, + kir::Allocate* allocate) + : Expr(passkey) { + addAttribute(allocate); +} + +NVFUSER_DEFINE_CLONE_AND_CREATE(Deallocate) + +std::string Deallocate::toString(int indent_size) const { + std::stringstream ss; + indent(ss, indent_size) << "Deallocate {" << std::endl; + ss << allocation()->toString(indent_size + 1); + indent(ss, indent_size) << "}" << std::endl; + return ss.str(); +} + +std::string Deallocate::toInlineString(int indent_size) const { + return std::string("Deallocate"); +} + Stream::Stream(IrBuilderPasskey passkey, Val* index) : Val(passkey, ValType::Stream), index_(index) {} diff --git a/csrc/host_ir/host_ir.h b/csrc/host_ir/host_ir.h index bad3a6ef722..8a56f64561e 100644 --- a/csrc/host_ir/host_ir.h +++ b/csrc/host_ir/host_ir.h @@ -155,6 +155,31 @@ class LaunchKernel : public Expr { } }; +class Deallocate : public Expr { + public: + using Expr::Expr; + Deallocate( + IrBuilderPasskey passkey, + kir::Allocate* allocate); + + Deallocate(const Deallocate& other) = delete; + Deallocate& operator=(const Deallocate& other) = delete; + Deallocate(Deallocate&& other) = delete; + Deallocate& operator=(Deallocate&& other) = delete; + + NVFUSER_DECLARE_CLONE_AND_CREATE + + std::string toString(int indent_size = 0) const override; + std::string toInlineString(int indent_size = 0) const override; + const char* getOpString() const override { + return "hir::Deallocate"; + } + + const auto allocation() const { + return attributes_.at(0)->as(); + } +}; + class Stream : public Val { public: // if index is provided, the IR represents the streams whose index is the diff --git a/tests/cpp/test_host_ir_integration.cpp b/tests/cpp/test_host_ir_integration.cpp index 149f1af7310..52e45435118 100644 --- a/tests/cpp/test_host_ir_integration.cpp +++ b/tests/cpp/test_host_ir_integration.cpp @@ -113,6 +113,32 @@ TEST_F(HostIrIntegrationTest, Sum) { ""); } +TEST_F(HostIrIntegrationTest, Deallocate) { + const std::vector sizes = {8, 64}; + uint8_t device_index = 0; + + resetPeakMemoryStats(device_index); + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + for (int i = 0; i < 10; i++) { + TensorView* tv = makeConcreteTensor(sizes); + tv->setMemoryType(MemoryType::Global); + auto *allocate = IrBuilder::create(tv, MemoryType::Global); + auto *deallocate = IrBuilder::create(allocate); + + hic->pushBackTopLevelExprs(allocate); + hic->pushBackTopLevelExprs(deallocate); + } + + HostIrEvaluator hie(std::move(hic)); + + hie.runWithInput({}); + + EXPECT_LE(memoryAllocated(device_index), 0); +} + } // namespace hir } // namespace nvfuser From 082766a965273d1aad4b8b69298d7e5e11b6c4af Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Mon, 21 Apr 2025 22:47:33 +0000 Subject: [PATCH 02/11] Linter --- csrc/host_ir/executor.cpp | 19 +++++++++++-------- csrc/host_ir/host_ir.cpp | 6 ++---- csrc/host_ir/host_ir.h | 4 +--- tests/cpp/test_host_ir_integration.cpp | 4 ++-- 4 files changed, 16 insertions(+), 17 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 0c47e37b9c6..a21aeede997 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -330,7 +330,7 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { for (auto& input : launch_kernel->inputs()) { args.push(getKnownConcreteValue(input)); } - + // If all output buffers are known already, pass them to the executor for (auto& output : launch_kernel->outputs()) { if (expr_evaluator_.isKnown(output)) { @@ -346,11 +346,11 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { // run the compiled kernel outputs = container_->getKernelExecutor(launch_kernel->getIndex()) - ->run( - args, - outputs, - launch_kernel->launch_params(), - launch_kernel->compile_params()); + ->run( + args, + outputs, + launch_kernel->launch_params(), + launch_kernel->compile_params()); if (!preallocated_outputs) { // Store the outputs in the context @@ -670,8 +670,11 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { void HostIrEvaluator::handle(Deallocate* deallocate) { TensorView* tv = deallocate->allocation()->buffer()->as(); - NVF_ERROR(expr_evaluator_.isKnown(tv), "Tried to free buffer associated with unknown TensorView", tv); - expr_evaluator_.invalidate(tv); + NVF_ERROR( + expr_evaluator_.isKnown(tv), + "Tried to free buffer associated with unknown TensorView", + tv); + invalidate(tv); } void HostIrEvaluator::unhandled(Statement* stmt) { diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp index 83bb2e17193..303de5a5658 100644 --- a/csrc/host_ir/host_ir.cpp +++ b/csrc/host_ir/host_ir.cpp @@ -153,9 +153,7 @@ std::string LaunchKernel::toInlineString(int indent_size) const { NVF_CHECK(false, "Can not be printed inline"); } -Deallocate::Deallocate( - IrBuilderPasskey passkey, - kir::Allocate* allocate) +Deallocate::Deallocate(IrBuilderPasskey passkey, kir::Allocate* allocate) : Expr(passkey) { addAttribute(allocate); } @@ -165,7 +163,7 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(Deallocate) std::string Deallocate::toString(int indent_size) const { std::stringstream ss; indent(ss, indent_size) << "Deallocate {" << std::endl; - ss << allocation()->toString(indent_size + 1); + ss << allocation()->toString(indent_size + 1); indent(ss, indent_size) << "}" << std::endl; return ss.str(); } diff --git a/csrc/host_ir/host_ir.h b/csrc/host_ir/host_ir.h index 8a56f64561e..aa9cd68cdf3 100644 --- a/csrc/host_ir/host_ir.h +++ b/csrc/host_ir/host_ir.h @@ -158,9 +158,7 @@ class LaunchKernel : public Expr { class Deallocate : public Expr { public: using Expr::Expr; - Deallocate( - IrBuilderPasskey passkey, - kir::Allocate* allocate); + Deallocate(IrBuilderPasskey passkey, kir::Allocate* allocate); Deallocate(const Deallocate& other) = delete; Deallocate& operator=(const Deallocate& other) = delete; diff --git a/tests/cpp/test_host_ir_integration.cpp b/tests/cpp/test_host_ir_integration.cpp index 52e45435118..0fa58f8a6cd 100644 --- a/tests/cpp/test_host_ir_integration.cpp +++ b/tests/cpp/test_host_ir_integration.cpp @@ -125,8 +125,8 @@ TEST_F(HostIrIntegrationTest, Deallocate) { for (int i = 0; i < 10; i++) { TensorView* tv = makeConcreteTensor(sizes); tv->setMemoryType(MemoryType::Global); - auto *allocate = IrBuilder::create(tv, MemoryType::Global); - auto *deallocate = IrBuilder::create(allocate); + auto* allocate = IrBuilder::create(tv, MemoryType::Global); + auto* deallocate = IrBuilder::create(allocate); hic->pushBackTopLevelExprs(allocate); hic->pushBackTopLevelExprs(deallocate); From 894aaa3f4dbf31d35598374b36978fc82484e3e0 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Mon, 21 Apr 2025 23:21:17 +0000 Subject: [PATCH 03/11] Fix linter --- csrc/host_ir/host_ir.cpp | 4 ++++ csrc/host_ir/host_ir.h | 4 +--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp index 303de5a5658..da1943ebc1c 100644 --- a/csrc/host_ir/host_ir.cpp +++ b/csrc/host_ir/host_ir.cpp @@ -160,6 +160,10 @@ Deallocate::Deallocate(IrBuilderPasskey passkey, kir::Allocate* allocate) NVFUSER_DEFINE_CLONE_AND_CREATE(Deallocate) +const kir::Allocate* Deallocate::allocation() const { + return attributes_.at(0)->as(); +} + std::string Deallocate::toString(int indent_size) const { std::stringstream ss; indent(ss, indent_size) << "Deallocate {" << std::endl; diff --git a/csrc/host_ir/host_ir.h b/csrc/host_ir/host_ir.h index aa9cd68cdf3..09b6d9ba51a 100644 --- a/csrc/host_ir/host_ir.h +++ b/csrc/host_ir/host_ir.h @@ -173,9 +173,7 @@ class Deallocate : public Expr { return "hir::Deallocate"; } - const auto allocation() const { - return attributes_.at(0)->as(); - } + const kir::Allocate* allocation() const; }; class Stream : public Val { From 28473be2eea8f0eca494d09bfb072cb495936d03 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Tue, 22 Apr 2025 11:47:28 -0400 Subject: [PATCH 04/11] Update csrc/host_ir/executor.cpp Co-authored-by: Jingyue Wu --- csrc/host_ir/executor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index a21aeede997..d6507f38e95 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -332,7 +332,7 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { } // If all output buffers are known already, pass them to the executor - for (auto& output : launch_kernel->outputs()) { + for (Val* output : launch_kernel->outputs()) { if (expr_evaluator_.isKnown(output)) { outputs.push(expr_evaluator_.evaluate(output)); } else { From b22bbadbfb0ffe3369c2bd284daf4896b30cc88d Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Tue, 22 Apr 2025 11:47:42 -0400 Subject: [PATCH 05/11] Update tests/cpp/test_host_ir_integration.cpp Co-authored-by: samnordmann --- tests/cpp/test_host_ir_integration.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cpp/test_host_ir_integration.cpp b/tests/cpp/test_host_ir_integration.cpp index 0fa58f8a6cd..14c86e907dd 100644 --- a/tests/cpp/test_host_ir_integration.cpp +++ b/tests/cpp/test_host_ir_integration.cpp @@ -136,7 +136,7 @@ TEST_F(HostIrIntegrationTest, Deallocate) { hie.runWithInput({}); - EXPECT_LE(memoryAllocated(device_index), 0); + EXPECT_EQ(memoryAllocated(device_index), 0); } } // namespace hir From 14b4aa34e5fe4f57de5a40b19d8cfb241528183d Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Tue, 22 Apr 2025 16:38:40 +0000 Subject: [PATCH 06/11] Review feedback --- csrc/host_ir/executor.cpp | 7 ++++--- csrc/host_ir/host_ir.cpp | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index d6507f38e95..58c149ef066 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -325,13 +325,14 @@ void HostIrEvaluator::handle(Synchronize* synchronize) { } void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { - KernelArgumentHolder args, outputs; - bool preallocated_outputs = true; + KernelArgumentHolder args; for (auto& input : launch_kernel->inputs()) { args.push(getKnownConcreteValue(input)); } // If all output buffers are known already, pass them to the executor + KernelArgumentHolder outputs; + bool preallocated_outputs = true; for (Val* output : launch_kernel->outputs()) { if (expr_evaluator_.isKnown(output)) { outputs.push(expr_evaluator_.evaluate(output)); @@ -671,7 +672,7 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { void HostIrEvaluator::handle(Deallocate* deallocate) { TensorView* tv = deallocate->allocation()->buffer()->as(); NVF_ERROR( - expr_evaluator_.isKnown(tv), + isKnown(tv), "Tried to free buffer associated with unknown TensorView", tv); invalidate(tv); diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp index da1943ebc1c..06b20963314 100644 --- a/csrc/host_ir/host_ir.cpp +++ b/csrc/host_ir/host_ir.cpp @@ -173,7 +173,7 @@ std::string Deallocate::toString(int indent_size) const { } std::string Deallocate::toInlineString(int indent_size) const { - return std::string("Deallocate"); + return std::string("Deallocate ") + allocation()->buffer()->toInlineString(); } Stream::Stream(IrBuilderPasskey passkey, Val* index) From 8706c018d79a1f9c1402c6b953aae6268e77c700 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Tue, 22 Apr 2025 16:43:49 +0000 Subject: [PATCH 07/11] use native method --- csrc/host_ir/executor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 58c149ef066..a0246fbd949 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -334,7 +334,7 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { KernelArgumentHolder outputs; bool preallocated_outputs = true; for (Val* output : launch_kernel->outputs()) { - if (expr_evaluator_.isKnown(output)) { + if (isKnown(output)) { outputs.push(expr_evaluator_.evaluate(output)); } else { outputs = {}; @@ -652,7 +652,7 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { "Allocation must be on a TensorView but got ", allocate->buffer()); TensorView* tv = allocate->buffer()->as(); - if (expr_evaluator_.isKnown(tv)) { + if (isKnown(tv)) { return; } GlobalBufferInfo info = From 823650f75a440d60e331a305f8e83478f698da23 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Tue, 22 Apr 2025 20:26:57 +0000 Subject: [PATCH 08/11] use native method --- csrc/host_ir/executor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index a0246fbd949..8313a29124f 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -335,7 +335,7 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { bool preallocated_outputs = true; for (Val* output : launch_kernel->outputs()) { if (isKnown(output)) { - outputs.push(expr_evaluator_.evaluate(output)); + outputs.push(getKnownConcreteValue(output)); } else { outputs = {}; preallocated_outputs = false; From 81b667440c2357caf9b8eb7160ec8f805a64f556 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Wed, 23 Apr 2025 14:08:13 -0400 Subject: [PATCH 09/11] Update tests/cpp/test_host_ir_integration.cpp Co-authored-by: Jingyue Wu --- tests/cpp/test_host_ir_integration.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cpp/test_host_ir_integration.cpp b/tests/cpp/test_host_ir_integration.cpp index 14c86e907dd..e0cc2b7a25f 100644 --- a/tests/cpp/test_host_ir_integration.cpp +++ b/tests/cpp/test_host_ir_integration.cpp @@ -115,7 +115,7 @@ TEST_F(HostIrIntegrationTest, Sum) { TEST_F(HostIrIntegrationTest, Deallocate) { const std::vector sizes = {8, 64}; - uint8_t device_index = 0; + c10::DeviceIndex device_index = 0; resetPeakMemoryStats(device_index); From 0a3c9f2001592db770ef73a40909fa1c51ff9959 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Wed, 23 Apr 2025 18:46:19 +0000 Subject: [PATCH 10/11] Review feedback --- csrc/host_ir/executor.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 8313a29124f..585bc7dac50 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -332,17 +332,17 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { // If all output buffers are known already, pass them to the executor KernelArgumentHolder outputs; - bool preallocated_outputs = true; + bool preallocated_outputs = false; for (Val* output : launch_kernel->outputs()) { if (isKnown(output)) { + preallocated_outputs = true; outputs.push(getKnownConcreteValue(output)); - } else { - outputs = {}; - preallocated_outputs = false; - break; } } + NVF_ERROR( + outputs.empty() || outputs.size() == launch_kernel->outputs().size()); + args.setDeviceIndex(); // run the compiled kernel From 615a825fb085a15b6f7cb8831ad6e02464aa4a70 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Wed, 23 Apr 2025 14:47:50 -0400 Subject: [PATCH 11/11] Update csrc/host_ir/executor.cpp Co-authored-by: Jingyue Wu --- csrc/host_ir/executor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 585bc7dac50..b029f748671 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -670,7 +670,7 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { } void HostIrEvaluator::handle(Deallocate* deallocate) { - TensorView* tv = deallocate->allocation()->buffer()->as(); + auto* tv = deallocate->allocation()->buffer()->as(); NVF_ERROR( isKnown(tv), "Tried to free buffer associated with unknown TensorView",