Add a Deallocate host IR node and pass already-known outputs to LaunchKernel.

Summary of what the hunks below do:
  * csrc/dispatch.h: appends Deallocate to the f(...) dispatch list.
  * csrc/host_ir/executor.cpp: handle(LaunchKernel*) collects outputs that are
    already known and passes them to run(); it binds the returned outputs only
    when none were known beforehand. handle(kir::Allocate*) calls isKnown(tv)
    instead of expr_evaluator_.isKnown(tv). The new handle(Deallocate*) errors
    if the TensorView is not known and otherwise calls invalidate(tv).
  * csrc/host_ir/host_ir.{h,cpp}: declares and defines Deallocate, which stores
    the kir::Allocate as attribute 0.
  * tests/cpp/test_host_ir_integration.cpp: adds a test that runs ten
    Allocate/Deallocate pairs and expects memoryAllocated(device) == 0 after.

NOTE(review): this patch was reconstructed from a copy in which newlines were
collapsed and the contents of angle brackets were stripped.
  * Template arguments (as<TensorView>, isA<Expr>, as<Expr>,
    as<kir::Allocate>, std::vector<int64_t>, make_unique<HostIrContainer>,
    IrBuilder::create<kir::Allocate>, IrBuilder::create<Deallocate>) were
    restored from the surrounding code -- confirm against the tree.
  * The #include targets in the first test-file hunk could NOT be recovered
    and are left bare; fill them in from the target file before applying.
  * Indentation, blank lines and macro backslash alignment are reconstructed
    so that hunk line counts match their headers; apply with whitespace
    tolerance (e.g. --ignore-whitespace) or re-align context lines first.
  * The only intentional code change versus the collapsed copy is a trailing
    space added to the "unknown TensorView " error string, so the TensorView
    printed after it is separated from the message text.

diff --git a/csrc/dispatch.h b/csrc/dispatch.h
index 007287e49f7..4e09ef6fc02 100644
--- a/csrc/dispatch.h
+++ b/csrc/dispatch.h
@@ -161,7 +161,8 @@ class Val;
   f(StartCoalescing); \
   f(EndCoalescing); \
   f(ShareMemHandles); \
-  f(HirAliasSelect);
+  f(HirAliasSelect); \
+  f(Deallocate);
 
 // Forward declarations for all Val and Expr types
 
diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index 2f2cf9e7b92..2cfa242fab5 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -330,20 +330,35 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) {
   for (auto& input : launch_kernel->inputs()) {
     args.push(getKnownConcreteValue(input));
   }
+
+  // If all output buffers are known already, pass them to the executor
+  KernelArgumentHolder outputs;
+  bool preallocated_outputs = false;
+  for (Val* output : launch_kernel->outputs()) {
+    if (isKnown(output)) {
+      preallocated_outputs = true;
+      outputs.push(getKnownConcreteValue(output));
+    }
+  }
+
+  NVF_ERROR(
+      outputs.empty() || outputs.size() == launch_kernel->outputs().size());
+
   args.setDeviceIndex();
 
   // run the compiled kernel
-  KernelArgumentHolder outputs =
-      container_->getKernelExecutor(launch_kernel->getIndex())
-          ->run(
-              args,
-              {},
-              launch_kernel->launch_params(),
-              launch_kernel->compile_params());
-
-  // Store the outputs in the context
-  for (auto output_idx : arange(outputs.size())) {
-    bind(launch_kernel->outputs().at(output_idx), outputs[output_idx]);
+  outputs = container_->getKernelExecutor(launch_kernel->getIndex())
+                ->run(
+                    args,
+                    outputs,
+                    launch_kernel->launch_params(),
+                    launch_kernel->compile_params());
+
+  if (!preallocated_outputs) {
+    // Store the outputs in the context
+    for (auto output_idx : arange(outputs.size())) {
+      bind(launch_kernel->outputs().at(output_idx), outputs[output_idx]);
+    }
   }
 }
 
@@ -688,7 +703,7 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) {
       "Allocation must be on a TensorView but got ",
       allocate->buffer());
   TensorView* tv = allocate->buffer()->as<TensorView>();
-  if (expr_evaluator_.isKnown(tv)) {
+  if (isKnown(tv)) {
     return;
   }
   GlobalBufferInfo info =
@@ -786,6 +801,15 @@ void HostIrEvaluator::handle(ReductionOp* reduction_op) {
   }
 }
 
+void HostIrEvaluator::handle(Deallocate* deallocate) {
+  auto* tv = deallocate->allocation()->buffer()->as<TensorView>();
+  NVF_ERROR(
+      isKnown(tv),
+      "Tried to free buffer associated with unknown TensorView ",
+      tv);
+  invalidate(tv);
+}
+
 void HostIrEvaluator::unhandled(Statement* stmt) {
   NVF_ERROR(stmt->isA<Expr>(), stmt, " must be an Expr");
   auto* expr = stmt->as<Expr>();
diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h
index 3f147b7801b..82a8010abbc 100644
--- a/csrc/host_ir/executor.h
+++ b/csrc/host_ir/executor.h
@@ -143,6 +143,7 @@ class HostIrEvaluator final : public OptOutDispatch {
   void handle(ReductionOp* reduction_op) override;
   void handle(ShareMemHandles* share_mem_handles) override;
   void handle(HirAliasSelect* hir_alias_select) override;
+  void handle(Deallocate* deallocate) override;
   void unhandled(Statement* stmt) override;
 
   c10::cuda::CUDAStream getCUDAStream(Stream* stream);
diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp
index bf3d5cef9eb..ec49a76bc11 100644
--- a/csrc/host_ir/host_ir.cpp
+++ b/csrc/host_ir/host_ir.cpp
@@ -153,6 +153,29 @@ std::string LaunchKernel::toInlineString(int indent_size) const {
   NVF_CHECK(false, "Can not be printed inline");
 }
 
+Deallocate::Deallocate(IrBuilderPasskey passkey, kir::Allocate* allocate)
+    : Expr(passkey) {
+  addAttribute(allocate);
+}
+
+NVFUSER_DEFINE_CLONE_AND_CREATE(Deallocate)
+
+const kir::Allocate* Deallocate::allocation() const {
+  return attributes_.at(0)->as<kir::Allocate>();
+}
+
+std::string Deallocate::toString(int indent_size) const {
+  std::stringstream ss;
+  indent(ss, indent_size) << "Deallocate {" << std::endl;
+  ss << allocation()->toString(indent_size + 1);
+  indent(ss, indent_size) << "}" << std::endl;
+  return ss.str();
+}
+
+std::string Deallocate::toInlineString(int indent_size) const {
+  return std::string("Deallocate ") + allocation()->buffer()->toInlineString();
+}
+
 Stream::Stream(IrBuilderPasskey passkey, Val* index)
     : Val(passkey, ValType::Stream), index_(index) {}
 
diff --git a/csrc/host_ir/host_ir.h b/csrc/host_ir/host_ir.h
index d267d23ab1f..f8fd02e64a4 100644
--- a/csrc/host_ir/host_ir.h
+++ b/csrc/host_ir/host_ir.h
@@ -155,6 +155,27 @@ class LaunchKernel : public Expr {
   }
 };
 
+class Deallocate : public Expr {
+ public:
+  using Expr::Expr;
+  Deallocate(IrBuilderPasskey passkey, kir::Allocate* allocate);
+
+  Deallocate(const Deallocate& other) = delete;
+  Deallocate& operator=(const Deallocate& other) = delete;
+  Deallocate(Deallocate&& other) = delete;
+  Deallocate& operator=(Deallocate&& other) = delete;
+
+  NVFUSER_DECLARE_CLONE_AND_CREATE
+
+  std::string toString(int indent_size = 0) const override;
+  std::string toInlineString(int indent_size = 0) const override;
+  const char* getOpString() const override {
+    return "hir::Deallocate";
+  }
+
+  const kir::Allocate* allocation() const;
+};
+
 class Stream : public Val {
  public:
   // if index is provided, the IR represents the streams whose index is the
diff --git a/tests/cpp/test_host_ir_integration.cpp b/tests/cpp/test_host_ir_integration.cpp
index 98a919727bd..7f7506d3e3a 100644
--- a/tests/cpp/test_host_ir_integration.cpp
+++ b/tests/cpp/test_host_ir_integration.cpp
@@ -6,6 +6,7 @@
  */
 // clang-format on
 #include
+#include
 #include
 #include
 #include
@@ -146,6 +147,36 @@ TEST_F(HostIrIntegrationTest, ViewPermute_ExprEval) {
       executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__);
 }
 
+TEST_F(HostIrIntegrationTest, Deallocate) {
+  const std::vector<int64_t> sizes = {8, 64};
+  c10::DeviceIndex device_index = 0;
+
+  resetPeakMemoryStats(device_index);
+  at::cuda::clearCublasWorkspaces();
+  nvfuser::releaseZeroedMemory();
+  ASSERT_EQ(memoryAllocated(device_index), 0)
+      << "Previous tests leaked memory.";
+
+  auto hic = std::make_unique<HostIrContainer>();
+  FusionGuard fg(hic.get());
+
+  for (int i = 0; i < 10; i++) {
+    TensorView* tv = makeConcreteTensor(sizes);
+    tv->setMemoryType(MemoryType::Global);
+    auto* allocate = IrBuilder::create<kir::Allocate>(tv, MemoryType::Global);
+    auto* deallocate = IrBuilder::create<Deallocate>(allocate);
+
+    hic->pushBackTopLevelExprs(allocate);
+    hic->pushBackTopLevelExprs(deallocate);
+  }
+
+  HostIrEvaluator hie(std::move(hic));
+
+  hie.runWithInput({});
+
+  EXPECT_EQ(memoryAllocated(device_index), 0);
+}
+
 } // namespace hir
 
 } // namespace nvfuser