diff --git a/csrc/dispatch.h b/csrc/dispatch.h index f1f4153d1d2..fed2b39511c 100644 --- a/csrc/dispatch.h +++ b/csrc/dispatch.h @@ -160,7 +160,8 @@ class Val; f(Synchronize); \ f(StartCoalescing); \ f(EndCoalescing); \ - f(ShareMemHandles); + f(ShareMemHandles); \ + f(Deallocate); // Forward declarations for all Val and Expr types diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 89710eaae4b..b029f748671 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -329,20 +329,35 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { for (auto& input : launch_kernel->inputs()) { args.push(getKnownConcreteValue(input)); } + + // If all output buffers are known already, pass them to the executor + KernelArgumentHolder outputs; + bool preallocated_outputs = false; + for (Val* output : launch_kernel->outputs()) { + if (isKnown(output)) { + preallocated_outputs = true; + outputs.push(getKnownConcreteValue(output)); + } + } + + NVF_ERROR( + outputs.empty() || outputs.size() == launch_kernel->outputs().size()); + args.setDeviceIndex(); // run the compiled kernel - KernelArgumentHolder outputs = - container_->getKernelExecutor(launch_kernel->getIndex()) - ->run( - args, - {}, - launch_kernel->launch_params(), - launch_kernel->compile_params()); - - // Store the outputs in the context - for (auto output_idx : arange(outputs.size())) { - bind(launch_kernel->outputs().at(output_idx), outputs[output_idx]); + outputs = container_->getKernelExecutor(launch_kernel->getIndex()) + ->run( + args, + outputs, + launch_kernel->launch_params(), + launch_kernel->compile_params()); + + if (!preallocated_outputs) { + // Store the outputs in the context + for (auto output_idx : arange(outputs.size())) { + bind(launch_kernel->outputs().at(output_idx), outputs[output_idx]); + } } } @@ -637,7 +652,7 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { "Allocation must be on a TensorView but got ", allocate->buffer()); TensorView* tv = 
allocate->buffer()->as<TensorView>(); - if (expr_evaluator_.isKnown(tv)) { + if (isKnown(tv)) { return; } GlobalBufferInfo info = @@ -654,6 +669,15 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { bind(tv, tensor); } +void HostIrEvaluator::handle(Deallocate* deallocate) { + auto* tv = deallocate->allocation()->buffer()->as<TensorView>(); + NVF_ERROR( + isKnown(tv), + "Tried to free buffer associated with unknown TensorView", + tv); + invalidate(tv); +} + void HostIrEvaluator::unhandled(Statement* stmt) { NVF_ERROR(stmt->isA<Expr>(), stmt, " must be an Expr"); auto* expr = stmt->as<Expr>(); diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index d71b74e0dda..c854d2312fc 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -135,6 +135,7 @@ class HostIrEvaluator final : public OptOutDispatch { void handle(LinearOp* linear) override; void handle(kir::Allocate* allocate) override; void handle(ShareMemHandles* share_mem_handles) override; + void handle(Deallocate* deallocate) override; void unhandled(Statement* stmt) override; c10::cuda::CUDAStream getCUDAStream(Stream* stream); diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp index 9e1386d0d3d..06b20963314 100644 --- a/csrc/host_ir/host_ir.cpp +++ b/csrc/host_ir/host_ir.cpp @@ -153,6 +153,29 @@ std::string LaunchKernel::toInlineString(int indent_size) const { NVF_CHECK(false, "Can not be printed inline"); } +Deallocate::Deallocate(IrBuilderPasskey passkey, kir::Allocate* allocate) + : Expr(passkey) { + addAttribute(allocate); +} + +NVFUSER_DEFINE_CLONE_AND_CREATE(Deallocate) + +const kir::Allocate* Deallocate::allocation() const { + return attributes_.at(0)->as<kir::Allocate>(); +} + +std::string Deallocate::toString(int indent_size) const { + std::stringstream ss; + indent(ss, indent_size) << "Deallocate {" << std::endl; + ss << allocation()->toString(indent_size + 1); + indent(ss, indent_size) << "}" << std::endl; + return ss.str(); +} + +std::string Deallocate::toInlineString(int indent_size) const { + return 
std::string("Deallocate ") + allocation()->buffer()->toInlineString(); +} + Stream::Stream(IrBuilderPasskey passkey, Val* index) : Val(passkey, ValType::Stream), index_(index) {} diff --git a/csrc/host_ir/host_ir.h b/csrc/host_ir/host_ir.h index bad3a6ef722..09b6d9ba51a 100644 --- a/csrc/host_ir/host_ir.h +++ b/csrc/host_ir/host_ir.h @@ -155,6 +155,27 @@ class LaunchKernel : public Expr { } }; +class Deallocate : public Expr { + public: + using Expr::Expr; + Deallocate(IrBuilderPasskey passkey, kir::Allocate* allocate); + + Deallocate(const Deallocate& other) = delete; + Deallocate& operator=(const Deallocate& other) = delete; + Deallocate(Deallocate&& other) = delete; + Deallocate& operator=(Deallocate&& other) = delete; + + NVFUSER_DECLARE_CLONE_AND_CREATE + + std::string toString(int indent_size = 0) const override; + std::string toInlineString(int indent_size = 0) const override; + const char* getOpString() const override { + return "hir::Deallocate"; + } + + const kir::Allocate* allocation() const; +}; + class Stream : public Val { public: // if index is provided, the IR represents the streams whose index is the diff --git a/tests/cpp/test_host_ir_integration.cpp b/tests/cpp/test_host_ir_integration.cpp index 149f1af7310..e0cc2b7a25f 100644 --- a/tests/cpp/test_host_ir_integration.cpp +++ b/tests/cpp/test_host_ir_integration.cpp @@ -113,6 +113,32 @@ TEST_F(HostIrIntegrationTest, Sum) { ""); } +TEST_F(HostIrIntegrationTest, Deallocate) { + const std::vector<int64_t> sizes = {8, 64}; + c10::DeviceIndex device_index = 0; + + resetPeakMemoryStats(device_index); + + auto hic = std::make_unique<HostIrContainer>(); + FusionGuard fg(hic.get()); + + for (int i = 0; i < 10; i++) { + TensorView* tv = makeConcreteTensor(sizes); + tv->setMemoryType(MemoryType::Global); + auto* allocate = IrBuilder::create<kir::Allocate>(tv, MemoryType::Global); + auto* deallocate = IrBuilder::create<Deallocate>(allocate); + + hic->pushBackTopLevelExprs(allocate); + hic->pushBackTopLevelExprs(deallocate); + } + + HostIrEvaluator 
hie(std::move(hic)); + + hie.runWithInput({}); + + EXPECT_EQ(memoryAllocated(device_index), 0); +} + } // namespace hir } // namespace nvfuser