Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion csrc/dispatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,8 @@ class Val;
f(StartCoalescing); \
f(EndCoalescing); \
f(ShareMemHandles); \
f(HirAliasSelect);
f(HirAliasSelect); \
f(Deallocate);

// Forward declarations for all Val and Expr types

Expand Down
48 changes: 36 additions & 12 deletions csrc/host_ir/executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,20 +330,35 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) {
for (auto& input : launch_kernel->inputs()) {
args.push(getKnownConcreteValue(input));
}

// If all output buffers are known already, pass them to the executor
KernelArgumentHolder outputs;
bool preallocated_outputs = false;
for (Val* output : launch_kernel->outputs()) {
if (isKnown(output)) {
preallocated_outputs = true;
outputs.push(getKnownConcreteValue(output));
}
}

NVF_ERROR(
outputs.empty() || outputs.size() == launch_kernel->outputs().size());

args.setDeviceIndex();

// run the compiled kernel
KernelArgumentHolder outputs =
container_->getKernelExecutor(launch_kernel->getIndex())
->run(
args,
{},
launch_kernel->launch_params(),
launch_kernel->compile_params());

// Store the outputs in the context
for (auto output_idx : arange(outputs.size())) {
bind(launch_kernel->outputs().at(output_idx), outputs[output_idx]);
outputs = container_->getKernelExecutor(launch_kernel->getIndex())
->run(
args,
outputs,
launch_kernel->launch_params(),
launch_kernel->compile_params());

if (!preallocated_outputs) {
// Store the outputs in the context
for (auto output_idx : arange(outputs.size())) {
bind(launch_kernel->outputs().at(output_idx), outputs[output_idx]);
}
}
}

Expand Down Expand Up @@ -688,7 +703,7 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) {
"Allocation must be on a TensorView but got ",
allocate->buffer());
TensorView* tv = allocate->buffer()->as<TensorView>();
if (expr_evaluator_.isKnown(tv)) {
if (isKnown(tv)) {
return;
}
GlobalBufferInfo info =
Expand Down Expand Up @@ -786,6 +801,15 @@ void HostIrEvaluator::handle(ReductionOp* reduction_op) {
}
}

// Frees the buffer bound by the corresponding kir::Allocate: the TensorView
// must currently be known to the evaluator; invalidating it drops the binding
// so the backing storage can be released.
void HostIrEvaluator::handle(Deallocate* deallocate) {
  auto* tv = deallocate->allocation()->buffer()->as<TensorView>();
  // Separator added so the printed TensorView does not run into the message
  // text ("...unknown TensorView: <tv>").
  NVF_ERROR(
      isKnown(tv),
      "Tried to free buffer associated with unknown TensorView: ",
      tv);
  invalidate(tv);
}

void HostIrEvaluator::unhandled(Statement* stmt) {
NVF_ERROR(stmt->isA<Expr>(), stmt, " must be an Expr");
auto* expr = stmt->as<Expr>();
Expand Down
1 change: 1 addition & 0 deletions csrc/host_ir/executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ class HostIrEvaluator final : public OptOutDispatch {
void handle(ReductionOp* reduction_op) override;
void handle(ShareMemHandles* share_mem_handles) override;
void handle(HirAliasSelect* hir_alias_select) override;
void handle(Deallocate* deallocate) override;
void unhandled(Statement* stmt) override;

c10::cuda::CUDAStream getCUDAStream(Stream* stream);
Expand Down
23 changes: 23 additions & 0 deletions csrc/host_ir/host_ir.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,29 @@ std::string LaunchKernel::toInlineString(int indent_size) const {
NVF_CHECK(false, "Can not be printed inline");
}

// Constructs a Deallocate expression that releases the buffer owned by
// `allocate`. The kir::Allocate is stored as attribute 0 (not as an input),
// which is how allocation() later retrieves it.
Deallocate::Deallocate(IrBuilderPasskey passkey, kir::Allocate* allocate)
    : Expr(passkey) {
  addAttribute(allocate);
}

NVFUSER_DEFINE_CLONE_AND_CREATE(Deallocate)

// Returns the kir::Allocate whose buffer this expression frees; it was stored
// as attribute 0 by the constructor.
const kir::Allocate* Deallocate::allocation() const {
  return attributes_.at(0)->as<kir::Allocate>();
}

// Multi-line rendering: "Deallocate {", the nested Allocate one level deeper,
// then a closing "}".
std::string Deallocate::toString(int indent_size) const {
  std::stringstream out;
  indent(out, indent_size) << "Deallocate {\n";
  out << allocation()->toString(indent_size + 1);
  indent(out, indent_size) << "}\n";
  return out.str();
}

// One-line rendering: "Deallocate " followed by the buffer's inline form.
// `indent_size` is unused for the inline form.
std::string Deallocate::toInlineString(int indent_size) const {
  std::stringstream ss;
  ss << "Deallocate " << allocation()->buffer()->toInlineString();
  return ss.str();
}

// Constructs a Stream Val, storing `index` as-is; see the class-level comment
// in the header for how a provided index is interpreted.
Stream::Stream(IrBuilderPasskey passkey, Val* index)
    : Val(passkey, ValType::Stream), index_(index) {}

Expand Down
21 changes: 21 additions & 0 deletions csrc/host_ir/host_ir.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,27 @@ class LaunchKernel : public Expr {
}
};

// Host IR expression marking the point at which the buffer created by a
// kir::Allocate may be freed. Like other IR nodes, instances are owned by the
// IR container, so copying and moving are disabled.
class Deallocate : public Expr {
 public:
  using Expr::Expr;
  // `allocate` identifies the buffer to free; it is stored as an attribute of
  // this expression.
  Deallocate(IrBuilderPasskey passkey, kir::Allocate* allocate);

  Deallocate(const Deallocate& other) = delete;
  Deallocate& operator=(const Deallocate& other) = delete;
  Deallocate(Deallocate&& other) = delete;
  Deallocate& operator=(Deallocate&& other) = delete;

  NVFUSER_DECLARE_CLONE_AND_CREATE

  std::string toString(int indent_size = 0) const override;
  std::string toInlineString(int indent_size = 0) const override;
  const char* getOpString() const override {
    return "hir::Deallocate";
  }

  // The kir::Allocate whose buffer this expression frees.
  const kir::Allocate* allocation() const;
};

class Stream : public Val {
public:
// if index is provided, the IR represents the streams whose index is the
Expand Down
31 changes: 31 additions & 0 deletions tests/cpp/test_host_ir_integration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
*/
// clang-format on
#include <fusion.h>
#include <global_allocator.h>
#include <host_ir/container.h>
#include <host_ir/executor.h>
#include <ir/all_nodes.h>
Expand Down Expand Up @@ -146,6 +147,36 @@ TEST_F(HostIrIntegrationTest, ViewPermute_ExprEval) {
executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__);
}

// Verifies that evaluating hir::Deallocate actually releases device memory:
// after many allocate/deallocate pairs, no memory remains allocated.
TEST_F(HostIrIntegrationTest, Deallocate) {
  const std::vector<int64_t> sizes = {8, 64};
  c10::DeviceIndex device_index = 0;

  // Start from a clean allocator state so the final memoryAllocated() check
  // is not polluted by caches left behind by earlier tests; clearing the
  // cuBLAS workspace and nvFuser's zeroed-memory pool is required for this
  // to hold in CI.
  resetPeakMemoryStats(device_index);
  at::cuda::clearCublasWorkspaces();
  nvfuser::releaseZeroedMemory();
  ASSERT_EQ(memoryAllocated(device_index), 0)
      << "Previous tests leaked memory.";

  auto hic = std::make_unique<HostIrContainer>();
  FusionGuard fg(hic.get());

  // Build ten allocate/deallocate pairs; if Deallocate did not free the
  // buffer, allocations would accumulate across iterations.
  for (int i = 0; i < 10; i++) {
    TensorView* tv = makeConcreteTensor(sizes);
    tv->setMemoryType(MemoryType::Global);
    auto* allocate = IrBuilder::create<kir::Allocate>(tv, MemoryType::Global);
    auto* deallocate = IrBuilder::create<Deallocate>(allocate);

    hic->pushBackTopLevelExprs(allocate);
    hic->pushBackTopLevelExprs(deallocate);
  }

  HostIrEvaluator hie(std::move(hic));

  hie.runWithInput({});

  EXPECT_EQ(memoryAllocated(device_index), 0);
}

} // namespace hir

} // namespace nvfuser
Loading