Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
9a0dc9e
host ir alias and prealloc output support
samnordmann Mar 26, 2025
9820d5a
harden and simplify allocation in for loop test
samnordmann Mar 26, 2025
8c49c95
Merge branch 'main' of github.com:NVIDIA/Fuser into host_irs/alias_su…
samnordmann Apr 11, 2025
2ad510d
refactor and clean host ir lowering and segmentation
samnordmann Mar 26, 2025
46c6717
lint
samnordmann Mar 26, 2025
73d5d7b
put back isResharding as the condition for lower to a standalone host…
samnordmann Mar 26, 2025
e35ddd0
minor comments
samnordmann Apr 11, 2025
4964680
lint
samnordmann Apr 11, 2025
ed8dc7c
add host ir support for set reduce and binary op
samnordmann Mar 26, 2025
85b7b75
move .contiguous to be in postScatter
samnordmann Apr 11, 2025
01e94a7
lint and build issue
samnordmann Apr 11, 2025
e1db518
reviews
samnordmann Apr 14, 2025
7e6cef6
Merge branch 'main' of github.com:NVIDIA/Fuser into host_irs/alias_su…
samnordmann Apr 14, 2025
59622ff
add comment
samnordmann Apr 15, 2025
25c618c
add comment
samnordmann Apr 15, 2025
eb46aef
minor comment
samnordmann Apr 16, 2025
5f161f5
lint
samnordmann Apr 16, 2025
b936420
Merge branch 'host_irs/refactor_lowering_and_segmentation' into host_…
samnordmann Apr 16, 2025
97b1743
Merge branch 'main' of github.com:NVIDIA/Fuser into host_irs/alias_su…
samnordmann Apr 16, 2025
684118f
Merge branch 'host_irs/alias_support' into host_irs/refactor_lowering…
samnordmann Apr 16, 2025
e29abe4
Merge branch 'host_irs/refactor_lowering_and_segmentation' into host_…
samnordmann Apr 16, 2025
d265584
Revert "move .contiguous to be in postScatter"
samnordmann Apr 16, 2025
dffcd51
Merge branch 'main' of github.com:NVIDIA/Fuser into host_irs/LoadStor…
samnordmann Apr 16, 2025
b55d4e7
minor comment
samnordmann Apr 18, 2025
6b479de
lower as HIR only set without permute
samnordmann Apr 24, 2025
15552c2
Fix handle(LoadStoreOp*)
wujingyue Apr 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 123 additions & 0 deletions csrc/host_ir/executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <host_ir/executor.h>
#include <host_ir/lower.h>
#include <instrumentation.h>
#include <ir/iostream.h>
#include <ir/utils.h>
#include <multidevice/communication.h>
#include <multidevice/cuda_p2p.h>
Expand Down Expand Up @@ -631,6 +632,56 @@ void HostIrEvaluator::handle(LinearOp* linear) {
}
}

// Evaluates a Set-type LoadStoreOp on the host. The input tensor is viewed
// through the output's logical order (handling Set.Permute) and then either
// copied into a preallocated output buffer or bound to the output TensorView.
void HostIrEvaluator::handle(LoadStoreOp* load_store_op) {
  // Only plain Set is supported here; other LoadStoreOpTypes have no
  // host-side evaluation path in this handler.
  NVF_ERROR(
      load_store_op->opType() == LoadStoreOpType::Set,
      "LoadStoreOp must be a Set");
  NVF_ERROR(
      load_store_op->out()->isA<TensorView>(), "out must be a TensorView");
  auto* out_tv = load_store_op->out()->as<TensorView>();
  auto in_tensor = getKnownConcreteValue(load_store_op->in()).as<at::Tensor>();

  // `t` is the input viewed in the output's logical dimension order.
  // at::Tensor::permute creates a view; no data movement happens here.
  at::Tensor t;
  if (out_tv->hasRoot()) {
    // A root domain on the output marks this as a Set.Permute: the logical
    // domain must be a permutation of the root domain.
    std::optional<std::vector<int64_t>> permutation =
        ir_utils::computePermutation(
            out_tv->getRootDomain(), out_tv->getLogicalDomain());
    NVF_ERROR(
        permutation.has_value(),
        "The logical domain of a Set.Permute is supposed to be a permutation"
        " of the root domain: ",
        out_tv);
    t = in_tensor.permute(*permutation);
  } else {
    t = in_tensor;
  }

  if (isKnown(out_tv)) {
    // The output buffer is already known (e.g. preallocated by the caller),
    // so copy the data into it in place.
    auto out_tensor =
        getKnownConcreteValue(load_store_op->out()).as<at::Tensor>();
    out_tensor.copy_(t);
  } else {
    // For completeness, we may check if out_tv's allocation matches `t` and
    // copy data if yes. For example,
    //
    // clang-format off
    // ```
    // const auto& [sizes, strides] = inferShapeOfOutput(out_tv, expr_evaluator_);
    // if (strides == t.strides()) {
    //   bind(out_tv, t);
    // } else {
    //   auto out_tensor = at::empty_strided(sizes, strides, in_tensor.dtype());
    //   out_tensor.copy_(t);
    //   bind_(out_tv, out_tensor);
    // }
    // ```
    // clang-format on
    //
    // For now, I choose to keep code simple for the limited use cases.
    bind(out_tv, t);
  }
}

void HostIrEvaluator::handle(kir::Allocate* allocate) {
NVF_ERROR(
allocate->buffer()->isA<TensorView>(),
Expand All @@ -654,6 +705,78 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) {
bind(tv, tensor);
}

// Evaluates a BinaryOp whose output buffer is already known (preallocated),
// writing the result in place with the matching ATen *_out kernel. Ops
// whose output is not preallocated fall back to `unhandled`, which routes
// them through the expression evaluator.
void HostIrEvaluator::handle(BinaryOp* binary_op) {
  Val* out_val = binary_op->outputs().at(0);
  if (!isKnown(out_val)) {
    return unhandled(binary_op);
  }

  auto output = getKnownConcreteValue(out_val).as<at::Tensor>();
  auto lhs = getKnownConcreteValue(binary_op->inputs().at(0)).as<at::Tensor>();
  auto rhs = getKnownConcreteValue(binary_op->inputs().at(1)).as<at::Tensor>();

  const BinaryOpType op_type = binary_op->getBinaryOpType();
  switch (op_type) {
    case BinaryOpType::Add:
      at::add_out(output, lhs, rhs);
      break;
    case BinaryOpType::Sub:
      at::sub_out(output, lhs, rhs);
      break;
    case BinaryOpType::Mul:
      at::mul_out(output, lhs, rhs);
      break;
    case BinaryOpType::Div:
      at::div_out(output, lhs, rhs);
      break;
    default:
      NVF_THROW("Unexpected operator type: ", op_type, " in ", binary_op);
  }
}

// Evaluates a ReductionOp whose output buffer is already known
// (preallocated), using ATen's out-variant reduction kernels. Ops whose
// output is not preallocated fall back to `unhandled`.
void HostIrEvaluator::handle(ReductionOp* reduction_op) {
  auto* in_tv = reduction_op->in()->as<TensorView>();
  auto* out_tv = reduction_op->out()->as<TensorView>();
  if (!isKnown(out_tv)) {
    return unhandled(reduction_op);
  }

  // An rFactored reduction has a root domain on the output; that case is
  // not supported by this direct-evaluation path.
  NVF_ERROR(
      !out_tv->hasRoot(),
      "Evaluation for rFactored reductions is not supported.");
  auto in_tensor = getKnownConcreteValue(in_tv).as<at::Tensor>();
  auto out_tensor = getKnownConcreteValue(out_tv).as<at::Tensor>();

  // Collect the positions of the reduction IterDomains in the output's
  // logical domain; these are the axes ATen reduces over.
  const auto& logical_domain = out_tv->getLogicalDomain();
  std::vector<int64_t> reduced_dims;
  for (int64_t dim = 0; dim < static_cast<int64_t>(logical_domain.size());
       ++dim) {
    if (logical_domain.at(dim)->isReduction()) {
      reduced_dims.push_back(dim);
    }
  }

  switch (reduction_op->getReductionOpType()) {
    case BinaryOpType::Add:
      at::sum_out(out_tensor, in_tensor, reduced_dims);
      return;
    case BinaryOpType::Max:
      at::amax_out(out_tensor, in_tensor, reduced_dims);
      return;
    case BinaryOpType::Min:
      at::amin_out(out_tensor, in_tensor, reduced_dims);
      return;
    default:
      NVF_THROW(
          "Unexpected operator type: ",
          reduction_op->getReductionOpType(),
          " in ",
          reduction_op);
  }
}

void HostIrEvaluator::unhandled(Statement* stmt) {
NVF_ERROR(stmt->isA<Expr>(), stmt, " must be an Expr");
auto* expr = stmt->as<Expr>();
Expand Down
3 changes: 3 additions & 0 deletions csrc/host_ir/executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,9 @@ class HostIrEvaluator final : public OptOutDispatch {
void handle(MatmulOp* matmul) override;
void handle(LinearOp* linear) override;
void handle(kir::Allocate* allocate) override;
void handle(LoadStoreOp* load_store_op) override;
void handle(BinaryOp* binary_op) override;
void handle(ReductionOp* reduction_op) override;
void handle(ShareMemHandles* share_mem_handles) override;
void unhandled(Statement* stmt) override;

Expand Down
28 changes: 27 additions & 1 deletion csrc/host_ir/lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -615,7 +615,33 @@ std::vector<Expr*> HostIrLower::lowerToCollectiveBasedPipelinedGemmComm(
}

// Returns true when `expr` can be lowered directly to a standalone host IR
// op (evaluated by HostIrEvaluator) instead of being compiled into a fusion
// segment.
//
// NOTE: the stale pre-refactor line `return isResharding(expr);` that
// preceded the op-type checks has been removed — it made everything after
// it unreachable dead code.
bool HostIrLower::isLowerableAsStandaloneHostOp(Expr* expr) {
  // Ops with a dedicated HostIrEvaluator handler.
  if (expr->isOneOf<
          MatmulOp,
          SliceOp,
          SelectOp,
          LinearOp,
          BinaryOp,
          ReductionOp,
          Communication,
          P2PCommunication>()) {
    return true;
  }

  // Lower as standalone op "set" ops, i.e., LoadStoreOp of "Set" type with no
  // permute
  if (expr->isA<LoadStoreOp>()) {
    auto* load_store = expr->as<LoadStoreOp>();
    if (load_store->opType() == LoadStoreOpType::Set &&
        load_store->out()->isA<TensorView>()) {
      auto* tv = load_store->out()->as<TensorView>();
      // If the output tensor has no root, it means it has no permute
      if (!tv->hasRoot()) {
        return true;
      }
    }
  }

  return false;
}

bool HostIrLower::shouldMergeSegmentedGroups(
Expand Down
180 changes: 180 additions & 0 deletions tests/cpp/test_host_irs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1307,6 +1307,186 @@ TEST_F(HirAlias, ThrowOnInputAlias) {
EXPECT_ANY_THROW(HostIrEvaluator hie(std::move(hic)));
}

using HirSetTest = NVFuserTest;

// A Set between two preallocated tensors should behave as an in-place copy:
// after evaluation the output buffer holds the input's data.
TEST_F(HirSetTest, HostIr) {
  const std::vector<int64_t> sizes = {8, 64};

  auto hic = std::make_unique<HostIrContainer>();
  FusionGuard fg(hic.get());

  TensorView* in = makeConcreteTensor(sizes);
  TensorView* out = makeConcreteTensor(sizes);
  auto* set = IrBuilder::create<LoadStoreOp>(LoadStoreOpType::Set, out, in);
  hic->addInput(in);
  hic->addInput(out);
  hic->pushBackTopLevelExprs(set);

  HostIrEvaluator hie(std::move(hic));

  const auto options = at::TensorOptions().device(at::kCUDA, 0);
  at::Tensor in_tensor = at::randn(sizes, options);
  at::Tensor out_tensor = at::empty(sizes, options);

  hie.runWithInput({{in, in_tensor}, {out, out_tensor}});

  EXPECT_TRUE(out_tensor.equal(in_tensor))
      << "Obtained output: " << out_tensor << "\n"
      << "Expected output: " << in_tensor;
}

// Parameterized fixture for binary-op host IR tests; the parameter selects
// which BinaryOpType is exercised.
class HirBinaryOpTest : public NVFuserFixtureParamTest<BinaryOpType> {
 protected:
  // Computes the reference result for the parameterized op with plain ATen
  // arithmetic.
  at::Tensor executeBinaryOp(at::Tensor lhs, at::Tensor rhs) {
    switch (GetParam()) {
      case BinaryOpType::Add:
        return lhs + rhs;
      case BinaryOpType::Sub:
        return lhs - rhs;
      case BinaryOpType::Mul:
        return lhs * rhs;
      case BinaryOpType::Div:
        return lhs / rhs;
      default:
        // Was `NVF_ERROR("Unsupported binary op type ", GetParam())`: a
        // string literal as the first argument is treated as an always-true
        // condition, so the error could never fire. NVF_THROW reports
        // unconditionally.
        NVF_THROW("Unsupported binary op type ", GetParam());
        return at::Tensor(); // unreachable; silences missing-return warnings
    }
  }
};

// Verifies that HostIrEvaluator writes the parameterized binary op's result
// into an output buffer supplied by the caller.
TEST_P(HirBinaryOpTest, PreAllocatedOutputs) {
  const std::vector<int64_t> sizes = {8, 64};
  const auto& binary_op_type = GetParam();

  auto hic = std::make_unique<HostIrContainer>();
  FusionGuard fg(hic.get());

  TensorView* lhs = makeConcreteTensor(sizes);
  TensorView* rhs = makeConcreteTensor(sizes);
  TensorView* out = makeConcreteTensor(sizes);
  auto* binary_op = IrBuilder::create<BinaryOp>(binary_op_type, out, lhs, rhs);
  // The output is registered as an input so it can be bound to a
  // preallocated buffer.
  for (TensorView* tv : {lhs, rhs, out}) {
    hic->addInput(tv);
  }
  hic->pushBackTopLevelExprs(binary_op);

  HostIrEvaluator hie(std::move(hic));

  const auto options = at::TensorOptions().device(at::kCUDA, 0);
  at::Tensor lhs_tensor = at::randn(sizes, options);
  at::Tensor rhs_tensor = at::randn(sizes, options);
  at::Tensor out_tensor = at::empty(sizes, options);

  hie.runWithInput({{lhs, lhs_tensor}, {rhs, rhs_tensor}, {out, out_tensor}});

  at::Tensor expected_out = executeBinaryOp(lhs_tensor, rhs_tensor);
  EXPECT_TRUE(expected_out.equal(out_tensor))
      << "Obtained output: " << out_tensor << "\n"
      << "Expected output: " << expected_out;
}

// Same as PreAllocatedOutputs, except the evaluator is responsible for
// allocating the output tensor, which is returned by runWithInput.
TEST_P(HirBinaryOpTest, NonPreAllocatedOutputs) {
  const std::vector<int64_t> sizes = {8, 64};
  const auto& binary_op_type = GetParam();

  auto hic = std::make_unique<HostIrContainer>();
  FusionGuard fg(hic.get());

  TensorView* lhs = makeConcreteTensor(sizes);
  TensorView* rhs = makeConcreteTensor(sizes);
  auto* out = binaryOp(binary_op_type, lhs, rhs);
  hic->addInput(lhs);
  hic->addInput(rhs);
  hic->addOutput(out);
  hic->pushBackTopLevelExprs(out->definition());

  HostIrEvaluator hie(std::move(hic));

  const auto options = at::TensorOptions().device(at::kCUDA, 0);
  at::Tensor lhs_tensor = at::randn(sizes, options);
  at::Tensor rhs_tensor = at::randn(sizes, options);

  auto out_tensor = hie.runWithInput({{lhs, lhs_tensor}, {rhs, rhs_tensor}})[0]
                        .as<at::Tensor>();

  at::Tensor expected_out = executeBinaryOp(lhs_tensor, rhs_tensor);
  EXPECT_TRUE(expected_out.equal(out_tensor))
      << "Obtained output: " << out_tensor << "\n"
      << "Expected output: " << expected_out;
}

// Instantiates HirBinaryOpTest for each supported elementwise op; the name
// generator turns the parameter into a readable test-name suffix.
INSTANTIATE_TEST_SUITE_P(
    ,
    HirBinaryOpTest,
    testing::Values(
        BinaryOpType::Add,
        BinaryOpType::Sub,
        BinaryOpType::Mul,
        BinaryOpType::Div),
    [](const testing::TestParamInfo<BinaryOpType>& info) -> std::string {
      std::ostringstream name;
      name << "BinaryOpType_" << info.param;
      return name.str();
    });

using HirReductionOpTest = NVFuserTest;

// Sums over one axis with the result written into a caller-provided buffer.
TEST_F(HirReductionOpTest, PreAllocatedOutputs) {
  constexpr int64_t size0 = 8, size1 = 64;
  constexpr int64_t reduction_axis = 1;

  auto hic = std::make_unique<HostIrContainer>();
  FusionGuard fg(hic.get());

  auto* in = makeConcreteTensor({size0, size1});
  auto* out = newForReduction(in, {reduction_axis}, in->dtype());
  auto* reduction_op = IrBuilder::create<ReductionOp>(
      BinaryOpType::Add, hic->zeroVal(), out, in);
  hic->addInput(in);
  hic->addOutput(out);
  hic->pushBackTopLevelExprs(reduction_op);

  HostIrEvaluator hie(std::move(hic));

  const auto options = at::TensorOptions().device(at::kCUDA, 0);
  at::Tensor in_tensor = at::randn({size0, size1}, options);
  at::Tensor out_tensor = at::empty({size0}, options);

  hie.runWithInput({{in, in_tensor}, {out, out_tensor}});

  at::Tensor expected_out = in_tensor.sum(reduction_axis);
  EXPECT_TRUE(expected_out.equal(out_tensor))
      << "Obtained output: " << out_tensor << "\n"
      << "Expected output: " << expected_out;
}

// Sums over one axis WITHOUT preallocating the output: the evaluator must
// allocate the result itself and return it from runWithInput.
//
// Fix: the previous version pre-allocated `out_aten` and bound it via
// runWithInput, so — contrary to its name — it never exercised the
// non-preallocated path. It now mirrors
// HirBinaryOpTest.NonPreAllocatedOutputs.
TEST_F(HirReductionOpTest, NonPreAllocatedOutputs) {
  constexpr int64_t size0 = 8, size1 = 64;
  constexpr int64_t reduction_axis = 1;

  auto hic = std::make_unique<HostIrContainer>();
  FusionGuard fg(hic.get());

  auto* in = makeConcreteTensor({size0, size1});
  auto* out = sum(in, {reduction_axis});
  hic->addInput(in);
  hic->addOutput(out);
  hic->pushBackTopLevelExprs(out->definition());

  HostIrEvaluator hie(std::move(hic));

  auto options = at::TensorOptions().device(at::kCUDA, 0);
  auto in_aten = at::randn({size0, size1}, options);

  // Only the input is bound; the output comes back from the evaluator.
  auto out_aten = hie.runWithInput({{in, in_aten}})[0].as<at::Tensor>();

  at::Tensor expected_out = in_aten.sum(reduction_axis);
  EXPECT_TRUE(expected_out.equal(out_aten))
      << "Obtained output: " << out_aten << "\n"
      << "Expected output: " << expected_out;
}

} // namespace hir

} // namespace nvfuser
Loading