diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 89710eaae4b..4ceef8927ed 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -631,6 +632,56 @@ void HostIrEvaluator::handle(LinearOp* linear) { } } +void HostIrEvaluator::handle(LoadStoreOp* load_store_op) { + NVF_ERROR( + load_store_op->opType() == LoadStoreOpType::Set, + "LoadStoreOp must be a Set"); + NVF_ERROR( + load_store_op->out()->isA(), "out must be a TensorView"); + auto* out_tv = load_store_op->out()->as(); + auto in_tensor = getKnownConcreteValue(load_store_op->in()).as(); + + at::Tensor t; + if (out_tv->hasRoot()) { + std::optional> permutation = + ir_utils::computePermutation( + out_tv->getRootDomain(), out_tv->getLogicalDomain()); + NVF_ERROR( + permutation.has_value(), + "The logical domain of a Set.Permute is supposed to be a permutation" + " of the root domain: ", + out_tv); + t = in_tensor.permute(*permutation); + } else { + t = in_tensor; + } + + if (isKnown(out_tv)) { + auto out_tensor = + getKnownConcreteValue(load_store_op->out()).as(); + out_tensor.copy_(t); + } else { + // For completeness, we may check if out_tv's allocation matches `t` and + // copy data if yes. For example, + // + // clang-format off + // ``` + // const auto& [sizes, strides] = inferShapeOfOutput(out_tv, expr_evaluator_); + // if (strides == t.strides()) { + // bind(out_tv, t); + // } else { + // auto out_tensor = at::empty_strided(sizes, strides, in_tensor.dtype()); + // out_tensor.copy_(t); + // bind_(out_tv, out_tensor); + // } + // ``` + // clang-format on + // + // For now, I choose to keep code simple for the limited use cases. + bind(out_tv, t); + } +} + void HostIrEvaluator::handle(kir::Allocate* allocate) { NVF_ERROR( allocate->buffer()->isA(), @@ -654,6 +705,78 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { bind(tv, tensor); } +void HostIrEvaluator::handle(BinaryOp* binary_op) { + if (!isKnown(binary_op->outputs().at(0))) { + return unhandled(binary_op); + } + + auto lhs = getKnownConcreteValue(binary_op->inputs().at(0)).as(); + auto rhs = getKnownConcreteValue(binary_op->inputs().at(1)).as(); + auto output = + getKnownConcreteValue(binary_op->outputs().at(0)).as(); + + switch (binary_op->getBinaryOpType()) { + case BinaryOpType::Add: + at::add_out(output, lhs, rhs); + break; + case BinaryOpType::Sub: + at::sub_out(output, lhs, rhs); + break; + case BinaryOpType::Mul: + at::mul_out(output, lhs, rhs); + break; + case BinaryOpType::Div: + at::div_out(output, lhs, rhs); + break; + default: + NVF_THROW( + "Unexpected operator type: ", + binary_op->getBinaryOpType(), + " in ", + binary_op); + } +} + +void HostIrEvaluator::handle(ReductionOp* reduction_op) { + auto input_tv = reduction_op->in()->as(); + auto output_tv = reduction_op->out()->as(); + if (!isKnown(output_tv)) { + return unhandled(reduction_op); + } + + NVF_ERROR( + !output_tv->hasRoot(), + "Evaluation for rFactored reductions is not supported."); + auto input = getKnownConcreteValue(input_tv).as(); + auto output = getKnownConcreteValue(output_tv).as(); + + std::vector reduction_axes; + for (const auto i : + c10::irange(int64_t(output_tv->getLogicalDomain().size()))) { + auto ax = output_tv->getLogicalDomain().at(i); + if (ax->isReduction()) { + reduction_axes.push_back(i); + } + } + switch (reduction_op->getReductionOpType()) { + case BinaryOpType::Add: + at::sum_out(output, input, reduction_axes); + return; + case BinaryOpType::Max: + at::amax_out(output, input, reduction_axes); + return; + case BinaryOpType::Min: + at::amin_out(output, input, reduction_axes); + return; + default: + NVF_THROW( + "Unexpected operator type: ", + reduction_op->getReductionOpType(), + " in ", + reduction_op); + } +} + void HostIrEvaluator::unhandled(Statement* stmt) { NVF_ERROR(stmt->isA(), stmt, " must be an Expr"); auto* expr = stmt->as(); diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index d71b74e0dda..dfe84fba068 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -134,6 +134,9 @@ class HostIrEvaluator final : public OptOutDispatch { void handle(MatmulOp* matmul) override; void handle(LinearOp* linear) override; void handle(kir::Allocate* allocate) override; + void handle(LoadStoreOp* load_store_op) override; + void handle(BinaryOp* binary_op) override; + void handle(ReductionOp* reduction_op) override; void handle(ShareMemHandles* share_mem_handles) override; void unhandled(Statement* stmt) override; diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 1cdc21e60c0..fd14096b190 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -615,7 +615,33 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( } bool HostIrLower::isLowerableAsStandaloneHostOp(Expr* expr) { - return isResharding(expr); + if (expr->isOneOf< + MatmulOp, + SliceOp, + SelectOp, + LinearOp, + BinaryOp, + ReductionOp, + Communication, + P2PCommunication>()) { + return true; + } + + // Lower as standalone op "set" ops, i.e., LoadStoreOp of "Set" type with no + // permute + if (expr->isA()) { + auto* load_store = expr->as(); + if (load_store->opType() == LoadStoreOpType::Set && + load_store->out()->isA()) { + auto* tv = load_store->out()->as(); + // If the output tensor has no root, it means it has no permute + if (!tv->hasRoot()) { + return true; + } + } + } + + return false; } bool HostIrLower::shouldMergeSegmentedGroups( diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index 6a41e47c744..633ebc83504 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -1307,6 +1307,186 @@ TEST_F(HirAlias, ThrowOnInputAlias) { EXPECT_ANY_THROW(HostIrEvaluator hie(std::move(hic))); } +using HirSetTest = NVFuserTest; + +TEST_F(HirSetTest, HostIr) { + const std::vector sizes = {8, 64}; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto* in = makeConcreteTensor(sizes); + auto* out = makeConcreteTensor(sizes); + auto* set = IrBuilder::create(LoadStoreOpType::Set, out, in); + hic->addInput(in); + hic->addInput(out); + hic->pushBackTopLevelExprs(set); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto in_aten = at::randn(sizes, options); + auto out_aten = at::empty(sizes, options); + + hie.runWithInput({{in, in_aten}, {out, out_aten}}); + + EXPECT_TRUE(out_aten.equal(in_aten)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << in_aten; +} + +class HirBinaryOpTest : public NVFuserFixtureParamTest { + protected: + at::Tensor executeBinaryOp(at::Tensor lhs, at::Tensor rhs) { + switch (GetParam()) { + case BinaryOpType::Add: + return lhs + rhs; + case BinaryOpType::Sub: + return lhs - rhs; + case BinaryOpType::Mul: + return lhs * rhs; + case BinaryOpType::Div: + return lhs / rhs; + default: + NVF_ERROR("Unsupported binary op type ", GetParam()); + return at::Tensor(); + } + } +}; + +TEST_P(HirBinaryOpTest, PreAllocatedOutputs) { + const std::vector sizes = {8, 64}; + const auto& binary_op_type = GetParam(); + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto* lhs = makeConcreteTensor(sizes); + auto* rhs = makeConcreteTensor(sizes); + auto* out = makeConcreteTensor(sizes); + auto* binary_op = IrBuilder::create(binary_op_type, out, lhs, rhs); + hic->addInput(lhs); + hic->addInput(rhs); + hic->addInput(out); + hic->pushBackTopLevelExprs(binary_op); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto lhs_aten = at::randn(sizes, options); + auto rhs_aten = at::randn(sizes, options); + auto out_aten = at::empty(sizes, options); + + hie.runWithInput({{lhs, lhs_aten}, {rhs, rhs_aten}, {out, out_aten}}); + + at::Tensor expected_out = executeBinaryOp(lhs_aten, rhs_aten); + EXPECT_TRUE(expected_out.equal(out_aten)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + +TEST_P(HirBinaryOpTest, NonPreAllocatedOutputs) { + const std::vector sizes = {8, 64}; + const auto& binary_op_type = GetParam(); + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto* lhs = makeConcreteTensor(sizes); + auto* rhs = makeConcreteTensor(sizes); + auto* out = binaryOp(binary_op_type, lhs, rhs); + hic->addInput(lhs); + hic->addInput(rhs); + hic->addOutput(out); + hic->pushBackTopLevelExprs(out->definition()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto lhs_aten = at::randn(sizes, options); + auto rhs_aten = at::randn(sizes, options); + + auto out_aten = + hie.runWithInput({{lhs, lhs_aten}, {rhs, rhs_aten}})[0].as(); + + at::Tensor expected_out = executeBinaryOp(lhs_aten, rhs_aten); + EXPECT_TRUE(expected_out.equal(out_aten)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + +INSTANTIATE_TEST_SUITE_P( + , + HirBinaryOpTest, + testing::Values( + BinaryOpType::Add, + BinaryOpType::Sub, + BinaryOpType::Mul, + BinaryOpType::Div), + [](const testing::TestParamInfo& info) -> std::string { + std::stringstream ss; + ss << "BinaryOpType_" << info.param; + return ss.str(); + }); + +using HirReductionOpTest = NVFuserTest; + +TEST_F(HirReductionOpTest, PreAllocatedOutputs) { + constexpr int64_t size0 = 8, size1 = 64; + constexpr int64_t reduction_axis = 1; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto* in = makeConcreteTensor({size0, size1}); + auto* out = newForReduction(in, {reduction_axis}, in->dtype()); + auto* reduction_op = IrBuilder::create( + BinaryOpType::Add, hic->zeroVal(), out, in); + hic->addInput(in); + hic->addOutput(out); + hic->pushBackTopLevelExprs(reduction_op); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto in_aten = at::randn({size0, size1}, options); + auto out_aten = at::empty({size0}, options); + + hie.runWithInput({{in, in_aten}, {out, out_aten}}); + + at::Tensor expected_out = in_aten.sum(reduction_axis); + EXPECT_TRUE(expected_out.equal(out_aten)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + +TEST_F(HirReductionOpTest, NonPreAllocatedOutputs) { + constexpr int64_t size0 = 8, size1 = 64; + constexpr int64_t reduction_axis = 1; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto* in = makeConcreteTensor({size0, size1}); + auto* out = sum(in, {reduction_axis}); + hic->addInput(in); + hic->addOutput(out); + hic->pushBackTopLevelExprs(out->definition()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto in_aten = at::randn({size0, size1}, options); + auto out_aten = at::empty({size0}, options); + + hie.runWithInput({{in, in_aten}, {out, out_aten}}); + + at::Tensor expected_out = in_aten.sum(reduction_axis); + EXPECT_TRUE(expected_out.equal(out_aten)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + } // namespace hir } // namespace nvfuser diff --git a/tests/cpp/test_multidevice_pipeline.cpp b/tests/cpp/test_multidevice_pipeline.cpp index 5985571c57a..12dfed5dd43 100644 --- a/tests/cpp/test_multidevice_pipeline.cpp +++ b/tests/cpp/test_multidevice_pipeline.cpp @@ -457,135 +457,4 @@ INSTANTIATE_TEST_SUITE_P( testing::Values(0, 1), testing::Values(true))); -// Different scheduling modes used in -// PipelineTestStagedReduction.StagedReduction -enum class SchedulingMode { - // Manual interdevice scheduling, no intra-device scheduling - InterDeviceOnly, - // Manual inter-/intra-device scheduling - Manual, - // Manual inter-device scheduling, composed with fully automated intra-device - // scheduling (through FusionExecutorCache) - Automatic, -}; - -std::ostream& operator<<(std::ostream& out, const SchedulingMode& mode) { - switch (mode) { - case SchedulingMode::InterDeviceOnly: - out << "InterDeviceOnly"; - break; - case SchedulingMode::Manual: - out << "Manual"; - break; - case SchedulingMode::Automatic: - out << "Automatic"; - break; - } - return out; -} - -class PipelineTestStagedReduction - : public PipelineTest, - public ::testing::WithParamInterface {}; - -// 1D staged reduction -// Inputs: X[num_devices,B,C] -TEST_P(PipelineTestStagedReduction, StagedReduction) { - auto scheduling_mode = GetParam(); - - const int num_devices = communicator_->size(); - constexpr int B = 8; - constexpr int C = 64; - - FusionGuard fg(fusion.get()); - // The first dimension is made symbolic so `tv_out->definition()` won't - // become a squeeze when num_devices == 1. This wouldn't be a problem for - // automatic mode. However, for the manual mode, the scheduling code below - // assumes `tv_out->definition()` can be lowered to communication. A squeeze - // can't. - TensorView* tv0 = TensorViewBuilder() - .dtype(DataType::Float) - .contiguity(true) - .shape({-1, B, C}) - .build(); - auto mesh = DeviceMesh::createForNumDevices(num_devices); - tv0->setDeviceMesh(mesh); - TensorView* tv1 = sum(tv0, {2}); - TensorView* tv_out = sum(tv1, {0}); - fusion->addInput(tv0); - fusion->addOutput(tv_out); - - for (auto* tv : {tv0, tv1}) { - tv->axis(0)->parallelize(ParallelType::DIDx); - } - - // Intra-device reduction scheduling for the first reduction: - switch (scheduling_mode) { - case SchedulingMode::InterDeviceOnly: - break; - case SchedulingMode::Manual: { - // inspired from NVFuserTest.FusionReduction1_CUDA - // tv0[I0{A}, I1{B}, I2{C}] - tv1->split(2, 32); - // tv1[I0{A}, I1{B}, R2o{C/32}, R2i{32}] = tv0[I0{A}, I1{B}, I2{C}] - tv1->split(2, 4); - // clang-format off - // tv1[I0{A}, I1{B}, R2oo{C/32/4)}, R2oi{4}, R2i{32}] = tv0[I0{A}, I1{B}, I2{C}] - // clang-format on - - TensorView* tv2 = tv1->rFactor({2}); - // clang-format off - // tv2[I0{A}, I1{B}, R2oo{C/32/4)}, I2oi{4}, I2i{32}] = tv0[I0{A}, I1{B}, I2{C}] - // tv1[I0{A}, I1{B}, R2oi{4}, R2i{32}] = tv2[I0{A}, I1{B}, R2oo{C/32/4)}, I2oi{4}, I2i{32}] - // clang-format on - - TensorView* tv3 = tv1->rFactor({2}); - // clang-format off - // tv2[I0{A}, I1{B}, R2oo{C/32/4)}, I2oi{4}, I2i{32}] = tv0[I0{A}, I1{B}, I2{C}] - // tv3[I0{A}, I1{B}, R2oi{4}, I2i{32}] = tv2[I0{A}, I1{B}, R2oo{C/32/4)}, I2oi{4}, I2i{32}] - // tv1[I0{A}, I1{B}, R2i{32}] = tv3[I0{A}, I1{B}, R2oi{4}, I2i{32}] - // clang-format on - - // tv1 is a segment boundary so must be in global. This wouldn't be - // needed if the fusion were scheduled automatically. - tv1->setMemoryType(MemoryType::Global); - - // Use `tv2` as the reference tensor because it contains the most - // parallel IterDomains. - tv2->axis(1)->parallelize(ParallelType::BIDx); - tv2->axis(3)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike( - tv2, - /*pos=*/-1, - // Don't propagate the parallelization to `tv_out` because that's in - // a different, resharding segment. - /*selected_tv=*/{tv0, tv1, tv2, tv3}); - inlineMost(); - break; - } - case SchedulingMode::Automatic: - host_ir_executor_params.use_fusion_executor_cache = true; - break; - } - - at::Tensor unsharded_input_tensor = - at::randn({num_devices, B, C}, tensor_options); - at::Tensor ref_unsharded_output_tensor = - unsharded_input_tensor.sum(at::IntArrayRef({0, 2})); - unsharded_args = {unsharded_input_tensor}; - ref_unsharded_outputs = {ref_unsharded_output_tensor}; - - executeAndValidate(/* validate_with_prescribed_values */ true); -} - -INSTANTIATE_TEST_SUITE_P( - , - PipelineTestStagedReduction, - testing::Values( - SchedulingMode::InterDeviceOnly, - SchedulingMode::Manual, - SchedulingMode::Automatic), - testing::PrintToStringParamName()); - } // namespace nvfuser