From 9a0dc9e8d6e0cdc8699384a8bae203591d01ec4b Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 04:29:58 -0700 Subject: [PATCH 01/38] host ir alias and prealloc output support --- csrc/host_ir/container.cpp | 5 + csrc/host_ir/container.h | 12 ++ csrc/host_ir/executor.cpp | 216 ++++++++++++++++++------------------ csrc/host_ir/executor.h | 33 +++++- tests/cpp/test_host_irs.cpp | 112 ++++++++++++++++++- 5 files changed, 268 insertions(+), 110 deletions(-) diff --git a/csrc/host_ir/container.cpp b/csrc/host_ir/container.cpp index ca4c98f2a56..a133c0521ba 100644 --- a/csrc/host_ir/container.cpp +++ b/csrc/host_ir/container.cpp @@ -35,6 +35,11 @@ Stream* HostIrContainer::getDefaultStream() { std::ostream& HostIrContainer::print(std::ostream& os) const { IrMathPrinter op_exprs(os); op_exprs.handle(this); + os << "Aliases:{"; + for (const auto& alias : alias_) { + os << "\n " << alias.first << " -> " << alias.second; + } + os << "\n}\n"; return os; } diff --git a/csrc/host_ir/container.h b/csrc/host_ir/container.h index ab029f63fa2..7dcd66b4436 100644 --- a/csrc/host_ir/container.h +++ b/csrc/host_ir/container.h @@ -55,10 +55,22 @@ class HostIrContainer final : public Fusion { Stream* getDefaultStream(); + void markAlias(TensorView* original, const TensorView* new_alias) { + if (alias_.count(original)) { + original = alias_[original]->as(); + } + alias_[new_alias] = original; + } + + const auto& alias() const { + return alias_; + } + private: std::vector top_level_exprs_; std::vector> kernel_executors_; Stream* default_stream_ = nullptr; + std::unordered_map alias_; }; } // namespace hir diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 4b6c2fe6796..4b631bd0061 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -171,32 +171,6 @@ KernelArgumentHolder HostIrExecutor::run( namespace hir { -namespace { - -at::Tensor getKnownTensorOrUndefined( - Val* val, - const ExpressionEvaluator& expr_evaluator) { - return 
expr_evaluator.isKnown(val) - ? expr_evaluator.evaluate(val).as() - : at::Tensor(); -} - -KernelArgumentHolder getKnownTensorOrUndefined( - const std::vector& vals, - const ExpressionEvaluator& expr_evaluator) { - std::vector tensors(vals.size()); - std::transform( - vals.begin(), - vals.end(), - tensors.begin(), - [&expr_evaluator](Val* val) -> at::Tensor { - return getKnownTensorOrUndefined(val, expr_evaluator); - }); - return KernelArgumentHolder(tensors); -} - -} // namespace - HostIrEvaluator::HostIrEvaluator( std::unique_ptr container, Communicator* communicator, @@ -216,10 +190,23 @@ HostIrEvaluator::HostIrEvaluator( {container_->getDefaultStream(), c10::cuda::getDefaultCUDAStream( static_cast(device_index))}); - expr_evaluator_.bind("numberOfStreams", params_.number_of_streams); + NVF_ERROR( + std::all_of( + container_->inputs().begin(), + container_->inputs().end(), + [this](Val* input) { return !container_->alias().count(input); }), + "Inputs cannot be aliased"); } -KernelArgumentHolder HostIrEvaluator::dispatchAndCollectOutputs() { +KernelArgumentHolder HostIrEvaluator::runWithInput( + const std::unordered_map& val_to_PValue) { + expr_evaluator_ = ExpressionEvaluator(); + expr_evaluator_.bind("numberOfStreams", params_.number_of_streams); + // process input values, converting IValue to PolymorphicValue + for (const auto& [val, pvalue] : val_to_PValue) { + bind(val, pvalue); + } + // Interpret each instruction in an "eager" way by iterate over the Host Ir // Container's top level expression list for (auto expr : container_->topLevelExprs()) { @@ -227,17 +214,15 @@ KernelArgumentHolder HostIrEvaluator::dispatchAndCollectOutputs() { } // Collect global outputs - return getKnownTensorOrUndefined(container_->outputs(), expr_evaluator_); -} - -KernelArgumentHolder HostIrEvaluator::runWithInput( - const std::unordered_map& val_to_PValue) { - // process input values, converting IValue to PolymorphicValue - for (const auto& [val, pvalue] : val_to_PValue) { - 
expr_evaluator_.bind(val, pvalue); - } - - return dispatchAndCollectOutputs(); + std::vector outputs(container_->outputs().size()); + std::transform( + container_->outputs().begin(), + container_->outputs().end(), + outputs.begin(), + [this](Val* val) -> at::Tensor { + return this->getKnownTensorOrUndefined(val); + }); + return KernelArgumentHolder(outputs); } std::string HostIrEvaluator::canRun() const { @@ -320,13 +305,7 @@ void HostIrEvaluator::handle(Synchronize* synchronize) { void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { KernelArgumentHolder args; for (auto& input : launch_kernel->inputs()) { - NVF_ERROR( - expr_evaluator_.isKnown(input), - "No buffer associated with Val ", - input, - " for handling ", - launch_kernel->toString()); - args.push(expr_evaluator_.evaluate(input)); + args.push(getKnownConcreteData(input)); } args.setDeviceIndex(); @@ -341,25 +320,35 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { // Store the outputs in the context for (auto output_idx : c10::irange(outputs.size())) { - expr_evaluator_.bind( - launch_kernel->outputs().at(output_idx), outputs[output_idx]); + bind(launch_kernel->outputs().at(output_idx), outputs[output_idx]); } } void HostIrEvaluator::handle(PostOnStream* post_ir) { KernelArgumentHolder input_args; for (auto& input : post_ir->inputs()) { - NVF_ERROR( - expr_evaluator_.isKnown(input), - "No buffer associated with Val ", - input, - " for handling ", - post_ir->toString()); - input_args.push(expr_evaluator_.evaluate(input)); + input_args.push(getKnownConcreteData(input)); } input_args.setDeviceIndex(); // placeholder for storing the outputs KernelArgumentHolder outputs; + bool use_preallocated_outputs = std::all_of( + post_ir->outputs().begin(), + post_ir->outputs().end(), + [this](Val* output) { return this->isKnown(output); }); + NVF_ERROR( + use_preallocated_outputs || + std::all_of( + post_ir->outputs().begin(), + post_ir->outputs().end(), + [this](Val* output) { return 
!this->isKnown(output); }), + "outputs must be all or none preallocated in expr ", + post_ir); + if (use_preallocated_outputs) { + for (auto output : post_ir->outputs()) { + outputs.push(getKnownConcreteData(output)); + } + } NVF_ERROR( post_ir->hostOpToPost()->isA(), @@ -376,16 +365,23 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) { /*fusion_id=*/0, !params_.skip_auto_scheduling); } - outputs = fec_.at(hu).runFusionWithInputs(input_args); + if (use_preallocated_outputs) { + TORCH_WARN( + "FusionExecutorCache does not support with preallocated outputs, so we are copying the outputs in expr ", + post_ir); + auto tmp_outputs = fec_.at(hu).runFusionWithInputs(input_args); + for (auto output_idx : c10::irange(tmp_outputs.size())) { + outputs[output_idx].as().copy_( + tmp_outputs[output_idx].as()); + } + } else { + outputs = fec_.at(hu).runFusionWithInputs(input_args); + } } else { // This path should generally be avoided as it will likely send the fusion // held in HostUnit directly to KernelExecutor which means it will try to // compile and run a device kernel with a single thread. 
- if (auto it = executors_.find(hu); it != executors_.end()) { - ExecutorAbstract* ea = it->second.get(); - outputs = ExecutorDispatch::run(ea, input_args); - - } else { + if (auto it = executors_.find(hu); it == executors_.end()) { DynamicTransform::concretizeFusion(hu->fusion_to_execute(), input_args); auto it2 = executors_.insert( {hu, @@ -402,14 +398,20 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) { } else { ExecutorDispatch::compile(ea, hu->fusion_to_execute()); } + } + ExecutorAbstract* ea = executors_[hu].get(); + if (use_preallocated_outputs) { + ExecutorDispatch::run(ea, input_args, outputs); + } else { outputs = ExecutorDispatch::run(ea, input_args); } } - // Store the outputs in the context - for (auto output_idx : c10::irange(outputs.size())) { - expr_evaluator_.bind( - post_ir->outputs().at(output_idx), outputs[output_idx]); + if (!use_preallocated_outputs) { + // Store the outputs in the context + for (auto output_idx : c10::irange(outputs.size())) { + bind(post_ir->outputs().at(output_idx), outputs[output_idx]); + } } } @@ -418,10 +420,9 @@ void HostIrEvaluator::handle(Communication* communication) { communicator_ != nullptr && communicator_->is_available(), "A valid communicator must be provided"); - at::Tensor input_tensor = - getKnownTensorOrUndefined(communication->input(0), expr_evaluator_); + at::Tensor input_tensor = getKnownTensorOrUndefined(communication->input(0)); at::Tensor output_tensor = - getKnownTensorOrUndefined(communication->output(0), expr_evaluator_); + getKnownTensorOrUndefined(communication->output(0)); CommunicatorBackend backend_type = communication->backend(); c10d::Backend* backend = @@ -439,8 +440,7 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { communicator_ != nullptr && communicator_->is_available(), "A valid communicator must be provided"); - at::Tensor buffer = - getKnownTensorOrUndefined(communication->buffer(), expr_evaluator_); + at::Tensor buffer = 
getKnownTensorOrUndefined(communication->buffer()); works_[communication] = postSingleCommunication( communication, @@ -495,11 +495,11 @@ void HostIrEvaluator::handle(ForLoop* for_loop) { for (auto i = start; i < stop; i += step) { // invalidate i and its consumers before binding - expr_evaluator_.invalidate(for_loop->index()); + invalidate(for_loop->index()); for (auto consumer : allConsumerValsOf(for_loop->index())) { - expr_evaluator_.invalidate(consumer); + invalidate(consumer); } - expr_evaluator_.bind(for_loop->index(), i); + bind(for_loop->index(), i); for (Expr* expr : for_loop->body().exprs()) { dispatch(expr); } @@ -536,15 +536,11 @@ void HostIrEvaluator::handle(MatmulOp* matmul) { TensorView* a = matmul->inA(); TensorView* b = matmul->inB(); TensorView* out = matmul->out(); - NVF_ERROR( - expr_evaluator_.isKnown(a) && expr_evaluator_.isKnown(b), - "Inputs of the matmul ", - matmul->toString(), - "must be precomputed before being retrieved"); - if (expr_evaluator_.isKnown(out)) { - auto t_a = expr_evaluator_.evaluate(a).as(); - auto t_b = expr_evaluator_.evaluate(b).as(); - auto t_out = expr_evaluator_.evaluate(out).as(); + + if (isKnown(out)) { + auto t_a = getKnownConcreteData(a).as(); + auto t_b = getKnownConcreteData(b).as(); + auto t_out = getKnownConcreteData(out).as(); at::matmul_out(t_out, t_a, t_b); } else { unhandled(matmul); @@ -556,24 +552,18 @@ void HostIrEvaluator::handle(LinearOp* linear) { TensorView* weight = linear->inB()->as(); TensorView* bias = linear->bias()->as(); TensorView* out = linear->out()->as(); - NVF_ERROR( - expr_evaluator_.isKnown(in) && expr_evaluator_.isKnown(weight) && - (!linear->has_bias() || expr_evaluator_.isKnown(bias)), - "Inputs of the Linear Op ", - linear->toString(), - "must be precomputed before being retrieved"); - if (!expr_evaluator_.isKnown(out)) { + if (!isKnown(out)) { unhandled(linear); return; } - auto in_at = expr_evaluator_.evaluate(in).as(); - auto weight_at = expr_evaluator_.evaluate(weight).as(); 
- auto out_at = expr_evaluator_.evaluate(out).as(); + auto in_at = getKnownConcreteData(in).as(); + auto weight_at = getKnownConcreteData(weight).as(); + auto out_at = getKnownConcreteData(out).as(); if (linear->has_bias()) { - auto bias_at = expr_evaluator_.evaluate(bias).as(); + auto bias_at = getKnownConcreteData(bias).as(); at::linear_out(out_at, in_at, weight_at.squeeze(), bias_at.squeeze()); } else { at::linear_out(out_at, in_at, weight_at.squeeze()); @@ -600,25 +590,37 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { c10::nullopt, device, c10::nullopt); - - expr_evaluator_.bind(tv, tensor); + bind(tv, tensor); } void HostIrEvaluator::unhandled(Statement* stmt) { NVF_ERROR(stmt->isA(), stmt, " must be an Expr"); auto* expr = stmt->as(); - for (auto input : ir_utils::filterByType(expr->inputs())) { - NVF_ERROR( - expr_evaluator_.isKnown(input), - "input ", - input->toString(), - " of the expression ", - expr->toString(), - "must be precomputed before being retrieved"); - } - for (auto output : expr->outputs()) { - expr_evaluator_.bind( - output, expr_evaluator_.evaluate(output), /*evaluate_validate=*/true); + std::vector inputs; + for (auto input : expr->inputs()) { + if (input->isA()) { + // Tensor inputs must be already computed at this point + inputs.push_back(getKnownConcreteData(input)); + } else { + inputs.push_back(expr_evaluator_.evaluate(input)); + } + } + + // Check that there is no pre-allocated output + NVF_ERROR( + std::all_of( + expr->outputs().begin(), + expr->outputs().end(), + [this](Val* output) { + return !this->expr_evaluator_.isKnown(output); + }), + "Do not support pre-allocated outputs for the op ", + expr); + // using ExpressionEvaluator::evaluate to evaluate the output is not valid + // here if the output or one of its producer is an alias + auto concrete_outputs = expr->evaluate(expr_evaluator_, inputs); + for (int64_t i : c10::irange(expr->outputs().size())) { + bind(expr->output(i), concrete_outputs.at(i)); } } diff --git 
a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index f1b8ed4ef88..73f52a7bb90 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -133,7 +133,38 @@ class HostIrEvaluator final : public OptOutDispatch { c10::cuda::CUDAStream getCUDAStream(Stream* stream); - KernelArgumentHolder dispatchAndCollectOutputs(); + Val* getAlias(Val* val) const { + const auto& aliases = container_->alias(); + auto it = aliases.find(val); + return it != aliases.end() ? it->second : val; + } + + bool isKnown(Val* value) const { + return expr_evaluator_.isKnown(getAlias(value)); + } + + PolymorphicValue getKnownConcreteData(Val* val) const { + NVF_ERROR( + isKnown(val), + "value ", + val->toString(), + "must be precomputed before being retrieved"); + return expr_evaluator_.evaluate(getAlias(val)); + } + + at::Tensor getKnownTensorOrUndefined(Val* val) const { + return isKnown(val) + ? expr_evaluator_.evaluate(getAlias(val)).as() + : at::Tensor(); + } + + void bind(Val* value, PolymorphicValue concrete_value) { + expr_evaluator_.bind(getAlias(value), concrete_value); + } + + void invalidate(Val* value) { + expr_evaluator_.invalidate(getAlias(value)); + } std::unique_ptr container_; Communicator* communicator_; diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index da85466c8a0..ba296f6f357 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -456,6 +456,62 @@ TEST_P(HostIrTest, ForLoops) { EXPECT_TRUE(expected_result.equal(buffer_at)); } +TEST_P(HostIrTest, PreAllocatedOutputs) { + const std::vector input_sizes = {4, 8, 32}; + const std::vector output_sizes = { + input_sizes.at(1), input_sizes.at(2)}; + + auto get_fusion = [input_sizes]() -> std::unique_ptr { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + auto tv0 = makeConcreteTensor(input_sizes); + auto tv1 = add(tv0, tv0); + auto tv2 = sum(tv1, {0}); + fusion->addInput(tv0); + fusion->addOutput(tv2); + return fusion; + }; + + auto hic = 
std::make_unique(); + FusionGuard fg(hic.get()); + + auto host_unit = IrBuilder::create(get_fusion()); + + IrCloner ir_cloner(hic.get()); + std::vector post_on_stream_inputs = { + ir_cloner.clone(host_unit->fusion_to_execute()->inputs().at(0))}; + std::vector post_on_stream_outputs = { + ir_cloner.clone(host_unit->fusion_to_execute()->outputs().at(0))}; + + auto post_on_stream = IrBuilder::create( + host_unit, post_on_stream_inputs, post_on_stream_outputs); + + hic->pushBackTopLevelExprs(post_on_stream); + + hic->addInput(post_on_stream->inputs().at(0)); + hic->addInput(post_on_stream->outputs().at(0)); + + HostIrEvaluatorParams params; + auto [use_fusion_executor_cache] = GetParam(); + params.use_fusion_executor_cache = use_fusion_executor_cache; + HostIrEvaluator hie(std::move(hic), nullptr, params); + + // define concrete inputs and compute ref output for validation + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto input = at::randn(input_sizes, options); + auto output = at::empty(output_sizes, options); + auto ref_output = at::sum(input * 2, {0}); + + hie.runWithInput( + {{post_on_stream->inputs().at(0), input}, + {post_on_stream->outputs().at(0), output}}); + + // validate the obtained results + GTEST_EXPECT_TRUE(torch::allclose(ref_output, output)) + << "Output: " << output << " Expected: " << ref_output; +} + INSTANTIATE_TEST_SUITE_P( , HostIrTest, @@ -1095,7 +1151,12 @@ TEST_F(IfThenElseTest, HostIr) { hic->addOutput(output_buffer); hic->pushBackTopLevelExprs(if_then_else); - HostIrEvaluator hie(std::move(hic)); + // Need to use FusionExecutorCache, otherwise hitting error + // https://github.com/NVIDIA/Fuser/blob/4d032f74d2347fd68f5be607ef94956500eb917b/csrc/runtime/executor.cpp#L750 + HostIrEvaluator hie( + std::move(hic), + /*Communicator=*/nullptr, + {.use_fusion_executor_cache = true}); for (auto boolean : {true, false}) { auto options = @@ -1155,7 +1216,7 @@ TEST_F(AllocationTest, inHostForLoop) { TensorView* tv0 = 
makeConcreteTensor(sizes); tv0->setMemoryType(MemoryType::Global); auto* allocate = IrBuilder::create(tv0, MemoryType::Global); - TensorView* tv1 = abs(tv0); + TensorView* tv1 = set(tv0); for_loop->body().push_back(allocate); for_loop->body().push_back(tv1->definition()); @@ -1170,6 +1231,53 @@ TEST_F(AllocationTest, inHostForLoop) { EXPECT_EQ(sizes, outputs[0].as().sizes()); } +using HirAlias = NVFuserTest; + +TEST_F(HirAlias, SetAndGet) { + const std::vector sizes = {8, 64}; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + TensorView* tv0 = makeConcreteTensor(sizes); + TensorView* tv1 = set(tv0); + TensorView* tv2 = makeConcreteTensor(sizes); + hic->markAlias(tv1, tv2); + TensorView* tv3 = set(tv2); + TensorView* tv4 = makeConcreteTensor(sizes); + hic->markAlias(tv3, tv4); + hic->addInput(tv0); + hic->addOutput(tv4); + hic->pushBackTopLevelExprs(tv1->definition()); + hic->pushBackTopLevelExprs(tv3->definition()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor tv0_aten = at::randn(sizes, options); + + at::Tensor out_aten = hie.runWithInput({{tv0, tv0_aten}})[0].as(); + + at::Tensor expected_out = tv0_aten; + EXPECT_TRUE(out_aten.equal(expected_out)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + +TEST_F(HirAlias, ThrowOnInputAlias) { + const std::vector sizes = {8, 64}; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + TensorView* tv0 = makeConcreteTensor(sizes); + TensorView* tv1 = set(tv0); + hic->markAlias(tv1, tv0); + hic->addInput(tv0); + + EXPECT_ANY_THROW(HostIrEvaluator hie(std::move(hic))); +} + } // namespace hir } // namespace nvfuser From 9820d5aba5b81a02ee96dbcc5ba651837d6add8c Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 04:48:56 -0700 Subject: [PATCH 02/38] harden and simplify allocation in for loop test --- tests/cpp/test_host_irs.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 
deletions(-) diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index ba296f6f357..654e60c5d31 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -1216,13 +1216,11 @@ TEST_F(AllocationTest, inHostForLoop) { TensorView* tv0 = makeConcreteTensor(sizes); tv0->setMemoryType(MemoryType::Global); auto* allocate = IrBuilder::create(tv0, MemoryType::Global); - TensorView* tv1 = set(tv0); for_loop->body().push_back(allocate); - for_loop->body().push_back(tv1->definition()); hic->pushBackTopLevelExprs(for_loop); - hic->addOutput(tv1); + hic->addOutput(tv0); HostIrEvaluator hie(std::move(hic)); From 2ad510d47138e675e45ae9ff0af4c2726fea7c0a Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 05:16:37 -0700 Subject: [PATCH 03/38] refactor and clean host ir lowering and segmentation --- csrc/fusion_segmenter.cpp | 15 ++---- csrc/fusion_segmenter.h | 3 +- csrc/host_ir/container.h | 4 ++ csrc/host_ir/lower.cpp | 87 ++++++++++++++++++++++++----------- csrc/host_ir/lower.h | 7 +++ tests/cpp/test_resharding.cpp | 2 +- 6 files changed, 79 insertions(+), 39 deletions(-) diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp index d5368c41066..90e5ba20cf8 100644 --- a/csrc/fusion_segmenter.cpp +++ b/csrc/fusion_segmenter.cpp @@ -3919,15 +3919,8 @@ bool SegmentCandidateFinder::codeGenSupportedMerge( NVF_ERROR( areDirectlyConnected(group1, group2), "only support testing immediate producer-consumer groups"); - if (options_.only_segment_resharding_exprs) { - for (auto group : {group1, group2}) { - for (auto expr : group->exprs()) { - if (isResharding(expr)) { - return false; - } - } - } - return true; + if (options_.custom_should_merge_groups != nullptr) { + return (*options_.custom_should_merge_groups)(group1, group2); } return tryMerge(segmented_fusion_.get(), runtimeInfo(), group1, group2) != SchedulerType::None; @@ -3938,7 +3931,7 @@ bool SegmentCandidateFinder::codeGenSupportedMerge( SchedulerType 
SegmentCandidateFinder::deriveSchedulerType( SegmentedGroup* group) { FUSER_PERF_SCOPE("SegmentCandidateFinder::deriveSchedulerType"); - if (options_.only_segment_resharding_exprs) { + if (options_.custom_should_merge_groups != nullptr) { // We don't need to generate a SchedulerType for multidevice segments at // this moment return SchedulerType::None; @@ -3958,7 +3951,7 @@ SegmentCandidateFinder::SegmentCandidateFinder( : options_(options), runtime_inputs_(inputs) { FUSER_PERF_SCOPE("SegmentCandidateFinder::SegmentCandidateFinder"); NVF_ERROR( - !options_.only_segment_resharding_exprs || + options_.custom_should_merge_groups == nullptr || (!options_.run_translate_welford && !options_.run_combine_reductions && options_.run_herrmann_merge && options_.run_final_merge), diff --git a/csrc/fusion_segmenter.h b/csrc/fusion_segmenter.h index 1e32d42bb36..cc721d59301 100644 --- a/csrc/fusion_segmenter.h +++ b/csrc/fusion_segmenter.h @@ -501,7 +501,8 @@ struct SegmentCandidateFinderOptions { bool run_combine_reductions = true; bool run_herrmann_merge = true; bool run_final_merge = true; - bool only_segment_resharding_exprs = false; + bool (*custom_should_merge_groups)(SegmentedGroup*, SegmentedGroup*) = + nullptr; }; //! SegmentCandidateFinder diff --git a/csrc/host_ir/container.h b/csrc/host_ir/container.h index 7dcd66b4436..eb322275422 100644 --- a/csrc/host_ir/container.h +++ b/csrc/host_ir/container.h @@ -41,6 +41,10 @@ class HostIrContainer final : public Fusion { //! 
Print to an output stream std::ostream& print(std::ostream& os) const; + void resetTopLevelExprs(std::vector exprs) { + top_level_exprs_ = std::move(exprs); + } + const std::vector& topLevelExprs() const; void pushBackTopLevelExprs(Expr* expr); diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 32febda37a0..4838961ed85 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -6,8 +6,8 @@ */ // clang-format on #include -#include #include +#include #include #include #include @@ -592,6 +592,29 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( get_current_stream, allocate_tva_allgathered, allocate_tv_out, for_loop}; } +bool HostIrLower::isLoweredAsStandaloneHostOp(Expr* expr) { + return expr->isOneOf< + MatmulOp, + SliceOp, + SelectOp, + LinearOp, + Communication, + P2PCommunication>(); +} + +bool HostIrLower::ShouldMergeSegmentedGroups( + SegmentedGroup* group1, + SegmentedGroup* group2) { + for (auto group : {group1, group2}) { + for (auto expr : group->exprs()) { + if (isLoweredAsStandaloneHostOp(expr)) { + return false; + } + } + } + return true; +} + std::unique_ptr HostIrLower::lower( std::unique_ptr fusion, DeviceIdxType my_device_index) { @@ -615,7 +638,7 @@ std::unique_ptr HostIrLower::lower( .run_combine_reductions = false, .run_herrmann_merge = true, .run_final_merge = true, - .only_segment_resharding_exprs = true}; + .custom_should_merge_groups = &ShouldMergeSegmentedGroups}; std::unique_ptr staged_fusion = SegmentCandidateFinder::segment( std::move(fusion), KernelArgumentHolder(), options, true); @@ -643,32 +666,18 @@ std::unique_ptr HostIrLower::lower( if (involvedDevices(group->exprs().at(0)).count(my_device_index) == 0) { continue; } - const bool is_resharding = std::any_of( - group->exprs().begin(), group->exprs().end(), [](auto expr) { - return isResharding(expr); - }); - if (is_resharding) { + // we decide whether to insert the Expr as a standalone op in the + // HostIRContainer, which will result in 
using ATen Op to evaluate it -- + // or, alternatively, to wrap them into a PostOnStream(HostUnit(.)) which + // will result in a kernel code generation. + if (std::all_of( + group->exprs().begin(), + group->exprs().end(), + isLoweredAsStandaloneHostOp)) { NVF_ERROR( group->exprs().size() == 1, - "Communication segments must contain only one Expr"); - for (auto* expr : HostIrLower::lower( - ir_cloner.clone(group->exprs().at(0)), my_device_index)) { - // Allocate the recv buffers of communications - if (expr->isA()) { - auto* communication = expr->as(); - TensorView* tv = communication->out(); - if (tv->getDeviceMesh().has(my_device_index)) { - auto* allocate = - IrBuilder::create(tv, MemoryType::Global); - hic->pushBackTopLevelExprs(allocate); - } - } - hic->pushBackTopLevelExprs(expr); - if (expr->isA()) { - auto wait = IrBuilder::create(expr->as()); - hic->pushBackTopLevelExprs(wait); - } - } + "Expr executed as a standalone op cannot be fused"); + hic->pushBackTopLevelExprs(ir_cloner.clone(group->exprs().at(0))); } else { auto host_unit = IrBuilder::create( staged_fusion->makeFusion(group).second); @@ -684,6 +693,32 @@ std::unique_ptr HostIrLower::lower( hic->addOutput(ir_cloner.clone(output)); } + std::vector new_top_level_exprs; + for (auto top_level_expr : hic->topLevelExprs()) { + if (!isResharding(top_level_expr)) { + new_top_level_exprs.push_back(top_level_expr); + continue; + } + for (auto* expr : HostIrLower::lower(top_level_expr, my_device_index)) { + // Allocate the recv buffers of communications + if (expr->isA()) { + auto* communication = expr->as(); + TensorView* tv = communication->out(); + if (tv->getDeviceMesh().has(my_device_index)) { + auto* allocate = + IrBuilder::create(tv, MemoryType::Global); + new_top_level_exprs.push_back(allocate); + } + } + new_top_level_exprs.push_back(expr); + if (expr->isA()) { + auto wait = IrBuilder::create(expr->as()); + new_top_level_exprs.push_back(wait); + } + } + } + 
hic->resetTopLevelExprs(new_top_level_exprs); + return hic; } diff --git a/csrc/host_ir/lower.h b/csrc/host_ir/lower.h index 314c880c264..5ce2386a187 100644 --- a/csrc/host_ir/lower.h +++ b/csrc/host_ir/lower.h @@ -7,6 +7,7 @@ // clang-format on #pragma once +#include #include #include #include @@ -35,6 +36,12 @@ class HostIrLower { std::unique_ptr fusion, DeviceIdxType my_device_index); + static bool isLoweredAsStandaloneHostOp(Expr* expr); + + static bool ShouldMergeSegmentedGroups( + SegmentedGroup* group1, + SegmentedGroup* group2); + private: std::vector lowerToCollectiveBasedPipelinedGemmComm(Expr* expr); const HostIrLowerParams params_; diff --git a/tests/cpp/test_resharding.cpp b/tests/cpp/test_resharding.cpp index 3606582cdb1..69e6b307f42 100644 --- a/tests/cpp/test_resharding.cpp +++ b/tests/cpp/test_resharding.cpp @@ -53,7 +53,7 @@ class ReshardingTest : public NVFuserFixtureParamTest { .run_combine_reductions = false, .run_herrmann_merge = true, .run_final_merge = true, - .only_segment_resharding_exprs = true}; + .custom_should_merge_groups = &HostIrLower::ShouldMergeSegmentedGroups}; auto segmented_fusion = SegmentCandidateFinder::segment( std::move(fusion_), KernelArgumentHolder(), options, true); From 46c6717d5bea2a4cdf11bc418ac4e3dc201ac3a5 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 05:55:05 -0700 Subject: [PATCH 04/38] lint --- csrc/host_ir/lower.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 4838961ed85..9741aca1bf0 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -594,12 +594,12 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( bool HostIrLower::isLoweredAsStandaloneHostOp(Expr* expr) { return expr->isOneOf< - MatmulOp, - SliceOp, - SelectOp, - LinearOp, - Communication, - P2PCommunication>(); + MatmulOp, + SliceOp, + SelectOp, + LinearOp, + Communication, + P2PCommunication>(); } bool 
HostIrLower::ShouldMergeSegmentedGroups( From 73d5d7b4db958dc18698ae1edd577f63d12911e9 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 07:26:05 -0700 Subject: [PATCH 05/38] put back isResharding as the condition for lower to a standalone host expr --- csrc/host_ir/lower.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 9741aca1bf0..b2622cccd90 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -593,13 +593,7 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( } bool HostIrLower::isLoweredAsStandaloneHostOp(Expr* expr) { - return expr->isOneOf< - MatmulOp, - SliceOp, - SelectOp, - LinearOp, - Communication, - P2PCommunication>(); + return isResharding(expr); } bool HostIrLower::ShouldMergeSegmentedGroups( From e35ddd0302b26c05464de7efc36de92660d871f3 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 11 Apr 2025 05:01:53 -0700 Subject: [PATCH 06/38] minor comments --- csrc/fusion_segmenter.cpp | 2 +- csrc/fusion_segmenter.h | 4 ++-- csrc/host_ir/lower.cpp | 12 ++++++------ csrc/host_ir/lower.h | 4 ++-- tests/cpp/test_resharding.cpp | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp index 90e5ba20cf8..b522f6d11ef 100644 --- a/csrc/fusion_segmenter.cpp +++ b/csrc/fusion_segmenter.cpp @@ -3920,7 +3920,7 @@ bool SegmentCandidateFinder::codeGenSupportedMerge( areDirectlyConnected(group1, group2), "only support testing immediate producer-consumer groups"); if (options_.custom_should_merge_groups != nullptr) { - return (*options_.custom_should_merge_groups)(group1, group2); + return (options_.custom_should_merge_groups)(group1, group2); } return tryMerge(segmented_fusion_.get(), runtimeInfo(), group1, group2) != SchedulerType::None; diff --git a/csrc/fusion_segmenter.h b/csrc/fusion_segmenter.h index cc721d59301..6cca1f0c727 100644 --- a/csrc/fusion_segmenter.h +++ 
b/csrc/fusion_segmenter.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -501,8 +502,7 @@ struct SegmentCandidateFinderOptions { bool run_combine_reductions = true; bool run_herrmann_merge = true; bool run_final_merge = true; - bool (*custom_should_merge_groups)(SegmentedGroup*, SegmentedGroup*) = - nullptr; + std::function custom_should_merge_groups = nullptr; }; //! SegmentCandidateFinder diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index b2622cccd90..d647f0d442e 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -592,16 +592,16 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( get_current_stream, allocate_tva_allgathered, allocate_tv_out, for_loop}; } -bool HostIrLower::isLoweredAsStandaloneHostOp(Expr* expr) { +bool HostIrLower::isLowerableAsStandaloneHostOp(Expr* expr) { return isResharding(expr); } -bool HostIrLower::ShouldMergeSegmentedGroups( +bool HostIrLower::shouldMergeSegmentedGroups( SegmentedGroup* group1, SegmentedGroup* group2) { for (auto group : {group1, group2}) { - for (auto expr : group->exprs()) { - if (isLoweredAsStandaloneHostOp(expr)) { + for (Expr* expr : group->exprs()) { + if (isLowerableAsStandaloneHostOp(expr)) { return false; } } @@ -632,7 +632,7 @@ std::unique_ptr HostIrLower::lower( .run_combine_reductions = false, .run_herrmann_merge = true, .run_final_merge = true, - .custom_should_merge_groups = &ShouldMergeSegmentedGroups}; + .custom_should_merge_groups = &shouldMergeSegmentedGroups}; std::unique_ptr staged_fusion = SegmentCandidateFinder::segment( std::move(fusion), KernelArgumentHolder(), options, true); @@ -667,7 +667,7 @@ std::unique_ptr HostIrLower::lower( if (std::all_of( group->exprs().begin(), group->exprs().end(), - isLoweredAsStandaloneHostOp)) { + isLowerableAsStandaloneHostOp)) { NVF_ERROR( group->exprs().size() == 1, "Expr executed as a standalone op cannot be fused"); diff --git a/csrc/host_ir/lower.h b/csrc/host_ir/lower.h index 
5ce2386a187..5b1ecadece8 100644 --- a/csrc/host_ir/lower.h +++ b/csrc/host_ir/lower.h @@ -36,9 +36,9 @@ class HostIrLower { std::unique_ptr fusion, DeviceIdxType my_device_index); - static bool isLoweredAsStandaloneHostOp(Expr* expr); + static bool isLowerableAsStandaloneHostOp(Expr* expr); - static bool ShouldMergeSegmentedGroups( + static bool shouldMergeSegmentedGroups( SegmentedGroup* group1, SegmentedGroup* group2); diff --git a/tests/cpp/test_resharding.cpp b/tests/cpp/test_resharding.cpp index 69e6b307f42..5e4bd2b749a 100644 --- a/tests/cpp/test_resharding.cpp +++ b/tests/cpp/test_resharding.cpp @@ -53,7 +53,7 @@ class ReshardingTest : public NVFuserFixtureParamTest { .run_combine_reductions = false, .run_herrmann_merge = true, .run_final_merge = true, - .custom_should_merge_groups = &HostIrLower::ShouldMergeSegmentedGroups}; + .custom_should_merge_groups = &HostIrLower::shouldMergeSegmentedGroups}; auto segmented_fusion = SegmentCandidateFinder::segment( std::move(fusion_), KernelArgumentHolder(), options, true); From 4964680a32b58bb0818a784eff6e758625e49ace Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 11 Apr 2025 05:07:59 -0700 Subject: [PATCH 07/38] lint --- csrc/fusion_segmenter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/csrc/fusion_segmenter.h b/csrc/fusion_segmenter.h index 6cca1f0c727..a975716676d 100644 --- a/csrc/fusion_segmenter.h +++ b/csrc/fusion_segmenter.h @@ -502,7 +502,8 @@ struct SegmentCandidateFinderOptions { bool run_combine_reductions = true; bool run_herrmann_merge = true; bool run_final_merge = true; - std::function custom_should_merge_groups = nullptr; + std::function + custom_should_merge_groups = nullptr; }; //! 
SegmentCandidateFinder From ed8dc7c825a466f6cf9b3e9ebbe8fb8017eda182 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 05:52:08 -0700 Subject: [PATCH 08/38] add host ir support for set reduce and binary op --- csrc/host_ir/executor.cpp | 99 +++++++++++++ csrc/host_ir/executor.h | 3 + csrc/host_ir/lower.cpp | 13 +- tests/cpp/test_host_irs.cpp | 180 ++++++++++++++++++++++++ tests/cpp/test_multidevice_pipeline.cpp | 131 ----------------- 5 files changed, 293 insertions(+), 133 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 30bca9920db..4c0e7f441eb 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -570,6 +570,31 @@ void HostIrEvaluator::handle(LinearOp* linear) { } } +void HostIrEvaluator::handle(LoadStoreOp* load_store_op) { + NVF_ERROR( + load_store_op->out()->isA(), "out must be a TensorView"); + auto* out_tv = load_store_op->out()->as(); + auto in_tensor = getKnownConcreteData(load_store_op->in()).as(); + + // If output has root domain, compute and apply permutation + if (out_tv->hasRoot()) { + auto permutation = ir_utils::computePermutation( + out_tv->getRootDomain(), out_tv->getLogicalDomain()); + NVF_ERROR( + permutation.has_value(), + "The logical domain of a Set.Permute is supposed to be a permutation of the root domain: ", + out_tv->toString()); + in_tensor = in_tensor.permute(*permutation).contiguous(); + } + if (!isKnown(load_store_op->out())) { + bind(load_store_op->out(), in_tensor); + } else { + auto out_tensor = + getKnownConcreteData(load_store_op->out()).as(); + out_tensor.copy_(in_tensor); + } +} + void HostIrEvaluator::handle(kir::Allocate* allocate) { NVF_ERROR( allocate->buffer()->isA(), @@ -593,6 +618,80 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { bind(tv, tensor); } +void HostIrEvaluator::handle(BinaryOp* binary_op) { + if (!isKnown(binary_op->outputs().at(0))) { + return unhandled(binary_op); + } + + auto lhs = 
getKnownConcreteData(binary_op->inputs().at(0)).as(); + auto rhs = getKnownConcreteData(binary_op->inputs().at(1)).as(); + auto output = + getKnownConcreteData(binary_op->outputs().at(0)).as(); + + switch (binary_op->getBinaryOpType()) { + case BinaryOpType::Add: + at::add_out(output, lhs, rhs); + break; + case BinaryOpType::Sub: + at::sub_out(output, lhs, rhs); + break; + case BinaryOpType::Mul: + at::mul_out(output, lhs, rhs); + break; + case BinaryOpType::Div: + at::div_out(output, lhs, rhs); + break; + default: + NVF_CHECK( + false, + "Unexpected operator type: ", + binary_op->getBinaryOpType(), + " in ", + binary_op); + } +} + +void HostIrEvaluator::handle(ReductionOp* reduction_op) { + auto input_tv = reduction_op->in()->as(); + auto output_tv = reduction_op->out()->as(); + if (!isKnown(output_tv)) { + return unhandled(reduction_op); + } + + NVF_ERROR( + !output_tv->hasRoot(), + "Evaluation for rFactored reductions is not supported."); + auto input = getKnownConcreteData(input_tv).as(); + auto output = getKnownConcreteData(output_tv).as(); + + std::vector reduction_axes; + for (const auto i : + c10::irange(int64_t(output_tv->getLogicalDomain().size()))) { + auto ax = output_tv->getLogicalDomain().at(i); + if (ax->isReduction()) { + reduction_axes.push_back(i); + } + } + switch (reduction_op->getReductionOpType()) { + case BinaryOpType::Add: + at::sum_out(output, input, reduction_axes); + return; + case BinaryOpType::Max: + at::amax_out(output, input, reduction_axes); + return; + case BinaryOpType::Min: + at::amin_out(output, input, reduction_axes); + return; + default: + NVF_CHECK( + false, + "Unexpected operator type: ", + reduction_op->getReductionOpType(), + " in ", + reduction_op); + } +} + void HostIrEvaluator::unhandled(Statement* stmt) { NVF_ERROR(stmt->isA(), stmt, " must be an Expr"); auto* expr = stmt->as(); diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index 8f4d425fd09..faf301f0819 100644 --- a/csrc/host_ir/executor.h +++ 
b/csrc/host_ir/executor.h @@ -133,6 +133,9 @@ class HostIrEvaluator final : public OptOutDispatch { void handle(MatmulOp* matmul) override; void handle(LinearOp* linear) override; void handle(kir::Allocate* allocate) override; + void handle(LoadStoreOp* load_store_op) override; + void handle(BinaryOp* binary_op) override; + void handle(ReductionOp* reduction_op) override; void unhandled(Statement* stmt) override; c10::cuda::CUDAStream getCUDAStream(Stream* stream); diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index d647f0d442e..d8302a8a119 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -592,8 +592,17 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( get_current_stream, allocate_tva_allgathered, allocate_tv_out, for_loop}; } -bool HostIrLower::isLowerableAsStandaloneHostOp(Expr* expr) { - return isResharding(expr); +bool HostIrLower::isLoweredAsStandaloneHostOp(Expr* expr) { + return expr->isOneOf< + MatmulOp, + SliceOp, + SelectOp, + LinearOp, + LoadStoreOp, + BinaryOp, + ReductionOp, + Communication, + P2PCommunication>(); } bool HostIrLower::shouldMergeSegmentedGroups( diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index 654e60c5d31..751d5251aab 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -1276,6 +1276,186 @@ TEST_F(HirAlias, ThrowOnInputAlias) { EXPECT_ANY_THROW(HostIrEvaluator hie(std::move(hic))); } +using HirSetTest = NVFuserTest; + +TEST_F(HirSetTest, HostIr) { + const std::vector sizes = {8, 64}; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto* in = makeConcreteTensor(sizes); + auto* out = makeConcreteTensor(sizes); + auto* set = IrBuilder::create(LoadStoreOpType::Set, out, in); + hic->addInput(in); + hic->addInput(out); + hic->pushBackTopLevelExprs(set); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto in_aten = at::randn(sizes, options); + auto out_aten = 
at::empty(sizes, options); + + hie.runWithInput({{in, in_aten}, {out, out_aten}}); + + EXPECT_TRUE(out_aten.equal(in_aten)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << in_aten; +} + +class HirBinaryOpTest : public NVFuserFixtureParamTest { + protected: + at::Tensor executeBinaryOp(at::Tensor lhs, at::Tensor rhs) { + switch (GetParam()) { + case BinaryOpType::Add: + return lhs + rhs; + case BinaryOpType::Sub: + return lhs - rhs; + case BinaryOpType::Mul: + return lhs * rhs; + case BinaryOpType::Div: + return lhs / rhs; + default: + NVF_ERROR("Unsupported binary op type ", GetParam()); + return at::Tensor(); + } + } +}; + +TEST_P(HirBinaryOpTest, PreAllocatedOutputs) { + const std::vector sizes = {8, 64}; + const auto& binary_op_type = GetParam(); + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto* lhs = makeConcreteTensor(sizes); + auto* rhs = makeConcreteTensor(sizes); + auto* out = makeConcreteTensor(sizes); + auto* binary_op = IrBuilder::create(binary_op_type, out, lhs, rhs); + hic->addInput(lhs); + hic->addInput(rhs); + hic->addInput(out); + hic->pushBackTopLevelExprs(binary_op); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto lhs_aten = at::randn(sizes, options); + auto rhs_aten = at::randn(sizes, options); + auto out_aten = at::empty(sizes, options); + + hie.runWithInput({{lhs, lhs_aten}, {rhs, rhs_aten}, {out, out_aten}}); + + at::Tensor expected_out = executeBinaryOp(lhs_aten, rhs_aten); + EXPECT_TRUE(expected_out.equal(out_aten)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + +TEST_P(HirBinaryOpTest, NonPreAllocatedOutputs) { + const std::vector sizes = {8, 64}; + const auto& binary_op_type = GetParam(); + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto* lhs = makeConcreteTensor(sizes); + auto* rhs = makeConcreteTensor(sizes); + auto* out = binaryOp(binary_op_type, lhs, rhs); + 
hic->addInput(lhs); + hic->addInput(rhs); + hic->addOutput(out); + hic->pushBackTopLevelExprs(out->definition()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto lhs_aten = at::randn(sizes, options); + auto rhs_aten = at::randn(sizes, options); + + auto out_aten = + hie.runWithInput({{lhs, lhs_aten}, {rhs, rhs_aten}})[0].as(); + + at::Tensor expected_out = executeBinaryOp(lhs_aten, rhs_aten); + EXPECT_TRUE(expected_out.equal(out_aten)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + +INSTANTIATE_TEST_SUITE_P( + , + HirBinaryOpTest, + testing::Values( + BinaryOpType::Add, + BinaryOpType::Sub, + BinaryOpType::Mul, + BinaryOpType::Div), + [](const testing::TestParamInfo& info) -> std::string { + std::stringstream ss; + ss << "BinaryOpType_" << info.param; + return ss.str(); + }); + +using HirReductionOpTest = NVFuserTest; + +TEST_F(HirReductionOpTest, PreAllocatedOutputs) { + constexpr int64_t size0 = 8, size1 = 64; + constexpr int64_t reduction_axis = 1; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto* in = makeConcreteTensor({size0, size1}); + auto* out = newForReduction(in, {reduction_axis}, in->dtype()); + auto* reduction_op = IrBuilder::create( + BinaryOpType::Add, hic->zeroVal(), out, in); + hic->addInput(in); + hic->addOutput(out); + hic->pushBackTopLevelExprs(reduction_op); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto in_aten = at::randn({size0, size1}, options); + auto out_aten = at::empty({size0}, options); + + hie.runWithInput({{in, in_aten}, {out, out_aten}}); + + at::Tensor expected_out = in_aten.sum(reduction_axis); + EXPECT_TRUE(expected_out.equal(out_aten)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + +TEST_F(HirReductionOpTest, NonPreAllocatedOutputs) { + constexpr int64_t size0 = 8, size1 = 64; + constexpr int64_t 
reduction_axis = 1; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto* in = makeConcreteTensor({size0, size1}); + auto* out = sum(in, {reduction_axis}); + hic->addInput(in); + hic->addOutput(out); + hic->pushBackTopLevelExprs(out->definition()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto in_aten = at::randn({size0, size1}, options); + auto out_aten = at::empty({size0}, options); + + hie.runWithInput({{in, in_aten}, {out, out_aten}}); + + at::Tensor expected_out = in_aten.sum(reduction_axis); + EXPECT_TRUE(expected_out.equal(out_aten)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + } // namespace hir } // namespace nvfuser diff --git a/tests/cpp/test_multidevice_pipeline.cpp b/tests/cpp/test_multidevice_pipeline.cpp index 5985571c57a..12dfed5dd43 100644 --- a/tests/cpp/test_multidevice_pipeline.cpp +++ b/tests/cpp/test_multidevice_pipeline.cpp @@ -457,135 +457,4 @@ INSTANTIATE_TEST_SUITE_P( testing::Values(0, 1), testing::Values(true))); -// Different scheduling modes used in -// PipelineTestStagedReduction.StagedReduction -enum class SchedulingMode { - // Manual interdevice scheduling, no intra-device scheduling - InterDeviceOnly, - // Manual inter-/intra-device scheduling - Manual, - // Manual inter-device scheduling, composed with fully automated intra-device - // scheduling (through FusionExecutorCache) - Automatic, -}; - -std::ostream& operator<<(std::ostream& out, const SchedulingMode& mode) { - switch (mode) { - case SchedulingMode::InterDeviceOnly: - out << "InterDeviceOnly"; - break; - case SchedulingMode::Manual: - out << "Manual"; - break; - case SchedulingMode::Automatic: - out << "Automatic"; - break; - } - return out; -} - -class PipelineTestStagedReduction - : public PipelineTest, - public ::testing::WithParamInterface {}; - -// 1D staged reduction -// Inputs: X[num_devices,B,C] -TEST_P(PipelineTestStagedReduction, 
StagedReduction) { - auto scheduling_mode = GetParam(); - - const int num_devices = communicator_->size(); - constexpr int B = 8; - constexpr int C = 64; - - FusionGuard fg(fusion.get()); - // The first dimension is made symbolic so `tv_out->definition()` won't - // become a squeeze when num_devices == 1. This wouldn't be a problem for - // automatic mode. However, for the manual mode, the scheduling code below - // assumes `tv_out->definition()` can be lowered to communication. A squeeze - // can't. - TensorView* tv0 = TensorViewBuilder() - .dtype(DataType::Float) - .contiguity(true) - .shape({-1, B, C}) - .build(); - auto mesh = DeviceMesh::createForNumDevices(num_devices); - tv0->setDeviceMesh(mesh); - TensorView* tv1 = sum(tv0, {2}); - TensorView* tv_out = sum(tv1, {0}); - fusion->addInput(tv0); - fusion->addOutput(tv_out); - - for (auto* tv : {tv0, tv1}) { - tv->axis(0)->parallelize(ParallelType::DIDx); - } - - // Intra-device reduction scheduling for the first reduction: - switch (scheduling_mode) { - case SchedulingMode::InterDeviceOnly: - break; - case SchedulingMode::Manual: { - // inspired from NVFuserTest.FusionReduction1_CUDA - // tv0[I0{A}, I1{B}, I2{C}] - tv1->split(2, 32); - // tv1[I0{A}, I1{B}, R2o{C/32}, R2i{32}] = tv0[I0{A}, I1{B}, I2{C}] - tv1->split(2, 4); - // clang-format off - // tv1[I0{A}, I1{B}, R2oo{C/32/4)}, R2oi{4}, R2i{32}] = tv0[I0{A}, I1{B}, I2{C}] - // clang-format on - - TensorView* tv2 = tv1->rFactor({2}); - // clang-format off - // tv2[I0{A}, I1{B}, R2oo{C/32/4)}, I2oi{4}, I2i{32}] = tv0[I0{A}, I1{B}, I2{C}] - // tv1[I0{A}, I1{B}, R2oi{4}, R2i{32}] = tv2[I0{A}, I1{B}, R2oo{C/32/4)}, I2oi{4}, I2i{32}] - // clang-format on - - TensorView* tv3 = tv1->rFactor({2}); - // clang-format off - // tv2[I0{A}, I1{B}, R2oo{C/32/4)}, I2oi{4}, I2i{32}] = tv0[I0{A}, I1{B}, I2{C}] - // tv3[I0{A}, I1{B}, R2oi{4}, I2i{32}] = tv2[I0{A}, I1{B}, R2oo{C/32/4)}, I2oi{4}, I2i{32}] - // tv1[I0{A}, I1{B}, R2i{32}] = tv3[I0{A}, I1{B}, R2oi{4}, I2i{32}] - // 
clang-format on - - // tv1 is a segment boundary so must be in global. This wouldn't be - // needed if the fusion were scheduled automatically. - tv1->setMemoryType(MemoryType::Global); - - // Use `tv2` as the reference tensor because it contains the most - // parallel IterDomains. - tv2->axis(1)->parallelize(ParallelType::BIDx); - tv2->axis(3)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike( - tv2, - /*pos=*/-1, - // Don't propagate the parallelization to `tv_out` because that's in - // a different, resharding segment. - /*selected_tv=*/{tv0, tv1, tv2, tv3}); - inlineMost(); - break; - } - case SchedulingMode::Automatic: - host_ir_executor_params.use_fusion_executor_cache = true; - break; - } - - at::Tensor unsharded_input_tensor = - at::randn({num_devices, B, C}, tensor_options); - at::Tensor ref_unsharded_output_tensor = - unsharded_input_tensor.sum(at::IntArrayRef({0, 2})); - unsharded_args = {unsharded_input_tensor}; - ref_unsharded_outputs = {ref_unsharded_output_tensor}; - - executeAndValidate(/* validate_with_prescribed_values */ true); -} - -INSTANTIATE_TEST_SUITE_P( - , - PipelineTestStagedReduction, - testing::Values( - SchedulingMode::InterDeviceOnly, - SchedulingMode::Manual, - SchedulingMode::Automatic), - testing::PrintToStringParamName()); - } // namespace nvfuser From 85b7b751b82c31f5b019773d5adfdec21210ab8b Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 11 Apr 2025 06:26:51 -0700 Subject: [PATCH 09/38] move .contiguous to be in postScatter --- csrc/host_ir/executor.cpp | 2 +- csrc/multidevice/communication.cpp | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 4c0e7f441eb..ab3e08bfb6f 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -584,7 +584,7 @@ void HostIrEvaluator::handle(LoadStoreOp* load_store_op) { permutation.has_value(), "The logical domain of a 
Set.Permute is supposed to be a permutation of the root domain: ", out_tv->toString()); - in_tensor = in_tensor.permute(*permutation).contiguous(); + in_tensor = in_tensor.permute(*permutation); } if (!isKnown(load_store_op->out())) { bind(load_store_op->out(), in_tensor); diff --git a/csrc/multidevice/communication.cpp b/csrc/multidevice/communication.cpp index 13fc3ce36ea..463cffd2879 100644 --- a/csrc/multidevice/communication.cpp +++ b/csrc/multidevice/communication.cpp @@ -352,6 +352,11 @@ c10::intrusive_ptr postScatter( c10d::Backend* backend, at::Tensor input_tensor, at::Tensor output_tensor) { + + if (my_device_index == communication->root()) { + input_tensor = input_tensor.contiguous(); + } + if (my_device_index == communication->root() && !communication->out()->getDeviceMesh().has(communication->root())) { output_tensor = at::empty_like(input_tensor.slice(0, 0, 1)); From 01e94a7bf3dd6ab271338d23853cd37117ece3af Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 11 Apr 2025 06:57:46 -0700 Subject: [PATCH 10/38] lint and build issue --- csrc/host_ir/lower.cpp | 2 +- csrc/multidevice/communication.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index d8302a8a119..93badafceb4 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -592,7 +592,7 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( get_current_stream, allocate_tva_allgathered, allocate_tv_out, for_loop}; } -bool HostIrLower::isLoweredAsStandaloneHostOp(Expr* expr) { +bool HostIrLower::isLowerableAsStandaloneHostOp(Expr* expr) { return expr->isOneOf< MatmulOp, SliceOp, diff --git a/csrc/multidevice/communication.cpp b/csrc/multidevice/communication.cpp index 463cffd2879..b8bcf857fa2 100644 --- a/csrc/multidevice/communication.cpp +++ b/csrc/multidevice/communication.cpp @@ -352,7 +352,6 @@ c10::intrusive_ptr postScatter( c10d::Backend* backend, at::Tensor input_tensor, at::Tensor output_tensor) { 
- if (my_device_index == communication->root()) { input_tensor = input_tensor.contiguous(); } From e1db5183158a3662827a6cb70caae7e63c0e2191 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 14 Apr 2025 06:49:48 -0700 Subject: [PATCH 11/38] reviews --- csrc/host_ir/container.h | 2 +- csrc/host_ir/executor.h | 2 +- tests/cpp/test_host_irs.cpp | 41 ++++++++++++++++++++++++++++++++----- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/csrc/host_ir/container.h b/csrc/host_ir/container.h index 7dcd66b4436..07bd896790b 100644 --- a/csrc/host_ir/container.h +++ b/csrc/host_ir/container.h @@ -56,7 +56,7 @@ class HostIrContainer final : public Fusion { Stream* getDefaultStream(); void markAlias(TensorView* original, const TensorView* new_alias) { - if (alias_.count(original)) { + while (alias_.count(original)) { original = alias_[original]->as(); } alias_[new_alias] = original; diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index 8f4d425fd09..c1486285d19 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -140,7 +140,7 @@ class HostIrEvaluator final : public OptOutDispatch { Val* getAlias(Val* val) const { const auto& aliases = container_->alias(); auto it = aliases.find(val); - return it != aliases.end() ? it->second : val; + return it != aliases.end() ? 
getAlias(it->second) : val; } bool isKnown(Val* value) const { diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index 654e60c5d31..6a41e47c744 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -139,7 +139,7 @@ TEST_P(HostIrTest, SingleFusion) { auto outputs = hie.runWithInput({{post_on_stream->inputs().at(0), t0}}); // validate the obtained results - GTEST_EXPECT_TRUE(torch::allclose(ref_output, outputs[0].as())); + EXPECT_TRUE(torch::allclose(ref_output, outputs[0].as())); } /* @@ -236,7 +236,7 @@ TEST_P(HostIrTest, TwoFusions) { auto outputs = hie.runWithInput({{post_on_stream_0->inputs().at(0), t0}}); // validate the obtained results - GTEST_EXPECT_TRUE(torch::allclose(ref_output, outputs[0].as())); + EXPECT_TRUE(torch::allclose(ref_output, outputs[0].as())); } /* @@ -365,7 +365,7 @@ TEST_P(HostIrTest, ThreeFusions) { auto outputs = hie.runWithInput({{post_on_stream_0->inputs().at(0), t0_0}}); // validate the obtained results - GTEST_EXPECT_TRUE(torch::allclose(t2_2, outputs[0].as())); + EXPECT_TRUE(torch::allclose(t2_2, outputs[0].as())); } // This unit test the for-loop IR by implementing a program that could be @@ -508,7 +508,7 @@ TEST_P(HostIrTest, PreAllocatedOutputs) { {post_on_stream->outputs().at(0), output}}); // validate the obtained results - GTEST_EXPECT_TRUE(torch::allclose(ref_output, output)) + EXPECT_TRUE(torch::allclose(ref_output, output)) << "Output: " << output << " Expected: " << ref_output; } @@ -724,7 +724,7 @@ TEST_P(StreamHostIrTest, SingleFusionMultipleStreams) { // validate the obtained results for (int i = 0; i < n_iterations; i++) { - GTEST_EXPECT_TRUE(torch::allclose(ref_output, outputs[i].as())); + EXPECT_TRUE(torch::allclose(ref_output, outputs[i].as())); } EXPECT_NE( c10::cuda::getDefaultCUDAStream(0), c10::cuda::getCurrentCUDAStream(0)); @@ -1262,6 +1262,37 @@ TEST_F(HirAlias, SetAndGet) { << "Expected output: " << expected_out; } +TEST_F(HirAlias, SetAndGetReversedOrder) { + 
const std::vector sizes = {8, 64}; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + TensorView* tv0 = makeConcreteTensor(sizes); + TensorView* tv1 = set(tv0); + TensorView* tv2 = makeConcreteTensor(sizes); + TensorView* tv3 = set(tv2); + TensorView* tv4 = makeConcreteTensor(sizes); + hic->markAlias(tv3, tv4); + hic->markAlias(tv1, tv2); + hic->addInput(tv0); + hic->addOutput(tv4); + hic->pushBackTopLevelExprs(tv1->definition()); + hic->pushBackTopLevelExprs(tv3->definition()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor tv0_aten = at::randn(sizes, options); + + at::Tensor out_aten = hie.runWithInput({{tv0, tv0_aten}})[0].as(); + + at::Tensor expected_out = tv0_aten; + EXPECT_TRUE(out_aten.equal(expected_out)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + TEST_F(HirAlias, ThrowOnInputAlias) { const std::vector sizes = {8, 64}; From 59622ff48520269c02ed7a6893fd32c39a2a5848 Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 15 Apr 2025 09:33:13 -0700 Subject: [PATCH 12/38] add comment --- csrc/fusion_segmenter.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/csrc/fusion_segmenter.h b/csrc/fusion_segmenter.h index a975716676d..7141085dcdf 100644 --- a/csrc/fusion_segmenter.h +++ b/csrc/fusion_segmenter.h @@ -502,6 +502,10 @@ struct SegmentCandidateFinderOptions { bool run_combine_reductions = true; bool run_herrmann_merge = true; bool run_final_merge = true; + // if provided, this custom function will be used to determine if two groups + // should be merged. If not provided, the tryMerge function will be used. This + // option is used in the context of MultiGpus where we proceed to a first + // segmentation to scoop out communications from compute. 
std::function custom_should_merge_groups = nullptr; }; From 25c618c787ef37cf671cae3d652c9cc778443d52 Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 15 Apr 2025 11:04:14 -0700 Subject: [PATCH 13/38] add comment --- csrc/fusion_segmenter.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp index b522f6d11ef..c98786a086d 100644 --- a/csrc/fusion_segmenter.cpp +++ b/csrc/fusion_segmenter.cpp @@ -3919,6 +3919,10 @@ bool SegmentCandidateFinder::codeGenSupportedMerge( NVF_ERROR( areDirectlyConnected(group1, group2), "only support testing immediate producer-consumer groups"); + // The segmemter should ideally be redesigned to be more flexible and + // decoupled from the schedulers, but for now, we just return + // `SchedulerType::None` as it is not relevant when the segmenter is + // used with a custom should-merge function. if (options_.custom_should_merge_groups != nullptr) { return (options_.custom_should_merge_groups)(group1, group2); } From eb46aef5aa46340e883d858fb4e707f27e6d28d3 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 16 Apr 2025 02:22:24 -0700 Subject: [PATCH 14/38] minor comment --- csrc/host_ir/container.cpp | 2 +- csrc/host_ir/executor.cpp | 22 +++++++++++----------- csrc/host_ir/executor.h | 2 +- tests/cpp/test_host_irs.cpp | 1 + 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/csrc/host_ir/container.cpp b/csrc/host_ir/container.cpp index a133c0521ba..83e668770fc 100644 --- a/csrc/host_ir/container.cpp +++ b/csrc/host_ir/container.cpp @@ -26,7 +26,7 @@ HostIrContainer::HostIrContainer(int64_t num_kernel_executors) HostIrContainer::~HostIrContainer() = default; Stream* HostIrContainer::getDefaultStream() { - if (!default_stream_) { + if (default_stream_ == nullptr) { default_stream_ = IrBuilder::createInContainer(this); } return default_stream_; diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 551f35487c9..89710eaae4b 100644 --- 
a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -327,7 +327,7 @@ void HostIrEvaluator::handle(Synchronize* synchronize) { void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { KernelArgumentHolder args; for (auto& input : launch_kernel->inputs()) { - args.push(getKnownConcreteData(input)); + args.push(getKnownConcreteValue(input)); } args.setDeviceIndex(); @@ -349,7 +349,7 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { void HostIrEvaluator::handle(PostOnStream* post_ir) { KernelArgumentHolder input_args; for (auto& input : post_ir->inputs()) { - input_args.push(getKnownConcreteData(input)); + input_args.push(getKnownConcreteValue(input)); } input_args.setDeviceIndex(); // placeholder for storing the outputs @@ -368,7 +368,7 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) { post_ir); if (use_preallocated_outputs) { for (auto output : post_ir->outputs()) { - outputs.push(getKnownConcreteData(output)); + outputs.push(getKnownConcreteValue(output)); } } @@ -599,9 +599,9 @@ void HostIrEvaluator::handle(MatmulOp* matmul) { TensorView* out = matmul->out(); if (isKnown(out)) { - auto t_a = getKnownConcreteData(a).as(); - auto t_b = getKnownConcreteData(b).as(); - auto t_out = getKnownConcreteData(out).as(); + auto t_a = getKnownConcreteValue(a).as(); + auto t_b = getKnownConcreteValue(b).as(); + auto t_out = getKnownConcreteValue(out).as(); at::matmul_out(t_out, t_a, t_b); } else { unhandled(matmul); @@ -619,12 +619,12 @@ void HostIrEvaluator::handle(LinearOp* linear) { return; } - auto in_at = getKnownConcreteData(in).as(); - auto weight_at = getKnownConcreteData(weight).as(); - auto out_at = getKnownConcreteData(out).as(); + auto in_at = getKnownConcreteValue(in).as(); + auto weight_at = getKnownConcreteValue(weight).as(); + auto out_at = getKnownConcreteValue(out).as(); if (linear->has_bias()) { - auto bias_at = getKnownConcreteData(bias).as(); + auto bias_at = getKnownConcreteValue(bias).as(); at::linear_out(out_at, in_at, 
weight_at.squeeze(), bias_at.squeeze()); } else { at::linear_out(out_at, in_at, weight_at.squeeze()); @@ -661,7 +661,7 @@ void HostIrEvaluator::unhandled(Statement* stmt) { for (auto input : expr->inputs()) { if (input->isA()) { // Tensor inputs must be already computed at this point - inputs.push_back(getKnownConcreteData(input)); + inputs.push_back(getKnownConcreteValue(input)); } else { inputs.push_back(expr_evaluator_.evaluate(input)); } diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index b603359d5b3..d71b74e0dda 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -149,7 +149,7 @@ class HostIrEvaluator final : public OptOutDispatch { return expr_evaluator_.isKnown(getAlias(value)); } - PolymorphicValue getKnownConcreteData(Val* val) const { + PolymorphicValue getKnownConcreteValue(Val* val) const { NVF_ERROR( isKnown(val), "value ", diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index 6a41e47c744..f8660f3bb83 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -6,6 +6,7 @@ */ // clang-format on #include +#include #include #include From 5f161f5357d9994cd8e9122e06262b02a343103b Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 16 Apr 2025 02:44:03 -0700 Subject: [PATCH 15/38] lint --- tests/cpp/test_host_irs.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index f8660f3bb83..6a41e47c744 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -6,7 +6,6 @@ */ // clang-format on #include -#include #include #include From d2655842281d22f3fdc811f0074ea959f12f378f Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 16 Apr 2025 03:29:01 -0700 Subject: [PATCH 16/38] Revert "move .contiguous to be in postScatter" This reverts commit 85b7b751b82c31f5b019773d5adfdec21210ab8b. 
--- csrc/host_ir/executor.cpp | 2 +- csrc/multidevice/communication.cpp | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index e5dd2a86ba1..b973c7f3277 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -645,7 +645,7 @@ void HostIrEvaluator::handle(LoadStoreOp* load_store_op) { permutation.has_value(), "The logical domain of a Set.Permute is supposed to be a permutation of the root domain: ", out_tv->toString()); - in_tensor = in_tensor.permute(*permutation); + in_tensor = in_tensor.permute(*permutation).contiguous(); } if (!isKnown(load_store_op->out())) { bind(load_store_op->out(), in_tensor); diff --git a/csrc/multidevice/communication.cpp b/csrc/multidevice/communication.cpp index c88aa5944e7..041e13eb80c 100644 --- a/csrc/multidevice/communication.cpp +++ b/csrc/multidevice/communication.cpp @@ -378,10 +378,6 @@ c10::intrusive_ptr postScatter( c10d::Backend* backend, at::Tensor input_tensor, at::Tensor output_tensor) { - if (my_device_index == communication->root()) { - input_tensor = input_tensor.contiguous(); - } - if (my_device_index == communication->root() && !communication->out()->getDeviceMesh().has(communication->root())) { output_tensor = at::empty_like(input_tensor.slice(0, 0, 1)); From a177cb4a015c49e7dfe7d376f237ecd643bda422 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 06:08:43 -0700 Subject: [PATCH 17/38] add ParallelType::Stream lowering pass in host Ir for single device fusions --- CMakeLists.txt | 2 + csrc/host_ir/executor.h | 4 + csrc/host_ir/lower.cpp | 8 + csrc/ir/internal_nodes.h | 4 + csrc/multidevice/executor.h | 4 + csrc/ops/indexing.cpp | 10 +- csrc/ops/indexing.h | 6 +- csrc/ops/utils.cpp | 27 +- csrc/ops/utils.h | 14 +- csrc/preseg_passes/stream_parallel_type.cpp | 347 +++++++++ csrc/preseg_passes/stream_parallel_type.h | 26 + tests/cpp/test_host_ir_stream_lowering.cpp | 823 ++++++++++++++++++++ 
tests/cpp/test_multidevice_host_ir.cpp | 10 + 13 files changed, 1271 insertions(+), 14 deletions(-) create mode 100644 csrc/preseg_passes/stream_parallel_type.cpp create mode 100644 csrc/preseg_passes/stream_parallel_type.h create mode 100644 tests/cpp/test_host_ir_stream_lowering.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b9865da34a7..3f2750b59b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -212,6 +212,7 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/preseg_passes/remove_empty.cpp ${NVFUSER_SRCS_DIR}/preseg_passes/reorder_sharded_axis.cpp ${NVFUSER_SRCS_DIR}/preseg_passes/segment_inplace_update.cpp + ${NVFUSER_SRCS_DIR}/preseg_passes/stream_parallel_type.cpp ${NVFUSER_SRCS_DIR}/preseg_passes/translate_no_reduction_matmul_to_mul_squeeze.cpp ${NVFUSER_SRCS_DIR}/preseg_passes/translate_repeat_to_expand.cpp ${NVFUSER_SRCS_DIR}/rng.cpp @@ -731,6 +732,7 @@ if(BUILD_TEST) list(APPEND HOSTIR_TEST_SRCS ${NVFUSER_ROOT}/tests/cpp/test_host_irs.cpp ${NVFUSER_ROOT}/tests/cpp/test_host_ir_integration.cpp + ${NVFUSER_ROOT}/tests/cpp/test_host_ir_stream_lowering.cpp ) add_test(test_host_ir "${HOSTIR_TEST_SRCS}" "") list(APPEND TEST_BINARIES test_host_ir) diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index dfe84fba068..89ac5119681 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -97,6 +97,10 @@ class HostIrEvaluator final : public OptOutDispatch { return container_->outputs(); } + auto* container() const { + return container_.get(); + } + std::ostream& print(std::ostream& os) const { return container_->print(os); }; diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 308e1399872..1a74d9a9f01 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -718,6 +719,10 @@ std::unique_ptr HostIrLower::lower( hic->addOutput(ir_cloner.clone(output)); } + for (auto tv : hic->allTvs()) { + tv->setMemoryType(MemoryType::Global); + } + 
std::vector new_top_level_exprs; for (auto top_level_expr : hic->topLevelExprs()) { if (!isResharding(top_level_expr)) { @@ -744,6 +749,9 @@ std::unique_ptr HostIrLower::lower( } hic->resetTopLevelExprs(new_top_level_exprs); + preseg_passes::OptimizationPass::runPass( + hic.get()); + return hic; } diff --git a/csrc/ir/internal_nodes.h b/csrc/ir/internal_nodes.h index 91d3ca4ec39..1a2bb1634bb 100644 --- a/csrc/ir/internal_nodes.h +++ b/csrc/ir/internal_nodes.h @@ -2477,6 +2477,10 @@ class ForLoop final : public Expr { return input(0); } + IterDomain* iterDomain() const { + return input(1)->as(); + } + Val* indexOrStartIfTrivial() const { return isTrivial() ? start() : index(); } diff --git a/csrc/multidevice/executor.h b/csrc/multidevice/executor.h index c1cc3e31cfe..7dd08a87f0a 100644 --- a/csrc/multidevice/executor.h +++ b/csrc/multidevice/executor.h @@ -103,6 +103,10 @@ class MultiDeviceExecutor { return host_ir_executor_->getFusionExecutorCaches(); }; + auto* hostIrEvaluator() const { + return host_ir_executor_.get(); + } + private: // holds the Communicator to be used for execution Communicator& comm_; diff --git a/csrc/ops/indexing.cpp b/csrc/ops/indexing.cpp index 5ff75065ff2..80c0ff84b85 100644 --- a/csrc/ops/indexing.cpp +++ b/csrc/ops/indexing.cpp @@ -19,8 +19,14 @@ namespace nvfuser { -TensorView* select(TensorView* tv, int64_t dim, Val* index) { - auto dom = TensorDomain::noReductions(tv->getLogicalDomain()); +TensorView* select( + TensorView* tv, + int64_t dim, + Val* index, + bool keep_reduction_axis) { + auto dom = keep_reduction_axis + ? 
tv->getLogicalDomain() + : TensorDomain::noReductions(tv->getLogicalDomain()); NVF_CHECK(!dom.empty(), "select can not be applied to 0d tensor."); std::vector new_root; diff --git a/csrc/ops/indexing.h b/csrc/ops/indexing.h index c8152c33f82..7a219c534a3 100644 --- a/csrc/ops/indexing.h +++ b/csrc/ops/indexing.h @@ -15,7 +15,11 @@ namespace nvfuser { -NVF_API TensorView* select(TensorView* tv, int64_t dim, Val* index); +NVF_API TensorView* select( + TensorView* tv, + int64_t dim, + Val* index, + bool keep_reduction_axis = false); // torch.index_select NVF_API TensorView* indexSelect( diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp index 8d3870d1a84..5d32c22e212 100644 --- a/csrc/ops/utils.cpp +++ b/csrc/ops/utils.cpp @@ -432,7 +432,9 @@ IterDomain* newOutputIterDomain( #pragma GCC diagnostic pop #endif -std::vector newOutputDomain(const std::vector& vals) { +std::vector newOutputDomain( + const std::vector& vals, + bool keep_reduction_axis) { std::vector tvs; for (auto val : vals) { if (auto* tv = dynamic_cast(val)) { @@ -443,14 +445,20 @@ std::vector newOutputDomain(const std::vector& vals) { !tvs.empty(), "Tried to create new output TensorView but received empty list."); - std::vector out_domain( - TensorDomain::noReductions(tvs[0]->getLogicalDomain()).size(), nullptr); + auto getLogicalDomain = + [keep_reduction_axis](TensorView* tv) -> std::vector { + return keep_reduction_axis + ? 
tv->getLogicalDomain() + : TensorDomain::noReductions(tv->getLogicalDomain()); + }; + + std::vector out_domain(getLogicalDomain(tvs[0]).size(), nullptr); for (const auto dim_i : arange(out_domain.size())) { std::vector input_ids; input_ids.reserve(tvs.size()); for (auto* tv : tvs) { - auto dom = TensorDomain::noReductions(tv->getLogicalDomain()); + auto dom = getLogicalDomain(tv); input_ids.emplace_back(dom[dim_i]); } out_domain[dim_i] = newOutputIterDomain(input_ids); @@ -458,8 +466,11 @@ std::vector newOutputDomain(const std::vector& vals) { return out_domain; } -TensorView* newOutputTV(const std::vector& vals, DataType dtype) { - auto out_domain = newOutputDomain(vals); +TensorView* newOutputTV( + const std::vector& vals, + DataType dtype, + bool keep_reduction_axis) { + auto out_domain = newOutputDomain(vals, keep_reduction_axis); auto* new_out = IrBuilder::create( IrBuilder::create( out_domain, TensorDomain::getContiguityFilledWith(out_domain, true)), @@ -502,12 +513,12 @@ std::vector maybeBroadcast(const std::vector& vals) { return out_vals; } -Val* newValLike(Val* val, DataType dtype) { +Val* newValLike(Val* val, DataType dtype, bool keep_reduction_axis) { NVF_CHECK( dtype != DataType::Null, "Invalid datatype provided for new value."); if (val->isA()) { - return newOutputTV({val}, dtype); + return newOutputTV({val}, dtype, keep_reduction_axis); } return newScalar(ValType::Others, dtype); diff --git a/csrc/ops/utils.h b/csrc/ops/utils.h index 94d6391cf45..1a2abda03fc 100644 --- a/csrc/ops/utils.h +++ b/csrc/ops/utils.h @@ -99,13 +99,21 @@ IterDomain* newOutputIterDomain( // output tensorview, e.g., for BinaryOp. `vals` can contain scalars, e.g, when // creating the output TensorView for `tv0+scalar`. This is for convenience and // scalars will be ignored. 
-std::vector newOutputDomain(const std::vector& vals); +std::vector newOutputDomain( + const std::vector& vals, + bool keep_reduction_axis = false); -TensorView* newOutputTV(const std::vector& vals, DataType dtype); +TensorView* newOutputTV( + const std::vector& vals, + DataType dtype, + bool keep_reduction_axis = false); std::vector maybeBroadcast(const std::vector& vals); -NVF_API Val* newValLike(Val* val, DataType dtype); +NVF_API Val* newValLike( + Val* val, + DataType dtype, + bool keep_reduction_axis = false); // returns the minimum init value for reduction: // -inf for floating type; diff --git a/csrc/preseg_passes/stream_parallel_type.cpp b/csrc/preseg_passes/stream_parallel_type.cpp new file mode 100644 index 00000000000..5a814c9a59a --- /dev/null +++ b/csrc/preseg_passes/stream_parallel_type.cpp @@ -0,0 +1,347 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nvfuser::preseg_passes { + +// returns the first stream axis in the domain, or nullptr if there is none. +// Throws if two axis are stream parallelized +IterDomain* getStreamAxis(const std::vector& domain) { + IterDomain* ret = nullptr; + for (auto id : domain) { + if (id->getParallelType() == ParallelType::Stream) { + NVF_CHECK( + ret == nullptr, + "Expected at most one stream axis in the domain, but found ", + id, + " and ", + ret); + ret = id; + } + } + return ret; +} + + + +// TODO: ideally we should look at the dag and use the segmenter. 
Here we take +// advantage of the linear structure of HostIrContainer::topLevelExprs to +// greedily merge the adjacent compatible stream for-loop bodies +void StreamParallelType::runPass(Fusion* fusion) { + // check that there are no stream axes in the inputs + NVF_CHECK( + std::all_of( + fusion->inputs().begin(), + fusion->inputs().end(), + [](Val* input) { + auto input_tv = dynamic_cast(input); + return input_tv == nullptr || + getStreamAxis(input_tv->getLoopDomain()) == nullptr; + }), + "Expected no stream axis in the TensorView inputs."); + + FusionGuard fg(fusion); // set as current container to register the newly + // created for-loops + hir::HostIrContainer* hic = dynamic_cast(fusion); + NVF_CHECK(hic, "Expected HostIrContainer"); + // needed ? + IdModel id_model(fusion); + id_model.buildAlmostExactGraph(); + + std::vector new_top_level_exprs; + // Step 1. Find the segments of expressions that can be merged into a single + // stream for-loop At the end of this step, new_top_level_exprs contains a + // list of expressions including newly created for-loops that will represent + // the stream parallelization, and the relevant expressions grouped inside the + // for-loops bodies. 
+ for (auto expr : hic->topLevelExprs()) { + // we only support exprs having at most 1 output for now + if (expr->outputs().size() == 0) { + new_top_level_exprs.push_back(expr); + continue; + } + NVF_CHECK( + expr->outputs().size() == 1, + "Each expr should have at most one output."); + TensorView* output = expr->output(0)->as(); + // retrieves the Loop IterDomain that is stream parallelized, if any + IterDomain* stream_axis = getStreamAxis(output->getLoopDomain()); + if (stream_axis == nullptr) { + // if the consumer is not stream parallelized, it means the expr need not + // be inside a stream for-loop + new_top_level_exprs.push_back(expr); + continue; + } + NVF_ERROR( + HostIrLower::isLoweredAsStandaloneHostOp(expr), + "Stream parallel type not supported for expr ", + expr); + // find the corresponding stream axis but in the Logical (and not Loop + // Domain) + auto it_logical_stream_axis = std::find( + output->getLogicalDomain().begin(), + output->getLogicalDomain().end(), + stream_axis); + // for now we do not support split/merge stream axis + NVF_ERROR( + it_logical_stream_axis != output->getLogicalDomain().end(), + "Cannot stream parallelize on a split/merge axis ", + stream_axis); + // we don't support reducing or broadcasting a stream axis + NVF_CHECK( + stream_axis->getIterType() == IterType::Iteration, + "Stream axis ", + stream_axis, + " should be an iteration axis."); + // check if the current expr can be merged with the previous stream for-loop + // We consider the previous expression to check whether the expr should + // create a new stream for-loop or be integrated into the previous one + if (!new_top_level_exprs.empty() && + new_top_level_exprs.back()->isA() && + id_model.idGraph(IdMappingMode::ALMOSTEXACT) + .disjointValSets() + .strictAreMapped( + stream_axis, + new_top_level_exprs.back()->as()->iterDomain())) { + // merge with previous for-loop + new_top_level_exprs.back()->as()->body().push_back(expr); + } else { + // create a new for-loop + 
auto* j = IrBuilder::create( + DataType::Index); // running index of the for-loop + auto* start = hic->zeroVal(); + auto* stop = stream_axis->extent(); + auto* step = hic->oneVal(); + auto* for_loop = IrBuilder::create( + stream_axis, + /*index=*/j, + start, + stop, + step, + /*vectorize=*/false, + /*vectorize_shift=*/nullptr, + /*unroll_required=*/false, + CircularBufferLoopStage::NotApplicable, + /*circular_buffer_loop_stage_depth=*/0); + for_loop->body().push_back(expr); + // replace the current expr by the for-loop containing it + new_top_level_exprs.push_back(for_loop); + } + } + + // Step 2. Setup each for loop's body by Slicing the tensors. + std::vector top_level_exprs = std::move(new_top_level_exprs); + new_top_level_exprs.clear(); + for (auto top_level_expr : top_level_exprs) { + // TODO: change in place? consr issue + if (!top_level_expr->isA()) { + new_top_level_exprs.push_back(top_level_expr); + continue; + } + auto* for_loop = top_level_expr->as(); + // this will contain the new body of the current for-loop + std::vector new_loop_body; + + std::vector current_loop_body = for_loop->body().exprs(); + for (auto it_expr = current_loop_body.begin(); + it_expr != current_loop_body.end(); + ++it_expr) { + Expr* expr = *it_expr; + for (auto* input : ir_utils::filterByType(expr->inputs())) { + int64_t input_stream_id_logical_index = -1; + for (auto id : input->getLoopDomain()) { + if (id_model.idGraph(IdMappingMode::ALMOSTEXACT) + .disjointValSets() + .strictAreMapped(for_loop->iterDomain(), id)) { + NVF_CHECK( + input_stream_id_logical_index == -1, + "Expected at most one axis mapping to the stream axis ", + for_loop->iterDomain(), + " in the tensor ", + input, + " loop's domain ", + input->getLoopDomain()); + auto it_input_stream_id_logical = std::find( + input->getLogicalDomain().begin(), + input->getLogicalDomain().end(), + id); + NVF_CHECK( + it_input_stream_id_logical != input->getLogicalDomain().end(), + "Expected to find ", + id, + " in ", + input, + 
"'s logical domain ", + input->getLogicalDomain()); + input_stream_id_logical_index = std::distance( + input->getLogicalDomain().begin(), it_input_stream_id_logical); + } + } + if (input_stream_id_logical_index == -1) { + continue; + } + TensorView* input_j = select( + input, + input_stream_id_logical_index, + for_loop->index(), + /*keep_reduction_axis=*/true); + new_loop_body.push_back(input_j->definition()); + for (auto it_running_expr = current_loop_body.begin(); + it_running_expr != current_loop_body.end(); + ++it_running_expr) { + Expr* running_expr = *it_running_expr; + for (auto* running_input : + ir_utils::filterByType(running_expr->inputs())) { + if (running_input == input) { + *it_running_expr = ir_utils::replaceValInExprInputs( + running_expr, input, input_j); + } + } + } + } + + for (auto* output : ir_utils::filterByType(expr->outputs())) { + int64_t output_stream_id_logical_index = -1; + for (auto id : output->getLoopDomain()) { + if (id_model.idGraph(IdMappingMode::ALMOSTEXACT) + .disjointValSets() + .strictAreMapped(for_loop->iterDomain(), id)) { + NVF_CHECK( + output_stream_id_logical_index == -1, + "Expected at most one axis mapping to the stream axis ", + for_loop->iterDomain(), + " in the tensor ", + output, + " loop's domain ", + output->getLoopDomain()); + auto it_output_stream_id_logical = std::find( + output->getLogicalDomain().begin(), + output->getLogicalDomain().end(), + id); + NVF_CHECK( + it_output_stream_id_logical != output->getLogicalDomain().end(), + "Expected to find ", + id, + " in ", + output, + "'s logical domain ", + output->getLogicalDomain()); + output_stream_id_logical_index = std::distance( + output->getLogicalDomain().begin(), + it_output_stream_id_logical); + } + } + if (output_stream_id_logical_index == -1) { + continue; + } + TensorView* output_j = select( + output, + output_stream_id_logical_index, + for_loop->index(), + /*keep_reduction_axis=*/true); + new_top_level_exprs.push_back( + IrBuilder::create(output, 
MemoryType::Global)); + new_loop_body.push_back(output_j->definition()); + for (auto it_running_expr = current_loop_body.begin(); + it_running_expr != current_loop_body.end(); + ++it_running_expr) { + Expr* running_expr = *it_running_expr; + for (auto* running_output : + ir_utils::filterByType(running_expr->outputs())) { + if (running_output == output) { + TensorView* output_j_alias = + ops::newValLike( + output_j, output_j->dtype(), /*keep_reduction_axis=*/true) + ->as(); + hic->markAlias(output_j, output_j_alias); + *it_running_expr = ir_utils::transferDefinitionToNewOutputs( + running_expr, {output_j_alias}); + if (Communication* comm = dynamic_cast( + output_j_alias->definition()); + comm && comm->type() == CommunicationType::Allgather) { + std::cout << "HERE, with expr:" << *it_running_expr + << std::endl; + } + } + } + } + } + new_loop_body.push_back(*it_expr); + } + // reseting the for-loop body + for_loop->body().clear(); + for (auto* expr : new_loop_body) { + for_loop->body().push_back(expr); + } + new_top_level_exprs.push_back(top_level_expr); + } + + // Step 3. 
Finalize the for-loop bodies by adding the stream setup and + // synchronization + for (auto* top_level_expr : new_top_level_exprs) { + if (!top_level_expr->isA()) { + continue; + } + auto* for_loop = top_level_expr->as(); + std::vector new_loop_body; + + // Get the current stream to later synchronize subsequent new streams + auto* get_current_stream = IrBuilder::create(); + hir::Stream* original_stream = get_current_stream->stream(); + new_loop_body.push_back(get_current_stream); + + // set the stream to the one corresponding to the current for-loop index + auto* j = for_loop->index(); + auto* number_of_streams = + IrBuilder::create("numberOfStreams", DataType::Int); + auto* stream_index = mod(j, number_of_streams); + auto* stream = IrBuilder::create(stream_index); + auto* set_stream = IrBuilder::create(stream); + new_loop_body.push_back(set_stream); + + // sync the new stream with the original stream + auto* initial_sync_stream = + IrBuilder::create(original_stream); + new_loop_body.push_back(initial_sync_stream); + + // add the actual exprs to the for-loop body + for (auto* expr : for_loop->body().exprs()) { + new_loop_body.push_back(expr); + } + + // set back the original stream + auto* set_back_original_stream = + IrBuilder::create(original_stream); + new_loop_body.push_back(set_back_original_stream); + // synchronize original stream with the for-loop's streams + auto* sync_stream = IrBuilder::create(stream); + new_loop_body.push_back(sync_stream); + + // reset the for-loop's body to the one we constructed. 
+ for_loop->body().clear(); + for (auto* expr : new_loop_body) { + for_loop->body().push_back(expr); + } + } + + // reset hic topLevelExprs to new_top_level_exprs + hic->resetTopLevelExprs(new_top_level_exprs); +} + +} // namespace nvfuser::preseg_passes diff --git a/csrc/preseg_passes/stream_parallel_type.h b/csrc/preseg_passes/stream_parallel_type.h new file mode 100644 index 00000000000..a9600809e21 --- /dev/null +++ b/csrc/preseg_passes/stream_parallel_type.h @@ -0,0 +1,26 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once + +#include +#include + +namespace nvfuser::preseg_passes { + +// A pass used in HostIrLower that takes a HostIrContainer as input, reads the TensorView's ParallelType::Stream, and modifies the HostIrContainer's top level expressions with the corresponding Host For Loops, whose bodies contain stream assignment, selecting on the tensor's axis, and the exprs on those sliced tensors. After this pass, the ParallelType::Stream is removed from the TensorView's axis. +class StreamParallelType : public OptimizationPass { + friend class OptimizationPass; + + protected: + static void runPass(Fusion* fusion); + static constexpr std::string_view name() { + return "StreamParallelType"; + } +}; + +} // namespace nvfuser::preseg_passes diff --git a/tests/cpp/test_host_ir_stream_lowering.cpp b/tests/cpp/test_host_ir_stream_lowering.cpp new file mode 100644 index 00000000000..9f3d9f432ed --- /dev/null +++ b/tests/cpp/test_host_ir_stream_lowering.cpp @@ -0,0 +1,823 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace nvfuser { + +namespace hir { + +using HirLowerStreamTest = NVFuserTest; + +TEST_F(HirLowerStreamTest, InputsAreNotStreamParallelized) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv = makeContigTensor(2); + hic->addInput(tv); + tv->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW(preseg_passes::OptimizationPass< + preseg_passes::StreamParallelType>::runPass(hic.get())); +} + +TEST_F(HirLowerStreamTest, Split) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + hic->addInput(tv0); + hic->addOutput(tv1); + hic->pushBackTopLevelExprs(tv1->definition()); + tv1->split(0, 2); + tv1->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW(preseg_passes::OptimizationPass< + preseg_passes::StreamParallelType>::runPass(hic.get())); +} + +TEST_F(HirLowerStreamTest, Merge) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + hic->addInput(tv0); + hic->addOutput(tv1); + hic->pushBackTopLevelExprs(tv1->definition()); + tv1->merge(0, 1); + tv1->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW(preseg_passes::OptimizationPass< + preseg_passes::StreamParallelType>::runPass(hic.get())); +} + +TEST_F(HirLowerStreamTest, SingleSetOp) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + hic->addInput(tv0); + hic->addOutput(tv1); + hic->pushBackTopLevelExprs(tv1->definition()); + tv0->setMemoryType(MemoryType::Global); + tv1->setMemoryType(MemoryType::Global); + tv1->axis(0)->parallelize(ParallelType::Stream); + + 
preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 2); + EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8}, options); + auto output = hie.runWithInput({{tv0, input}})[0].as(); + + torch::cuda::synchronize(); + EXPECT_TRUE(output.equal(input)) + << "Output: " << output << " Expected: " << input; +} + +TEST_F(HirLowerStreamTest, SingleSetOpNonOutermost) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + hic->addInput(tv0); + hic->addOutput(tv1); + hic->pushBackTopLevelExprs(tv1->definition()); + tv0->setMemoryType(MemoryType::Global); + tv1->setMemoryType(MemoryType::Global); + tv1->axis(1)->parallelize(ParallelType::Stream); + + preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 2); + EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8}, options); + auto output = hie.runWithInput({{tv0, input}})[0].as(); + + torch::cuda::synchronize(); + EXPECT_TRUE(output.equal(input)) + << "Output: " << output << " Expected: " << input; +} + +TEST_F(HirLowerStreamTest, SingleBinaryOp) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = makeContigTensor(2); + TensorView* tv2 = add(tv0, tv1); + hic->addInput(tv0); + hic->addInput(tv1); + hic->addOutput(tv2); + hic->pushBackTopLevelExprs(tv2->definition()); + tv0->setMemoryType(MemoryType::Global); + tv1->setMemoryType(MemoryType::Global); + tv2->setMemoryType(MemoryType::Global); + 
tv2->axis(0)->parallelize(ParallelType::Stream); + + preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 2); + EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor tv0_input = at::rand({4, 4}, options); + at::Tensor tv1_input = at::rand({4, 4}, options); + // std::unordered_map inputs = {{tv0, input}}; + auto output = hie.runWithInput({{tv0, tv0_input}, {tv1, tv1_input}})[0] + .as(); + auto expected_output = tv0_input + tv1_input; + EXPECT_TRUE(output.equal(expected_output)) + << "Output: " << output << "Expected: " << expected_output; +} + +TEST_F(HirLowerStreamTest, TwoSetOps) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + TensorView* tv2 = set(tv1); + hic->addInput(tv0); + hic->addOutput(tv2); + hic->pushBackTopLevelExprs(tv1->definition()); + hic->pushBackTopLevelExprs(tv2->definition()); + tv0->setMemoryType(MemoryType::Global); + tv1->setMemoryType(MemoryType::Global); + tv2->setMemoryType(MemoryType::Global); + tv1->axis(0)->parallelize(ParallelType::Stream); + tv2->axis(0)->parallelize(ParallelType::Stream); + + preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 3); + EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(2)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8}, options); + auto output = hie.runWithInput({{tv0, input}})[0].as(); + + torch::cuda::synchronize(); + EXPECT_TRUE(output.equal(input)) + << "Output: " << output << " Expected: " << input; +} + +TEST_F(HirLowerStreamTest, ThreeSetOpsWithDisjointsForLoops) { + auto hic = 
std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + TensorView* tv2 = set(tv1); + TensorView* tv3 = set(tv2); + hic->addInput(tv0); + hic->addOutput(tv3); + hic->pushBackTopLevelExprs(tv1->definition()); + hic->pushBackTopLevelExprs(tv2->definition()); + hic->pushBackTopLevelExprs(tv3->definition()); + tv0->setMemoryType(MemoryType::Global); + tv1->setMemoryType(MemoryType::Global); + tv2->setMemoryType(MemoryType::Global); + tv3->setMemoryType(MemoryType::Global); + tv1->axis(0)->parallelize(ParallelType::Stream); + tv3->axis(0)->parallelize(ParallelType::Stream); + + preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 5); + EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(2)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(3)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(4)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8}, options); + auto output = hie.runWithInput({{tv0, input}})[0].as(); + + torch::cuda::synchronize(); + EXPECT_TRUE(output.equal(input)) + << "Output: " << output << " Expected: " << input; +} + +TEST_F(HirLowerStreamTest, ReductionUnsupported) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = sum(tv0, {0}); + hic->addInput(tv0); + hic->addOutput(tv1); + hic->pushBackTopLevelExprs(tv1->definition()); + tv0->setMemoryType(MemoryType::Global); + tv1->setMemoryType(MemoryType::Global); + tv1->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW(preseg_passes::OptimizationPass< + preseg_passes::StreamParallelType>::runPass(hic.get())); +} + +TEST_F(HirLowerStreamTest, Reduction) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = 
makeContigTensor(3); + TensorView* tv1 = sum(tv0, {2}); + hic->addInput(tv0); + hic->addOutput(tv1); + hic->pushBackTopLevelExprs(tv1->definition()); + tv0->setMemoryType(MemoryType::Global); + tv1->setMemoryType(MemoryType::Global); + tv1->axis(0)->parallelize(ParallelType::Stream); + + preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 2); + EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8, 2}, options); + auto output = hie.runWithInput({{tv0, input}})[0].as(); + + torch::cuda::synchronize(); + auto expected_output = input.sum(2); + EXPECT_TRUE(output.equal(expected_output)) + << "Output: " << output << " Expected: " << expected_output; +} + +TEST_F(HirLowerStreamTest, Matmul_M) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* a = makeContigTensor(2); + TensorView* b = makeContigTensor(2); + TensorView* c = matmul(a, b); + hic->addInput(a); + hic->addInput(b); + hic->addOutput(c); + hic->pushBackTopLevelExprs(c->definition()); + a->setMemoryType(MemoryType::Global); + b->setMemoryType(MemoryType::Global); + c->setMemoryType(MemoryType::Global); + c->axis(0)->parallelize(ParallelType::Stream); + + preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 2); + EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + constexpr int64_t M = 8, K = 4, N = 2; + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor a_aten = at::rand({M, K}, options); + at::Tensor b_aten = at::rand({K, N}, options); + auto output = + hie.runWithInput({{a, a_aten}, {b, b_aten}})[0].as(); + + torch::cuda::synchronize(); + auto expected_output = at::matmul(a_aten, b_aten); + 
EXPECT_TRUE(torch::allclose(output, expected_output, 1e-2, 1e-2)) + << "Output: " << output << " Expected: " << expected_output; +} + +TEST_F(HirLowerStreamTest, BatchedMatmul) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* a = makeContigTensor(3); + TensorView* b = makeContigTensor(2); + TensorView* c = matmul(a, b); + hic->addInput(a); + hic->addInput(b); + hic->addOutput(c); + hic->pushBackTopLevelExprs(c->definition()); + a->setMemoryType(MemoryType::Global); + b->setMemoryType(MemoryType::Global); + c->setMemoryType(MemoryType::Global); + c->axis(0)->parallelize(ParallelType::Stream); + + preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 2); + EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + constexpr int64_t B = 16, M = 8, K = 4, N = 2; + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor a_aten = at::rand({B, M, K}, options); + at::Tensor b_aten = at::rand({K, N}, options); + auto output = + hie.runWithInput({{a, a_aten}, {b, b_aten}})[0].as(); + + torch::cuda::synchronize(); + auto expected_output = at::matmul(a_aten, b_aten); + EXPECT_TRUE(torch::allclose(output, expected_output, 1e-2, 1e-2)) + << "Output: " << output << " Expected: " << expected_output; +} + +TEST_F(HirLowerStreamTest, Matmul_N) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* a = makeContigTensor(2); + TensorView* b = makeContigTensor(2); + TensorView* c = matmul(a, b); + hic->addInput(a); + hic->addInput(b); + hic->addOutput(c); + hic->pushBackTopLevelExprs(c->definition()); + a->setMemoryType(MemoryType::Global); + b->setMemoryType(MemoryType::Global); + c->setMemoryType(MemoryType::Global); + c->axis(1)->parallelize(ParallelType::Stream); + + preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 2); + 
EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + constexpr int64_t M = 8, K = 4, N = 2; + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor a_aten = at::rand({M, K}, options); + at::Tensor b_aten = at::rand({K, N}, options); + auto output = + hie.runWithInput({{a, a_aten}, {b, b_aten}})[0].as(); + + torch::cuda::synchronize(); + auto expected_output = at::matmul(a_aten, b_aten); + EXPECT_TRUE(torch::allclose(output, expected_output, 1e-2, 1e-2)) + << "Output: " << output << " Expected: " << expected_output; +} + +TEST_F(HirLowerStreamTest, Matmul_K) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* a = makeContigTensor(2); + TensorView* b = makeContigTensor(2); + TensorView* c = matmul(a, b); + hic->addInput(a); + hic->addInput(b); + hic->addOutput(c); + hic->pushBackTopLevelExprs(c->definition()); + a->setMemoryType(MemoryType::Global); + b->setMemoryType(MemoryType::Global); + c->setMemoryType(MemoryType::Global); + c->axis(-1)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW(preseg_passes::OptimizationPass< + preseg_passes::StreamParallelType>::runPass(hic.get())); +} + +// We don's support PostOnStream because it does not support well pre-allocated +// outputs. 
There is no strong motivation to support PostOnStream +TEST_F(HirLowerStreamTest, DoNotSupportPostOnStream) { + const std::vector input_sizes = {4, 8, 32}; + const std::vector output_sizes = { + input_sizes.at(1), input_sizes.at(2)}; + + auto get_fusion = [input_sizes]() -> std::unique_ptr { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + auto tv0 = makeConcreteTensor(input_sizes); + auto tv1 = add(tv0, tv0); + auto tv2 = sum(tv1, {0}); + fusion->addInput(tv0); + fusion->addOutput(tv2); + return fusion; + }; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto host_unit = IrBuilder::create(get_fusion()); + + IrCloner ir_cloner(hic.get()); + TensorView* input = + ir_cloner.clone(host_unit->fusion_to_execute()->inputs().at(0)) + ->as(); + TensorView* output = + ir_cloner.clone(host_unit->fusion_to_execute()->outputs().at(0)) + ->as(); + + std::vector inputs = {input}; + std::vector outputs = {output}; + auto post_on_stream = + IrBuilder::create(host_unit, inputs, outputs); + + hic->pushBackTopLevelExprs(post_on_stream); + + hic->addInput(input); + hic->addOutput(output); + + output->axis(-1)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW(preseg_passes::OptimizationPass< + preseg_passes::StreamParallelType>::runPass(hic.get())); +} + +} // namespace hir + +using MultiDeviceExecutorLowerStreamTest = NVFuserTest; + +TEST_F(MultiDeviceExecutorLowerStreamTest, InputsAreNotStreamParallelized) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv = makeContigTensor(2); + fusion->addInput(tv); + tv->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW( + MultiDeviceExecutor(std::move(fusion), Communicator::getInstance())); +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, Split) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + fusion->addInput(tv0); + fusion->addOutput(tv1); + 
tv1->split(0, 2); + tv1->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW( + MultiDeviceExecutor(std::move(fusion), Communicator::getInstance())); +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, Merge) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + fusion->addInput(tv0); + fusion->addOutput(tv1); + tv1->merge(0, 1); + tv1->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW( + MultiDeviceExecutor(std::move(fusion), Communicator::getInstance())); +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, SingleSetOp) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + fusion->addInput(tv0); + fusion->addOutput(tv1); + tv1->axis(0)->parallelize(ParallelType::Stream); + + MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + EXPECT_EQ(container->topLevelExprs().size(), 2); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8}, options); + auto output = + executor.runWithInput(KernelArgumentHolder({input}))[0].as(); + + torch::cuda::synchronize(); + EXPECT_TRUE(output.equal(input)) + << "Output: " << output << " Expected: " << input; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, SingleSetOpNonOutermost) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + fusion->addInput(tv0); + fusion->addOutput(tv1); + tv1->axis(1)->parallelize(ParallelType::Stream); + + MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + 
EXPECT_EQ(container->topLevelExprs().size(), 2); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8}, options); + auto output = + executor.runWithInput(KernelArgumentHolder({input}))[0].as(); + + torch::cuda::synchronize(); + EXPECT_TRUE(output.equal(input)) + << "Output: " << output << " Expected: " << input; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, SingleBinaryOp) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = makeContigTensor(2); + TensorView* tv2 = add(tv0, tv1); + fusion->addInput(tv0); + fusion->addInput(tv1); + fusion->addOutput(tv2); + tv2->axis(0)->parallelize(ParallelType::Stream); + + MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + EXPECT_EQ(container->topLevelExprs().size(), 2); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + + at::Tensor tv0_input = at::rand({4, 4}, options); + at::Tensor tv1_input = at::rand({4, 4}, options); + auto output = + executor.runWithInput(KernelArgumentHolder({tv0_input, tv1_input}))[0] + .as(); + auto expected_output = tv0_input + tv1_input; + EXPECT_TRUE(output.equal(expected_output)) + << "Output: " << output << "Expected: " << expected_output; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, TwoSetOps) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + TensorView* tv2 = set(tv1); + fusion->addInput(tv0); + fusion->addOutput(tv2); + tv1->axis(0)->parallelize(ParallelType::Stream); + tv2->axis(0)->parallelize(ParallelType::Stream); + + 
MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + EXPECT_EQ(container->topLevelExprs().size(), 3); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(2)->isA()); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8}, options); + auto output = + executor.runWithInput(KernelArgumentHolder({input}))[0].as(); + + torch::cuda::synchronize(); + EXPECT_TRUE(output.equal(input)) + << "Output: " << output << " Expected: " << input; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, ThreeSetOpsWithDisjointsForLoops) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + TensorView* tv2 = set(tv1); + TensorView* tv3 = set(tv2); + fusion->addInput(tv0); + fusion->addOutput(tv3); + tv1->axis(0)->parallelize(ParallelType::Stream); + tv3->axis(0)->parallelize(ParallelType::Stream); + + MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + EXPECT_EQ(container->topLevelExprs().size(), 5); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(2)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(3)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(4)->isA()); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8}, options); + auto output = + executor.runWithInput(KernelArgumentHolder({input}))[0].as(); + + torch::cuda::synchronize(); + EXPECT_TRUE(output.equal(input)) + << "Output: " << output << " Expected: " << input; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, ReductionUnsupported) { + 
auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = sum(tv0, {0}); + fusion->addInput(tv0); + fusion->addOutput(tv1); + tv1->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW( + MultiDeviceExecutor(std::move(fusion), Communicator::getInstance())); +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, Reduction) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(3); + TensorView* tv1 = sum(tv0, {2}); + fusion->addInput(tv0); + fusion->addOutput(tv1); + tv1->axis(0)->parallelize(ParallelType::Stream); + + MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + EXPECT_EQ(container->topLevelExprs().size(), 2); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8, 2}, options); + auto output = + executor.runWithInput(KernelArgumentHolder({input}))[0].as(); + + torch::cuda::synchronize(); + auto expected_output = input.sum(2); + EXPECT_TRUE(output.equal(expected_output)) + << "Output: " << output << " Expected: " << expected_output; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, Matmul_M) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* a = makeContigTensor(2); + TensorView* b = makeContigTensor(2); + TensorView* c = matmul(a, b); + fusion->addInput(a); + fusion->addInput(b); + fusion->addOutput(c); + c->axis(0)->parallelize(ParallelType::Stream); + + MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + EXPECT_EQ(container->topLevelExprs().size(), 2); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + 
EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + + constexpr int64_t M = 8, K = 4, N = 2; + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor a_aten = at::rand({M, K}, options); + at::Tensor b_aten = at::rand({K, N}, options); + auto output = executor.runWithInput(KernelArgumentHolder({a_aten, b_aten}))[0] + .as(); + + torch::cuda::synchronize(); + auto expected_output = at::matmul(a_aten, b_aten); + EXPECT_TRUE(torch::allclose(output, expected_output, 1e-2, 1e-2)) + << "Output: " << output << " Expected: " << expected_output; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, BatchedMatmul) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* a = makeContigTensor(3); + TensorView* b = makeContigTensor(2); + TensorView* c = matmul(a, b); + fusion->addInput(a); + fusion->addInput(b); + fusion->addOutput(c); + c->axis(0)->parallelize(ParallelType::Stream); + + MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + EXPECT_EQ(container->topLevelExprs().size(), 2); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + + constexpr int64_t B = 16, M = 8, K = 4, N = 2; + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor a_aten = at::rand({B, M, K}, options); + at::Tensor b_aten = at::rand({K, N}, options); + auto output = executor.runWithInput(KernelArgumentHolder({a_aten, b_aten}))[0] + .as(); + + torch::cuda::synchronize(); + auto expected_output = at::matmul(a_aten, b_aten); + EXPECT_TRUE(torch::allclose(output, expected_output, 1e-2, 1e-2)) + << "Output: " << output << " Expected: " << expected_output; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, Matmul_N) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* a = makeContigTensor(2); + TensorView* b = makeContigTensor(2); + TensorView* c = matmul(a, 
b); + fusion->addInput(a); + fusion->addInput(b); + fusion->addOutput(c); + c->axis(1)->parallelize(ParallelType::Stream); + + MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + EXPECT_EQ(container->topLevelExprs().size(), 2); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + + constexpr int64_t M = 8, K = 4, N = 2; + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor a_aten = at::rand({M, K}, options); + at::Tensor b_aten = at::rand({K, N}, options); + auto output = executor.runWithInput(KernelArgumentHolder({a_aten, b_aten}))[0] + .as(); + + torch::cuda::synchronize(); + auto expected_output = at::matmul(a_aten, b_aten); + EXPECT_TRUE(torch::allclose(output, expected_output, 1e-2, 1e-2)) + << "Output: " << output << " Expected: " << expected_output; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, Matmul_K) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* a = makeContigTensor(2); + TensorView* b = makeContigTensor(2); + TensorView* c = matmul(a, b); + fusion->addInput(a); + fusion->addInput(b); + fusion->addOutput(c); + c->axis(-1)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW( + MultiDeviceExecutor(std::move(fusion), Communicator::getInstance())); +} + +// We only support Stream parallel type on ops that support pre-allocated +// output, which means they need a special handle in HostIrEvaluator and they +// need to be lowered as a Host Ir Op in the TopLevelExpression, no a +// PostOnStream(HostUnit(.)) See HostIrLower::isLoweredAsStandaloneHostOp and +// the test HirLowerStreamTest.DoNotSupportPostOnStream +TEST_F(MultiDeviceExecutorLowerStreamTest, DoNotSupportPostOnStream) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = + abs(tv0); // arbitrary 
example of an unsupported op. There is no deep + // reason why we not support it -- if needed we could widen the + // support. But I wanna make sure that an unsupported op do not + // silently fails + fusion->addInput(tv0); + fusion->addOutput(tv1); + tv1->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW( + MultiDeviceExecutor(std::move(fusion), Communicator::getInstance())); +} + +} // namespace nvfuser diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp index 88286d6e4c0..e9db27cfd7f 100644 --- a/tests/cpp/test_multidevice_host_ir.cpp +++ b/tests/cpp/test_multidevice_host_ir.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include namespace nvfuser { @@ -362,6 +363,11 @@ TEST_F(P2PCommHostIrTest, CoalescedRingPairwiseExchange) { using OverlapDistributedMatmulTest = MultiDeviceTest; TEST_F(OverlapDistributedMatmulTest, AG_matmul) { + // Disable StreamParallelType pass temporarily as proper stream lowering gets + // implemented + preseg_passes::OptimizationPassGuard guard( + false); + constexpr int64_t M = 32768; constexpr int64_t K = 32768; constexpr int64_t N = 1024; @@ -417,6 +423,10 @@ TEST_F(OverlapDistributedMatmulTest, AG_matmul) { } TEST_F(OverlapDistributedMatmulTest, AG_linear) { + // Disable StreamParallelType pass tempor + preseg_passes::OptimizationPassGuard guard( + false); + constexpr int64_t M = 32768; constexpr int64_t K = 32768; constexpr int64_t N = 1024; From e8869419dd152f5d6f71f505830e35b97cb9f274 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 06:29:12 -0700 Subject: [PATCH 18/38] improve comments --- csrc/preseg_passes/stream_parallel_type.cpp | 159 ++++++++++++-------- csrc/preseg_passes/stream_parallel_type.h | 11 +- 2 files changed, 109 insertions(+), 61 deletions(-) diff --git a/csrc/preseg_passes/stream_parallel_type.cpp b/csrc/preseg_passes/stream_parallel_type.cpp index 5a814c9a59a..12fe2f6a285 100644 --- a/csrc/preseg_passes/stream_parallel_type.cpp 
+++ b/csrc/preseg_passes/stream_parallel_type.cpp @@ -20,8 +20,9 @@ namespace nvfuser::preseg_passes { -// returns the first stream axis in the domain, or nullptr if there is none. -// Throws if two axis are stream parallelized +// Helper function to find the first stream-parallelized axis in a domain. +// This function throws if multiple stream-parallelized axes are found (only one +// is allowed) IterDomain* getStreamAxis(const std::vector& domain) { IterDomain* ret = nullptr; for (auto id : domain) { @@ -38,13 +39,27 @@ IterDomain* getStreamAxis(const std::vector& domain) { return ret; } - - -// TODO: ideally we should look at the dag and use the segmenter. Here we take -// advantage of the linear structure of HostIrContainer::topLevelExprs to -// greedily merge the adjacent compatible stream for-loop bodies +// StreamParallelType pass implementation. +// This pass handles stream parallelization of operations in a fusion. +// It works by: +// 1. Identifying stream-parallelized axes in tensor operations +// 2. Grouping compatible operations into stream-parallel for-loops +// 3. Setting up proper stream synchronization and management +// +// The pass ensures that: +// - Input tensors don't have stream axes +// - Only one stream axis exists per tensor +// - Stream axes are properly synchronized +// - Operations are correctly grouped into stream-parallel regions +// - The resulting HostIrContainer's top level expression is valid for execution +// and does not contain any stream axes +// +// TODO: Here, we assume that the fusion input is a HostIrContainer and use the +// linear structure of the HostIrContainer::topLevelExpr to greedily merge the +// adjacent compatible stream for-loop bodies. Ideally we should look at the dag +// and use the segmenter. 
void StreamParallelType::runPass(Fusion* fusion) { - // check that there are no stream axes in the inputs + // Verify that input tensors don't have stream axes NVF_CHECK( std::all_of( fusion->inputs().begin(), @@ -56,62 +71,71 @@ void StreamParallelType::runPass(Fusion* fusion) { }), "Expected no stream axis in the TensorView inputs."); - FusionGuard fg(fusion); // set as current container to register the newly - // created for-loops + // Set up the fusion environment and build the ID model + FusionGuard fg(fusion); hir::HostIrContainer* hic = dynamic_cast(fusion); NVF_CHECK(hic, "Expected HostIrContainer"); - // needed ? + IdModel id_model(fusion); id_model.buildAlmostExactGraph(); std::vector new_top_level_exprs; - // Step 1. Find the segments of expressions that can be merged into a single - // stream for-loop At the end of this step, new_top_level_exprs contains a - // list of expressions including newly created for-loops that will represent - // the stream parallelization, and the relevant expressions grouped inside the - // for-loops bodies. 
+ + // Step 1: Group expressions into stream-parallel regions + // This step identifies which expressions can be merged into single stream + // for-loops + // + // After this step, new_top_level_exprs contains a + // list of expressions including newly created for-loops representing + // the stream parallelization containing and the relevant expressions for (auto expr : hic->topLevelExprs()) { - // we only support exprs having at most 1 output for now + // Skip expressions with no outputs if (expr->outputs().size() == 0) { new_top_level_exprs.push_back(expr); continue; } + + // Verify single output constraint NVF_CHECK( expr->outputs().size() == 1, "Each expr should have at most one output."); + + // Get the output tensor and check for stream parallelization TensorView* output = expr->output(0)->as(); - // retrieves the Loop IterDomain that is stream parallelized, if any IterDomain* stream_axis = getStreamAxis(output->getLoopDomain()); + + // If no stream axis, keep expression as is if (stream_axis == nullptr) { - // if the consumer is not stream parallelized, it means the expr need not - // be inside a stream for-loop new_top_level_exprs.push_back(expr); continue; } + + // Verify expression can be handled as a standalone host operation NVF_ERROR( HostIrLower::isLoweredAsStandaloneHostOp(expr), "Stream parallel type not supported for expr ", expr); - // find the corresponding stream axis but in the Logical (and not Loop - // Domain) + + // Find the stream axis in the logical (and not loop) domain auto it_logical_stream_axis = std::find( output->getLogicalDomain().begin(), output->getLogicalDomain().end(), stream_axis); - // for now we do not support split/merge stream axis + + // Verify stream axis is not split/merged NVF_ERROR( it_logical_stream_axis != output->getLogicalDomain().end(), "Cannot stream parallelize on a split/merge axis ", stream_axis); - // we don't support reducing or broadcasting a stream axis + + // Verify stream axis is an iteration axis (not 
reduction/broadcast) NVF_CHECK( stream_axis->getIterType() == IterType::Iteration, "Stream axis ", stream_axis, " should be an iteration axis."); - // check if the current expr can be merged with the previous stream for-loop - // We consider the previous expression to check whether the expr should - // create a new stream for-loop or be integrated into the previous one + + // Check if expression can be merged with previous stream for-loop if (!new_top_level_exprs.empty() && new_top_level_exprs.back()->isA() && id_model.idGraph(IdMappingMode::ALMOSTEXACT) @@ -119,21 +143,16 @@ void StreamParallelType::runPass(Fusion* fusion) { .strictAreMapped( stream_axis, new_top_level_exprs.back()->as()->iterDomain())) { - // merge with previous for-loop + // Merge with existing for-loop new_top_level_exprs.back()->as()->body().push_back(expr); } else { - // create a new for-loop - auto* j = IrBuilder::create( - DataType::Index); // running index of the for-loop - auto* start = hic->zeroVal(); - auto* stop = stream_axis->extent(); - auto* step = hic->oneVal(); + // Create new for-loop for stream parallelization auto* for_loop = IrBuilder::create( stream_axis, - /*index=*/j, - start, - stop, - step, + /*index=*/IrBuilder::create(DataType::Index), + /*start=*/hic->zeroVal(), + /*stop=*/stream_axis->extent(), + /*step=*/hic->oneVal(), /*vectorize=*/false, /*vectorize_shift=*/nullptr, /*unroll_required=*/false, @@ -145,30 +164,36 @@ void StreamParallelType::runPass(Fusion* fusion) { } } - // Step 2. Setup each for loop's body by Slicing the tensors. + // Step 2: Process each for-loop's body by slicing tensors + // This step handles the actual tensor slicing for stream parallelization std::vector top_level_exprs = std::move(new_top_level_exprs); new_top_level_exprs.clear(); + for (auto top_level_expr : top_level_exprs) { - // TODO: change in place? 
consr issue if (!top_level_expr->isA()) { new_top_level_exprs.push_back(top_level_expr); continue; } + auto* for_loop = top_level_expr->as(); - // this will contain the new body of the current for-loop std::vector new_loop_body; + // Process each expression in the loop body std::vector current_loop_body = for_loop->body().exprs(); for (auto it_expr = current_loop_body.begin(); it_expr != current_loop_body.end(); ++it_expr) { Expr* expr = *it_expr; + + // Process input tensors for (auto* input : ir_utils::filterByType(expr->inputs())) { + // Find stream axis index in input tensor int64_t input_stream_id_logical_index = -1; for (auto id : input->getLoopDomain()) { if (id_model.idGraph(IdMappingMode::ALMOSTEXACT) .disjointValSets() .strictAreMapped(for_loop->iterDomain(), id)) { + // Verify only one stream axis exists NVF_CHECK( input_stream_id_logical_index == -1, "Expected at most one axis mapping to the stream axis ", @@ -177,6 +202,8 @@ void StreamParallelType::runPass(Fusion* fusion) { input, " loop's domain ", input->getLoopDomain()); + + // Find stream axis in logical domain auto it_input_stream_id_logical = std::find( input->getLogicalDomain().begin(), input->getLogicalDomain().end(), @@ -193,15 +220,21 @@ void StreamParallelType::runPass(Fusion* fusion) { input->getLogicalDomain().begin(), it_input_stream_id_logical); } } + + // Skip if no stream axis found if (input_stream_id_logical_index == -1) { continue; } + + // Create sliced tensor for current stream iteration TensorView* input_j = select( input, input_stream_id_logical_index, for_loop->index(), /*keep_reduction_axis=*/true); new_loop_body.push_back(input_j->definition()); + + // Update all expressions using this input for (auto it_running_expr = current_loop_body.begin(); it_running_expr != current_loop_body.end(); ++it_running_expr) { @@ -216,12 +249,15 @@ void StreamParallelType::runPass(Fusion* fusion) { } } + // Process output tensors for (auto* output : ir_utils::filterByType(expr->outputs())) { 
+ // Find stream axis index in output tensor int64_t output_stream_id_logical_index = -1; for (auto id : output->getLoopDomain()) { if (id_model.idGraph(IdMappingMode::ALMOSTEXACT) .disjointValSets() .strictAreMapped(for_loop->iterDomain(), id)) { + // Verify only one stream axis exists NVF_CHECK( output_stream_id_logical_index == -1, "Expected at most one axis mapping to the stream axis ", @@ -230,6 +266,8 @@ void StreamParallelType::runPass(Fusion* fusion) { output, " loop's domain ", output->getLoopDomain()); + + // Find stream axis in logical domain auto it_output_stream_id_logical = std::find( output->getLogicalDomain().begin(), output->getLogicalDomain().end(), @@ -247,17 +285,25 @@ void StreamParallelType::runPass(Fusion* fusion) { it_output_stream_id_logical); } } + + // Skip if no stream axis found if (output_stream_id_logical_index == -1) { continue; } + + // Create sliced tensor for current stream iteration TensorView* output_j = select( output, output_stream_id_logical_index, for_loop->index(), /*keep_reduction_axis=*/true); + + // Allocate memory for the output tensor new_top_level_exprs.push_back( IrBuilder::create(output, MemoryType::Global)); new_loop_body.push_back(output_j->definition()); + + // Update all expressions using this output for (auto it_running_expr = current_loop_body.begin(); it_running_expr != current_loop_body.end(); ++it_running_expr) { @@ -265,6 +311,7 @@ void StreamParallelType::runPass(Fusion* fusion) { for (auto* running_output : ir_utils::filterByType(running_expr->outputs())) { if (running_output == output) { + // Create alias for the sliced output TensorView* output_j_alias = ops::newValLike( output_j, output_j->dtype(), /*keep_reduction_axis=*/true) @@ -272,19 +319,14 @@ void StreamParallelType::runPass(Fusion* fusion) { hic->markAlias(output_j, output_j_alias); *it_running_expr = ir_utils::transferDefinitionToNewOutputs( running_expr, {output_j_alias}); - if (Communication* comm = dynamic_cast( - 
output_j_alias->definition()); - comm && comm->type() == CommunicationType::Allgather) { - std::cout << "HERE, with expr:" << *it_running_expr - << std::endl; - } } } } } new_loop_body.push_back(*it_expr); } - // reseting the for-loop body + + // Update for-loop body with processed expressions for_loop->body().clear(); for (auto* expr : new_loop_body) { for_loop->body().push_back(expr); @@ -292,8 +334,7 @@ void StreamParallelType::runPass(Fusion* fusion) { new_top_level_exprs.push_back(top_level_expr); } - // Step 3. Finalize the for-loop bodies by adding the stream setup and - // synchronization + // Step 3: Add stream management and synchronization for (auto* top_level_expr : new_top_level_exprs) { if (!top_level_expr->isA()) { continue; @@ -301,46 +342,44 @@ void StreamParallelType::runPass(Fusion* fusion) { auto* for_loop = top_level_expr->as(); std::vector new_loop_body; - // Get the current stream to later synchronize subsequent new streams + // Get current stream for later synchronization auto* get_current_stream = IrBuilder::create(); hir::Stream* original_stream = get_current_stream->stream(); new_loop_body.push_back(get_current_stream); - // set the stream to the one corresponding to the current for-loop index - auto* j = for_loop->index(); + // Set up stream for current iteration auto* number_of_streams = IrBuilder::create("numberOfStreams", DataType::Int); - auto* stream_index = mod(j, number_of_streams); + auto* stream_index = mod(for_loop->index(), number_of_streams); auto* stream = IrBuilder::create(stream_index); auto* set_stream = IrBuilder::create(stream); new_loop_body.push_back(set_stream); - // sync the new stream with the original stream + // Synchronize with original stream auto* initial_sync_stream = IrBuilder::create(original_stream); new_loop_body.push_back(initial_sync_stream); - // add the actual exprs to the for-loop body + // Add the actual computation expressions for (auto* expr : for_loop->body().exprs()) { 
new_loop_body.push_back(expr); } - // set back the original stream + // Restore original stream and synchronize auto* set_back_original_stream = IrBuilder::create(original_stream); new_loop_body.push_back(set_back_original_stream); - // synchronize original stream with the for-loop's streams auto* sync_stream = IrBuilder::create(stream); new_loop_body.push_back(sync_stream); - // reset the for-loop's body to the one we constructed. + // Update for-loop body with stream management for_loop->body().clear(); for (auto* expr : new_loop_body) { for_loop->body().push_back(expr); } } - // reset hic topLevelExprs to new_top_level_exprs + // Update the container's top-level expressions hic->resetTopLevelExprs(new_top_level_exprs); } diff --git a/csrc/preseg_passes/stream_parallel_type.h b/csrc/preseg_passes/stream_parallel_type.h index a9600809e21..9c0c39efe87 100644 --- a/csrc/preseg_passes/stream_parallel_type.h +++ b/csrc/preseg_passes/stream_parallel_type.h @@ -12,7 +12,16 @@ namespace nvfuser::preseg_passes { -// A pass used in HostIrLower that takes a HostIrContainer as input, reads the TensorView's ParallelType::Stream, and modify the the HostIrContainer's top level expressions with the corresponding Host For Loops, which bodies contain stream assignement, selecting on tensor's axis, and the exprs on those sliced tensors. After this pass, the ParallelType::Stream is removed from the TensorView's axis. +// A pass used in HostIrLower that takes a HostIrContainer as input, reads the +// TensorView's ParallelType::Stream, and modify the the HostIrContainer's top +// level expressions with the corresponding Host For Loops, which bodies contain +// stream assignement, selecting on tensor's axis, and the exprs on those sliced +// tensors. After this pass, the ParallelType::Stream is removed from the +// TensorView's axis. +// +// An illustration of the pass can be found in the tests +// `test_host_ir_stream_lowering.cpp` +// with the option `NVFUSER_DUMP=host_ir`. 
class StreamParallelType : public OptimizationPass { friend class OptimizationPass; From b6c54f2e47246535f6b2ea915bf236dabe54f087 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 16 Apr 2025 03:48:24 -0700 Subject: [PATCH 19/38] fix rebase --- csrc/preseg_passes/stream_parallel_type.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/preseg_passes/stream_parallel_type.cpp b/csrc/preseg_passes/stream_parallel_type.cpp index 12fe2f6a285..82f1b3d0e67 100644 --- a/csrc/preseg_passes/stream_parallel_type.cpp +++ b/csrc/preseg_passes/stream_parallel_type.cpp @@ -112,7 +112,7 @@ void StreamParallelType::runPass(Fusion* fusion) { // Verify expression can be handled as a standalone host operation NVF_ERROR( - HostIrLower::isLoweredAsStandaloneHostOp(expr), + HostIrLower::isLowerableAsStandaloneHostOp(expr), "Stream parallel type not supported for expr ", expr); From 32a8d552befc8deb4f4fff1deb3d88558bb88074 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 16 Apr 2025 08:23:11 -0700 Subject: [PATCH 20/38] temporarily disable stream pass also in the python test --- csrc/python_frontend/fusion_definition.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/csrc/python_frontend/fusion_definition.cpp b/csrc/python_frontend/fusion_definition.cpp index c48abc9dbdc..ad7b8baf2d6 100644 --- a/csrc/python_frontend/fusion_definition.cpp +++ b/csrc/python_frontend/fusion_definition.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -452,6 +453,10 @@ std::pair> FusionDefinition:: if (scheds->multi_device_executor == nullptr) { MultiDeviceExecutorParams params; params.lower.communicator_backend = backend_type_; + // Disable StreamParallelType pass temporarily as proper stream lowering gets + // implemented + preseg_passes::OptimizationPassGuard guard( + false); scheds->multi_device_executor = std::make_unique( std::make_unique(*scheds->preschedFusion()), Communicator::getInstance(), From 
afbd020b9e2e6b9461bb335532d44bb884ebc533 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 16 Apr 2025 08:38:34 -0700 Subject: [PATCH 21/38] lint --- csrc/python_frontend/fusion_definition.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/csrc/python_frontend/fusion_definition.cpp b/csrc/python_frontend/fusion_definition.cpp index ad7b8baf2d6..950b2bd148a 100644 --- a/csrc/python_frontend/fusion_definition.cpp +++ b/csrc/python_frontend/fusion_definition.cpp @@ -453,10 +453,10 @@ std::pair> FusionDefinition:: if (scheds->multi_device_executor == nullptr) { MultiDeviceExecutorParams params; params.lower.communicator_backend = backend_type_; - // Disable StreamParallelType pass temporarily as proper stream lowering gets - // implemented - preseg_passes::OptimizationPassGuard guard( - false); + // Disable StreamParallelType pass temporarily as proper stream lowering + // gets implemented + preseg_passes::OptimizationPassGuard + guard(false); scheds->multi_device_executor = std::make_unique( std::make_unique(*scheds->preschedFusion()), Communicator::getInstance(), From 165bd1bab236119a85e9e4dd5843424887960470 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 16 Apr 2025 08:57:55 -0700 Subject: [PATCH 22/38] move stream_parallel_type to host_ir/pass folder --- CMakeLists.txt | 2 +- csrc/host_ir/lower.cpp | 2 +- csrc/{preseg_passes => host_ir/pass}/stream_parallel_type.cpp | 2 +- csrc/{preseg_passes => host_ir/pass}/stream_parallel_type.h | 0 csrc/python_frontend/fusion_definition.cpp | 2 +- tests/cpp/test_host_ir_stream_lowering.cpp | 2 +- tests/cpp/test_multidevice_host_ir.cpp | 2 +- 7 files changed, 6 insertions(+), 6 deletions(-) rename csrc/{preseg_passes => host_ir/pass}/stream_parallel_type.cpp (99%) rename csrc/{preseg_passes => host_ir/pass}/stream_parallel_type.h (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 96a447055ea..dcf94d4a3a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -212,7 +212,7 @@ list(APPEND 
NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/preseg_passes/remove_empty.cpp ${NVFUSER_SRCS_DIR}/preseg_passes/reorder_sharded_axis.cpp ${NVFUSER_SRCS_DIR}/preseg_passes/segment_inplace_update.cpp - ${NVFUSER_SRCS_DIR}/preseg_passes/stream_parallel_type.cpp + ${NVFUSER_SRCS_DIR}/host_ir/pass/stream_parallel_type.cpp ${NVFUSER_SRCS_DIR}/preseg_passes/translate_no_reduction_matmul_to_mul_squeeze.cpp ${NVFUSER_SRCS_DIR}/preseg_passes/translate_repeat_to_expand.cpp ${NVFUSER_SRCS_DIR}/rng.cpp diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 1a74d9a9f01..c36fae09e0a 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -7,6 +7,7 @@ // clang-format on #include #include +#include #include #include #include @@ -19,7 +20,6 @@ #include #include #include -#include #include #include diff --git a/csrc/preseg_passes/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp similarity index 99% rename from csrc/preseg_passes/stream_parallel_type.cpp rename to csrc/host_ir/pass/stream_parallel_type.cpp index 82f1b3d0e67..f1419e3c626 100644 --- a/csrc/preseg_passes/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -16,7 +17,6 @@ #include #include #include -#include namespace nvfuser::preseg_passes { diff --git a/csrc/preseg_passes/stream_parallel_type.h b/csrc/host_ir/pass/stream_parallel_type.h similarity index 100% rename from csrc/preseg_passes/stream_parallel_type.h rename to csrc/host_ir/pass/stream_parallel_type.h diff --git a/csrc/python_frontend/fusion_definition.cpp b/csrc/python_frontend/fusion_definition.cpp index 950b2bd148a..d6e552032b1 100644 --- a/csrc/python_frontend/fusion_definition.cpp +++ b/csrc/python_frontend/fusion_definition.cpp @@ -7,11 +7,11 @@ // clang-format on #include #include +#include #include #include #include #include -#include #include #include #include diff --git a/tests/cpp/test_host_ir_stream_lowering.cpp 
b/tests/cpp/test_host_ir_stream_lowering.cpp index 9f3d9f432ed..f6d74caea87 100644 --- a/tests/cpp/test_host_ir_stream_lowering.cpp +++ b/tests/cpp/test_host_ir_stream_lowering.cpp @@ -11,12 +11,12 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp index e9db27cfd7f..7b233bc47db 100644 --- a/tests/cpp/test_multidevice_host_ir.cpp +++ b/tests/cpp/test_multidevice_host_ir.cpp @@ -9,9 +9,9 @@ #include #include #include +#include #include #include -#include #include namespace nvfuser { From b55d4e73787f3ca3b841ea2baf769021a85e1199 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 18 Apr 2025 02:35:58 -0700 Subject: [PATCH 23/38] minor comment --- csrc/host_ir/executor.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index b973c7f3277..12cf344e549 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -703,8 +703,7 @@ void HostIrEvaluator::handle(BinaryOp* binary_op) { at::div_out(output, lhs, rhs); break; default: - NVF_CHECK( - false, + NVF_THROW( "Unexpected operator type: ", binary_op->getBinaryOpType(), " in ", @@ -744,8 +743,7 @@ void HostIrEvaluator::handle(ReductionOp* reduction_op) { at::amin_out(output, input, reduction_axes); return; default: - NVF_CHECK( - false, + NVF_THROW( "Unexpected operator type: ", reduction_op->getReductionOpType(), " in ", From 1181eac5f93fdc1ff91a9f6182246950e2825008 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 18 Apr 2025 04:41:13 -0700 Subject: [PATCH 24/38] minor improvements and cleanup --- csrc/host_ir/pass/stream_parallel_type.cpp | 143 ++++++++++++--------- csrc/preseg_passes/optimization_pass.h | 2 - csrc/type.cpp | 2 +- 3 files changed, 84 insertions(+), 63 deletions(-) diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp 
b/csrc/host_ir/pass/stream_parallel_type.cpp index f1419e3c626..e72fb2ac5ee 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -39,55 +39,12 @@ IterDomain* getStreamAxis(const std::vector& domain) { return ret; } -// StreamParallelType pass implementation. -// This pass handles stream parallelization of operations in a fusion. -// It works by: -// 1. Identifying stream-parallelized axes in tensor operations -// 2. Grouping compatible operations into stream-parallel for-loops -// 3. Setting up proper stream synchronization and management -// -// The pass ensures that: -// - Input tensors don't have stream axes -// - Only one stream axis exists per tensor -// - Stream axes are properly synchronized -// - Operations are correctly grouped into stream-parallel regions -// - The resulting HostIrContainer's top level expression is valid for execution -// and does not contain any stream axes -// -// TODO: Here, we assume that the fusion input is a HostIrContainer and use the -// linear structure of the HostIrContainer::topLevelExpr to greedily merge the -// adjacent compatible stream for-loop bodies. Ideally we should look at the dag -// and use the segmenter. 
-void StreamParallelType::runPass(Fusion* fusion) { - // Verify that input tensors don't have stream axes - NVF_CHECK( - std::all_of( - fusion->inputs().begin(), - fusion->inputs().end(), - [](Val* input) { - auto input_tv = dynamic_cast(input); - return input_tv == nullptr || - getStreamAxis(input_tv->getLoopDomain()) == nullptr; - }), - "Expected no stream axis in the TensorView inputs."); - - // Set up the fusion environment and build the ID model - FusionGuard fg(fusion); - hir::HostIrContainer* hic = dynamic_cast(fusion); - NVF_CHECK(hic, "Expected HostIrContainer"); - - IdModel id_model(fusion); - id_model.buildAlmostExactGraph(); - +// Step 1: Group expressions into stream-parallel regions +std::vector groupStreamParallelRegions( + hir::HostIrContainer* hic, + const IdModel& id_model) { std::vector new_top_level_exprs; - // Step 1: Group expressions into stream-parallel regions - // This step identifies which expressions can be merged into single stream - // for-loops - // - // After this step, new_top_level_exprs contains a - // list of expressions including newly created for-loops representing - // the stream parallelization containing and the relevant expressions for (auto expr : hic->topLevelExprs()) { // Skip expressions with no outputs if (expr->outputs().size() == 0) { @@ -130,15 +87,16 @@ void StreamParallelType::runPass(Fusion* fusion) { // Verify stream axis is an iteration axis (not reduction/broadcast) NVF_CHECK( - stream_axis->getIterType() == IterType::Iteration, + stream_axis->getIterType() == IterType::Iteration || + stream_axis->getIterType() == IterType::Broadcast, "Stream axis ", stream_axis, - " should be an iteration axis."); + " should be an iteration or broadcast axis."); // Check if expression can be merged with previous stream for-loop if (!new_top_level_exprs.empty() && new_top_level_exprs.back()->isA() && - id_model.idGraph(IdMappingMode::ALMOSTEXACT) + id_model.idGraph(IdMappingMode::BROADCAST) .disjointValSets() .strictAreMapped( 
stream_axis, @@ -149,7 +107,7 @@ void StreamParallelType::runPass(Fusion* fusion) { // Create new for-loop for stream parallelization auto* for_loop = IrBuilder::create( stream_axis, - /*index=*/IrBuilder::create(DataType::Index), + /*index=*/NamedScalar::getParallelIndex(ParallelType::Stream), /*start=*/hic->zeroVal(), /*stop=*/stream_axis->extent(), /*step=*/hic->oneVal(), @@ -164,10 +122,15 @@ void StreamParallelType::runPass(Fusion* fusion) { } } - // Step 2: Process each for-loop's body by slicing tensors - // This step handles the actual tensor slicing for stream parallelization - std::vector top_level_exprs = std::move(new_top_level_exprs); - new_top_level_exprs.clear(); + return new_top_level_exprs; +} + +// Step 2: Process for-loop bodies by slicing tensors +std::vector processForLoopBodies( + hir::HostIrContainer* hic, + const IdModel& id_model, + std::vector top_level_exprs) { + std::vector new_top_level_exprs; for (auto top_level_expr : top_level_exprs) { if (!top_level_expr->isA()) { @@ -190,7 +153,7 @@ void StreamParallelType::runPass(Fusion* fusion) { // Find stream axis index in input tensor int64_t input_stream_id_logical_index = -1; for (auto id : input->getLoopDomain()) { - if (id_model.idGraph(IdMappingMode::ALMOSTEXACT) + if (id_model.idGraph(IdMappingMode::BROADCAST) .disjointValSets() .strictAreMapped(for_loop->iterDomain(), id)) { // Verify only one stream axis exists @@ -254,7 +217,7 @@ void StreamParallelType::runPass(Fusion* fusion) { // Find stream axis index in output tensor int64_t output_stream_id_logical_index = -1; for (auto id : output->getLoopDomain()) { - if (id_model.idGraph(IdMappingMode::ALMOSTEXACT) + if (id_model.idGraph(IdMappingMode::BROADCAST) .disjointValSets() .strictAreMapped(for_loop->iterDomain(), id)) { // Verify only one stream axis exists @@ -334,9 +297,16 @@ void StreamParallelType::runPass(Fusion* fusion) { new_top_level_exprs.push_back(top_level_expr); } - // Step 3: Add stream management and synchronization - 
for (auto* top_level_expr : new_top_level_exprs) { + return new_top_level_exprs; +} + +// Step 3: Add stream management and synchronization +std::vector addStreamManagement(std::vector top_level_exprs) { + std::vector new_top_level_exprs; + + for (auto* top_level_expr : top_level_exprs) { if (!top_level_expr->isA()) { + new_top_level_exprs.push_back(top_level_expr); continue; } auto* for_loop = top_level_expr->as(); @@ -377,10 +347,63 @@ void StreamParallelType::runPass(Fusion* fusion) { for (auto* expr : new_loop_body) { for_loop->body().push_back(expr); } + new_top_level_exprs.push_back(top_level_expr); } + return new_top_level_exprs; +} + +// StreamParallelType pass implementation. +// This pass handles stream parallelization of operations in a fusion. +// It works by: +// 1. Identifying stream-parallelized axes in tensor operations +// 2. Grouping compatible operations into stream-parallel for-loops +// 3. Setting up proper stream synchronization and management +// +// The pass ensures that: +// - Input tensors don't have stream axes +// - Only one stream axis exists per tensor +// - Stream axes are properly synchronized +// - Operations are correctly grouped into stream-parallel regions +// - The resulting HostIrContainer's top level expression is valid for execution +// and does not contain any stream axes +// +// TODO: Here, we assume that the fusion input is a HostIrContainer and use the +// linear structure of the HostIrContainer::topLevelExpr to greedily merge the +// adjacent compatible stream for-loop bodies. Ideally we should look at the dag +// and use the segmenter. 
+void StreamParallelType::runPass(Fusion* fusion) { + // Verify that input tensors don't have stream axes + NVF_CHECK( + std::all_of( + fusion->inputs().begin(), + fusion->inputs().end(), + [](Val* input) { + auto input_tv = dynamic_cast(input); + return input_tv == nullptr || + getStreamAxis(input_tv->getLoopDomain()) == nullptr; + }), + "Expected no stream axis in the TensorView inputs."); + + // Set up the fusion environment and build the ID model + FusionGuard fg(fusion); + hir::HostIrContainer* hic = dynamic_cast(fusion); + NVF_CHECK(hic, "Expected HostIrContainer"); + + IdModel id_model(fusion); + id_model.buildBroadcastGraph(); + + // Step 1: Group expressions into stream-parallel regions + std::vector top_level_exprs = groupStreamParallelRegions(hic, id_model); + + // Step 2: Process for-loop bodies by slicing tensors + top_level_exprs = processForLoopBodies(hic, id_model, std::move(top_level_exprs)); + + // Step 3: Add stream management and synchronization + top_level_exprs = addStreamManagement(std::move(top_level_exprs)); + // Update the container's top-level expressions - hic->resetTopLevelExprs(new_top_level_exprs); + hic->resetTopLevelExprs(top_level_exprs); } } // namespace nvfuser::preseg_passes diff --git a/csrc/preseg_passes/optimization_pass.h b/csrc/preseg_passes/optimization_pass.h index 53d8a8acd3c..359a4a42742 100644 --- a/csrc/preseg_passes/optimization_pass.h +++ b/csrc/preseg_passes/optimization_pass.h @@ -18,8 +18,6 @@ namespace nvfuser::preseg_passes { -using FusionPass = std::function; - //! [experimental API] //! Base class to unify optimization pass APIs. //! 
OptimizationPass can be turned on/off programmatically with the `setEnabled` diff --git a/csrc/type.cpp b/csrc/type.cpp index d1a5b2abd80..e4a89372d52 100644 --- a/csrc/type.cpp +++ b/csrc/type.cpp @@ -729,7 +729,7 @@ static const char* parallel_type2string(ParallelType t) { case ParallelType::TIDx: return "threadIdx.x"; case ParallelType::Stream: - return "Stream"; + return "StreamIdx"; case ParallelType::Vectorize: return "V"; case ParallelType::Unroll: From cad9bce67e39678d3a972bdaa8098e16d84f0206 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 18 Apr 2025 05:17:31 -0700 Subject: [PATCH 25/38] further refactor of stream pass --- csrc/host_ir/pass/stream_parallel_type.cpp | 192 ++++++++++----------- 1 file changed, 91 insertions(+), 101 deletions(-) diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp index e72fb2ac5ee..d40c9cff147 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -20,9 +20,8 @@ namespace nvfuser::preseg_passes { -// Helper function to find the first stream-parallelized axis in a domain. 
-// This function throws if multiple stream-parallelized axes are found (only one -// is allowed) +namespace { + IterDomain* getStreamAxis(const std::vector& domain) { IterDomain* ret = nullptr; for (auto id : domain) { @@ -39,6 +38,83 @@ IterDomain* getStreamAxis(const std::vector& domain) { return ret; } +void validateStreamAxis(IterDomain* stream_axis, const TensorView* tv) { + // Find the stream axis in the logical domain + auto it_logical_stream_axis = std::find( + tv->getLogicalDomain().begin(), + tv->getLogicalDomain().end(), + stream_axis); + + // Verify stream axis is not split/merged + NVF_ERROR( + it_logical_stream_axis != tv->getLogicalDomain().end(), + "Cannot stream parallelize on a split/merge axis ", + stream_axis); + + // Verify stream axis is an iteration or broadcast axis + NVF_CHECK( + stream_axis->getIterType() == IterType::Iteration || + stream_axis->getIterType() == IterType::Broadcast, + "Stream axis ", + stream_axis, + " should be an iteration or broadcast axis."); +} + +bool areIdsMapped(const IdModel& id_model, IterDomain* id1, IterDomain* id2) { + return id_model.idGraph(IdMappingMode::BROADCAST) + .disjointValSets() + .strictAreMapped(id1, id2); +} + +bool canMergeWithPreviousForLoop( + const std::vector& new_top_level_exprs, + IterDomain* stream_axis, + const IdModel& id_model) { + return !new_top_level_exprs.empty() && + new_top_level_exprs.back()->isA() && + areIdsMapped( + id_model, + stream_axis, + new_top_level_exprs.back()->as()->iterDomain()); +} + +int64_t findStreamAxisIndex( + const TensorView* tv, + IterDomain* stream_axis, + const IdModel& id_model) { + int64_t stream_id_logical_index = -1; + for (auto id : tv->getLoopDomain()) { + if (areIdsMapped(id_model, stream_axis, id)) { + // Verify only one stream axis exists + NVF_CHECK( + stream_id_logical_index == -1, + "Expected at most one axis mapping to the stream axis ", + stream_axis, + " in the tensor ", + tv, + " loop's domain ", + tv->getLoopDomain()); + + // Find stream 
axis in logical domain + auto it_stream_id_logical = std::find( + tv->getLogicalDomain().begin(), + tv->getLogicalDomain().end(), + id); + NVF_CHECK( + it_stream_id_logical != tv->getLogicalDomain().end(), + "Expected to find ", + id, + " in ", + tv, + "'s logical domain ", + tv->getLogicalDomain()); + stream_id_logical_index = std::distance( + tv->getLogicalDomain().begin(), it_stream_id_logical); + } + } + return stream_id_logical_index; +} + // Step 1: Group expressions into stream-parallel regions std::vector groupStreamParallelRegions( hir::HostIrContainer* hic, @@ -73,34 +149,11 @@ std::vector groupStreamParallelRegions( "Stream parallel type not supported for expr ", expr); - // Find the stream axis in the logical (and not loop) domain - auto it_logical_stream_axis = std::find( - output->getLogicalDomain().begin(), - output->getLogicalDomain().end(), - stream_axis); - - // Verify stream axis is not split/merged - NVF_ERROR( - it_logical_stream_axis != output->getLogicalDomain().end(), - "Cannot stream parallelize on a split/merge axis ", - stream_axis); - - // Verify stream axis is an iteration axis (not reduction/broadcast) - NVF_CHECK( - stream_axis->getIterType() == IterType::Iteration || - stream_axis->getIterType() == IterType::Broadcast, - "Stream axis ", - stream_axis, - " should be an iteration or broadcast axis."); + // Validate stream axis + validateStreamAxis(stream_axis, output); // Check if expression can be merged with previous stream for-loop - if (!new_top_level_exprs.empty() && - new_top_level_exprs.back()->isA() && - id_model.idGraph(IdMappingMode::BROADCAST) - .disjointValSets() - .strictAreMapped( - stream_axis, - new_top_level_exprs.back()->as()->iterDomain())) { + if (canMergeWithPreviousForLoop(new_top_level_exprs, stream_axis, id_model)) { // Merge with existing for-loop new_top_level_exprs.back()->as()->body().push_back(expr); } else { @@ -117,7 +170,6 @@ std::vector groupStreamParallelRegions( CircularBufferLoopStage::NotApplicable, 
/*circular_buffer_loop_stage_depth=*/0); for_loop->body().push_back(expr); - // replace the current expr by the for-loop containing it new_top_level_exprs.push_back(for_loop); } } @@ -150,41 +202,9 @@ std::vector processForLoopBodies( // Process input tensors for (auto* input : ir_utils::filterByType(expr->inputs())) { - // Find stream axis index in input tensor - int64_t input_stream_id_logical_index = -1; - for (auto id : input->getLoopDomain()) { - if (id_model.idGraph(IdMappingMode::BROADCAST) - .disjointValSets() - .strictAreMapped(for_loop->iterDomain(), id)) { - // Verify only one stream axis exists - NVF_CHECK( - input_stream_id_logical_index == -1, - "Expected at most one axis mapping to the stream axis ", - for_loop->iterDomain(), - " in the tensor ", - input, - " loop's domain ", - input->getLoopDomain()); - - // Find stream axis in logical domain - auto it_input_stream_id_logical = std::find( - input->getLogicalDomain().begin(), - input->getLogicalDomain().end(), - id); - NVF_CHECK( - it_input_stream_id_logical != input->getLogicalDomain().end(), - "Expected to find ", - id, - " in ", - input, - "'s logical domain ", - input->getLogicalDomain()); - input_stream_id_logical_index = std::distance( - input->getLogicalDomain().begin(), it_input_stream_id_logical); - } - } + int64_t input_stream_id_logical_index = findStreamAxisIndex( + input, for_loop->iterDomain(), id_model); - // Skip if no stream axis found if (input_stream_id_logical_index == -1) { continue; } @@ -214,42 +234,9 @@ std::vector processForLoopBodies( // Process output tensors for (auto* output : ir_utils::filterByType(expr->outputs())) { - // Find stream axis index in output tensor - int64_t output_stream_id_logical_index = -1; - for (auto id : output->getLoopDomain()) { - if (id_model.idGraph(IdMappingMode::BROADCAST) - .disjointValSets() - .strictAreMapped(for_loop->iterDomain(), id)) { - // Verify only one stream axis exists - NVF_CHECK( - output_stream_id_logical_index == -1, - 
"Expected at most one axis mapping to the stream axis ", - for_loop->iterDomain(), - " in the tensor ", - output, - " loop's domain ", - output->getLoopDomain()); - - // Find stream axis in logical domain - auto it_output_stream_id_logical = std::find( - output->getLogicalDomain().begin(), - output->getLogicalDomain().end(), - id); - NVF_CHECK( - it_output_stream_id_logical != output->getLogicalDomain().end(), - "Expected to find ", - id, - " in ", - output, - "'s logical domain ", - output->getLogicalDomain()); - output_stream_id_logical_index = std::distance( - output->getLogicalDomain().begin(), - it_output_stream_id_logical); - } - } + int64_t output_stream_id_logical_index = findStreamAxisIndex( + output, for_loop->iterDomain(), id_model); - // Skip if no stream axis found if (output_stream_id_logical_index == -1) { continue; } @@ -276,14 +263,14 @@ std::vector processForLoopBodies( if (running_output == output) { // Create alias for the sliced output TensorView* output_j_alias = - ops::newValLike( - output_j, output_j->dtype(), /*keep_reduction_axis=*/true) + ops::newValLike(output_j, output_j->dtype(), true) ->as(); hic->markAlias(output_j, output_j_alias); *it_running_expr = ir_utils::transferDefinitionToNewOutputs( running_expr, {output_j_alias}); } } + } } new_loop_body.push_back(*it_expr); @@ -353,6 +340,8 @@ std::vector addStreamManagement(std::vector top_level_exprs) { return new_top_level_exprs; } +} // anonymous namespace + // StreamParallelType pass implementation. // This pass handles stream parallelization of operations in a fusion. 
// It works by: @@ -407,3 +396,4 @@ void StreamParallelType::runPass(Fusion* fusion) { } } // namespace nvfuser::preseg_passes + From 7ae7c52f7340c85458b8b48d3d971532e183e36b Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 18 Apr 2025 05:38:36 -0700 Subject: [PATCH 26/38] improve comments clarity --- csrc/host_ir/pass/stream_parallel_type.cpp | 65 ++++++++++++++-------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp index d40c9cff147..dd19f4f6b5d 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -22,6 +22,7 @@ namespace nvfuser::preseg_passes { namespace { +// Finds the stream axis in a tensor's domain. There should be at most one stream axis. IterDomain* getStreamAxis(const std::vector& domain) { IterDomain* ret = nullptr; for (auto id : domain) { @@ -38,6 +39,7 @@ IterDomain* getStreamAxis(const std::vector& domain) { return ret; } +// Validates that a stream axis is valid in a tensor void validateStreamAxis(IterDomain* stream_axis, const TensorView* tv) { // Find the stream axis in the logical domain auto it_logical_stream_axis = std::find( @@ -60,12 +62,14 @@ void validateStreamAxis(IterDomain* stream_axis, const TensorView* tv) { " should be an iteration or broadcast axis."); } +// Checks if two iteration domains are mapped in the ID model bool areIdsMapped(const IdModel& id_model, IterDomain* id1, IterDomain* id2) { return id_model.idGraph(IdMappingMode::BROADCAST) .disjointValSets() .strictAreMapped(id1, id2); } +// Determines if a stream-parallel for-loop can be merged with the previous one bool canMergeWithPreviousForLoop( const std::vector& new_top_level_exprs, IterDomain* stream_axis, @@ -78,6 +82,7 @@ bool canMergeWithPreviousForLoop( new_top_level_exprs.back()->as()->iterDomain()); } +// Finds where a stream axis appears in a tensor's logical domain int64_t findStreamAxisIndex( const 
TensorView* tv, IterDomain* stream_axis, @@ -121,6 +126,7 @@ std::vector groupStreamParallelRegions( const IdModel& id_model) { std::vector new_top_level_exprs; + // Process each top-level expression for (auto expr : hic->topLevelExprs()) { // Skip expressions with no outputs if (expr->outputs().size() == 0) { @@ -128,7 +134,7 @@ std::vector groupStreamParallelRegions( continue; } - // Verify single output constraint + // Each expression should have exactly one output NVF_CHECK( expr->outputs().size() == 1, "Each expr should have at most one output."); @@ -137,13 +143,13 @@ std::vector groupStreamParallelRegions( TensorView* output = expr->output(0)->as(); IterDomain* stream_axis = getStreamAxis(output->getLoopDomain()); - // If no stream axis, keep expression as is + // If no stream axis found, keep the expression as is if (stream_axis == nullptr) { new_top_level_exprs.push_back(expr); continue; } - // Verify expression can be handled as a standalone host operation + // Verify that the expression can be handled as a standalone host operation NVF_ERROR( HostIrLower::isLowerableAsStandaloneHostOp(expr), "Stream parallel type not supported for expr ", @@ -152,12 +158,12 @@ std::vector groupStreamParallelRegions( // Validate stream axis validateStreamAxis(stream_axis, output); - // Check if expression can be merged with previous stream for-loop + // Check if we can merge this expression with the previous for-loop if (canMergeWithPreviousForLoop(new_top_level_exprs, stream_axis, id_model)) { - // Merge with existing for-loop + // Merge with existing for-loop by adding the expression to its body new_top_level_exprs.back()->as()->body().push_back(expr); } else { - // Create new for-loop for stream parallelization + // Create a new for-loop for stream parallelization auto* for_loop = IrBuilder::create( stream_axis, /*index=*/NamedScalar::getParallelIndex(ParallelType::Stream), @@ -169,6 +175,7 @@ std::vector groupStreamParallelRegions( /*unroll_required=*/false, 
CircularBufferLoopStage::NotApplicable, /*circular_buffer_loop_stage_depth=*/0); + // Add the expression to the new for-loop's body for_loop->body().push_back(expr); new_top_level_exprs.push_back(for_loop); } @@ -184,7 +191,9 @@ std::vector processForLoopBodies( std::vector top_level_exprs) { std::vector new_top_level_exprs; + // Process each top-level expression for (auto top_level_expr : top_level_exprs) { + // Skip non-for-loop expressions if (!top_level_expr->isA()) { new_top_level_exprs.push_back(top_level_expr); continue; @@ -192,24 +201,26 @@ std::vector processForLoopBodies( auto* for_loop = top_level_expr->as(); std::vector new_loop_body; + std::vector current_loop_body = for_loop->body().exprs(); // Process each expression in the loop body - std::vector current_loop_body = for_loop->body().exprs(); for (auto it_expr = current_loop_body.begin(); it_expr != current_loop_body.end(); ++it_expr) { Expr* expr = *it_expr; - // Process input tensors + // Process input tensors that might have stream axes for (auto* input : ir_utils::filterByType(expr->inputs())) { + // Find if this input has a stream axis int64_t input_stream_id_logical_index = findStreamAxisIndex( input, for_loop->iterDomain(), id_model); + // Skip if no stream axis found if (input_stream_id_logical_index == -1) { continue; } - // Create sliced tensor for current stream iteration + // Create a sliced version of the input tensor for this stream iterdomain TensorView* input_j = select( input, input_stream_id_logical_index, @@ -217,7 +228,7 @@ std::vector processForLoopBodies( /*keep_reduction_axis=*/true); new_loop_body.push_back(input_j->definition()); - // Update all expressions using this input + // Update all expressions that use this input to use the sliced version for (auto it_running_expr = current_loop_body.begin(); it_running_expr != current_loop_body.end(); ++it_running_expr) { @@ -232,28 +243,30 @@ std::vector processForLoopBodies( } } - // Process output tensors + // Process output 
tensors that might have stream axes for (auto* output : ir_utils::filterByType(expr->outputs())) { + // Find if this output has a stream axis int64_t output_stream_id_logical_index = findStreamAxisIndex( output, for_loop->iterDomain(), id_model); + // Skip if no stream axis found if (output_stream_id_logical_index == -1) { continue; } - // Create sliced tensor for current stream iteration + // Create a sliced version of the output tensor for this stream axis TensorView* output_j = select( output, output_stream_id_logical_index, for_loop->index(), /*keep_reduction_axis=*/true); - // Allocate memory for the output tensor + // Allocate memory for the output tensor, and place the allocation IR before the for-loop, at the top level new_top_level_exprs.push_back( IrBuilder::create(output, MemoryType::Global)); new_loop_body.push_back(output_j->definition()); - // Update all expressions using this output + // Update all expressions that use this output to use the sliced version for (auto it_running_expr = current_loop_body.begin(); it_running_expr != current_loop_body.end(); ++it_running_expr) { @@ -261,7 +274,8 @@ std::vector processForLoopBodies( for (auto* running_output : ir_utils::filterByType(running_expr->outputs())) { if (running_output == output) { - // Create alias for the sliced output + // Create an alias for the sliced output to maintain the original tensor's properties + // Alias is needed here to avoid that transferDefinitionToNewOutputs throws. Indeed, HIC does not make the SSA assumption, but the util functions we use (such as transferDefinitionToNewOutputs) do, therefore we need to create an alias for the sliced output to not create loops in the dag. 
TensorView* output_j_alias = ops::newValLike(output_j, output_j->dtype(), true) ->as(); @@ -270,13 +284,14 @@ std::vector processForLoopBodies( running_expr, {output_j_alias}); } } - } } + + // Add the original expression to the new loop body new_loop_body.push_back(*it_expr); } - // Update for-loop body with processed expressions + // Update the for-loop body with all the processed expressions for_loop->body().clear(); for (auto* expr : new_loop_body) { for_loop->body().push_back(expr); @@ -291,20 +306,23 @@ std::vector processForLoopBodies( std::vector addStreamManagement(std::vector top_level_exprs) { std::vector new_top_level_exprs; + // Process each top-level expression for (auto* top_level_expr : top_level_exprs) { + // Skip non-for-loop expressions if (!top_level_expr->isA()) { new_top_level_exprs.push_back(top_level_expr); continue; } + auto* for_loop = top_level_expr->as(); std::vector new_loop_body; - // Get current stream for later synchronization + // Get the current stream before entering the loop auto* get_current_stream = IrBuilder::create(); hir::Stream* original_stream = get_current_stream->stream(); new_loop_body.push_back(get_current_stream); - // Set up stream for current iteration + // Set up a new stream for this iteration based on the loop index auto* number_of_streams = IrBuilder::create("numberOfStreams", DataType::Int); auto* stream_index = mod(for_loop->index(), number_of_streams); @@ -312,24 +330,24 @@ std::vector addStreamManagement(std::vector top_level_exprs) { auto* set_stream = IrBuilder::create(stream); new_loop_body.push_back(set_stream); - // Synchronize with original stream + // Synchronize with the original stream before starting computation auto* initial_sync_stream = IrBuilder::create(original_stream); new_loop_body.push_back(initial_sync_stream); - // Add the actual computation expressions + // Add all the expressions to the loop body for (auto* expr : for_loop->body().exprs()) { new_loop_body.push_back(expr); } - // Restore 
original stream and synchronize + // Restore the original stream and synchronize with the iteration's stream auto* set_back_original_stream = IrBuilder::create(original_stream); new_loop_body.push_back(set_back_original_stream); auto* sync_stream = IrBuilder::create(stream); new_loop_body.push_back(sync_stream); - // Update for-loop body with stream management + // Update the for-loop body with the new expressions for_loop->body().clear(); for (auto* expr : new_loop_body) { for_loop->body().push_back(expr); @@ -396,4 +414,3 @@ void StreamParallelType::runPass(Fusion* fusion) { } } // namespace nvfuser::preseg_passes - From 6dd673f4b39f0b9db7809be9110d6f998da1bd8e Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 18 Apr 2025 05:49:22 -0700 Subject: [PATCH 27/38] more comments --- csrc/host_ir/pass/stream_parallel_type.cpp | 42 +++++++++++++--------- csrc/ops/indexing.h | 3 ++ 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp index dd19f4f6b5d..63ebc9fc42c 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -22,7 +22,8 @@ namespace nvfuser::preseg_passes { namespace { -// Finds the stream axis in a tensor's domain. There should be at most one stream axis. +// Finds the stream axis in a tensor's domain. There should be at most one +// stream axis. 
IterDomain* getStreamAxis(const std::vector& domain) { IterDomain* ret = nullptr; for (auto id : domain) { @@ -102,9 +103,7 @@ int64_t findStreamAxisIndex( // Find stream axis in logical domain auto it_stream_id_logical = std::find( - tv->getLogicalDomain().begin(), - tv->getLogicalDomain().end(), - id); + tv->getLogicalDomain().begin(), tv->getLogicalDomain().end(), id); NVF_CHECK( it_stream_id_logical != tv->getLogicalDomain().end(), "Expected to find ", @@ -113,8 +112,8 @@ int64_t findStreamAxisIndex( tv, "'s logical domain ", tv->getLogicalDomain()); - stream_id_logical_index = std::distance( - tv->getLogicalDomain().begin(), it_stream_id_logical); + stream_id_logical_index = + std::distance(tv->getLogicalDomain().begin(), it_stream_id_logical); } } return stream_id_logical_index; @@ -159,7 +158,8 @@ std::vector groupStreamParallelRegions( validateStreamAxis(stream_axis, output); // Check if we can merge this expression with the previous for-loop - if (canMergeWithPreviousForLoop(new_top_level_exprs, stream_axis, id_model)) { + if (canMergeWithPreviousForLoop( + new_top_level_exprs, stream_axis, id_model)) { // Merge with existing for-loop by adding the expression to its body new_top_level_exprs.back()->as()->body().push_back(expr); } else { @@ -212,15 +212,16 @@ std::vector processForLoopBodies( // Process input tensors that might have stream axes for (auto* input : ir_utils::filterByType(expr->inputs())) { // Find if this input has a stream axis - int64_t input_stream_id_logical_index = findStreamAxisIndex( - input, for_loop->iterDomain(), id_model); + int64_t input_stream_id_logical_index = + findStreamAxisIndex(input, for_loop->iterDomain(), id_model); // Skip if no stream axis found if (input_stream_id_logical_index == -1) { continue; } - // Create a sliced version of the input tensor for this stream iterdomain + // Create a sliced version of the input tensor for this stream + // iterdomain TensorView* input_j = select( input, 
input_stream_id_logical_index, @@ -246,8 +247,8 @@ std::vector processForLoopBodies( // Process output tensors that might have stream axes for (auto* output : ir_utils::filterByType(expr->outputs())) { // Find if this output has a stream axis - int64_t output_stream_id_logical_index = findStreamAxisIndex( - output, for_loop->iterDomain(), id_model); + int64_t output_stream_id_logical_index = + findStreamAxisIndex(output, for_loop->iterDomain(), id_model); // Skip if no stream axis found if (output_stream_id_logical_index == -1) { @@ -261,7 +262,8 @@ std::vector processForLoopBodies( for_loop->index(), /*keep_reduction_axis=*/true); - // Allocate memory for the output tensor, and place the allocation IR before the for-loop, at the top level + // Allocate memory for the output tensor, and place the allocation IR + // before the for-loop, at the top level new_top_level_exprs.push_back( IrBuilder::create(output, MemoryType::Global)); new_loop_body.push_back(output_j->definition()); @@ -274,8 +276,12 @@ std::vector processForLoopBodies( for (auto* running_output : ir_utils::filterByType(running_expr->outputs())) { if (running_output == output) { - // Create an alias for the sliced output to maintain the original tensor's properties - // Alias is needed here to avoid that transferDefinitionToNewOutputs throws. Indeed, HIC does not make the SSA assumption, but the util functions we use (such as transferDefinitionToNewOutputs) do, therefore we need to create an alias for the sliced output to not create loops in the dag. + // Create an alias for the sliced output to maintain the original + // tensor's properties Alias is needed here to avoid that + // transferDefinitionToNewOutputs throws. Indeed, HIC does not + // make the SSA assumption, but the util functions we use (such as + // transferDefinitionToNewOutputs) do, therefore we need to create + // an alias for the sliced output to not create loops in the dag. 
TensorView* output_j_alias = ops::newValLike(output_j, output_j->dtype(), true) ->as(); @@ -401,10 +407,12 @@ void StreamParallelType::runPass(Fusion* fusion) { id_model.buildBroadcastGraph(); // Step 1: Group expressions into stream-parallel regions - std::vector top_level_exprs = groupStreamParallelRegions(hic, id_model); + std::vector top_level_exprs = + groupStreamParallelRegions(hic, id_model); // Step 2: Process for-loop bodies by slicing tensors - top_level_exprs = processForLoopBodies(hic, id_model, std::move(top_level_exprs)); + top_level_exprs = + processForLoopBodies(hic, id_model, std::move(top_level_exprs)); // Step 3: Add stream management and synchronization top_level_exprs = addStreamManagement(std::move(top_level_exprs)); diff --git a/csrc/ops/indexing.h b/csrc/ops/indexing.h index 7a219c534a3..5e0410d95d5 100644 --- a/csrc/ops/indexing.h +++ b/csrc/ops/indexing.h @@ -15,6 +15,9 @@ namespace nvfuser { +// When keep_reduction_axis is true, all reduction axis are kept in the +// SelectOp's consumer. This is used in the context of HostIr where SelectOp is +// used to index into Stream-parallelized axes. 
NVF_API TensorView* select( TensorView* tv, int64_t dim, From db90ef0749725656a4ec3d668668aed3288b2a3d Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 23 Apr 2025 14:17:26 -0700 Subject: [PATCH 28/38] add HirAliasSelect --- csrc/dispatch.h | 3 ++- csrc/host_ir/executor.cpp | 9 ++++++++ csrc/host_ir/executor.h | 1 + csrc/host_ir/host_ir.cpp | 45 +++++++++++++++++++++++++++++++++++++ csrc/host_ir/host_ir.h | 43 +++++++++++++++++++++++++++++++++++ tests/cpp/test_host_irs.cpp | 38 +++++++++++++++++++++++++++++++ 6 files changed, 138 insertions(+), 1 deletion(-) diff --git a/csrc/dispatch.h b/csrc/dispatch.h index 218ccd8267a..b9874860bd6 100644 --- a/csrc/dispatch.h +++ b/csrc/dispatch.h @@ -158,7 +158,8 @@ class Val; f(Synchronize); \ f(StartCoalescing); \ f(EndCoalescing); \ - f(ShareMemHandles); + f(ShareMemHandles); \ + f(HirAliasSelect); // Forward declarations for all Val and Expr types diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 12cf344e549..66ac0ef1d64 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -751,6 +751,15 @@ void HostIrEvaluator::handle(ReductionOp* reduction_op) { } } +void HostIrEvaluator::handle(HirAliasSelect* hir_alias_select) { + auto index = + expr_evaluator_.evaluate(hir_alias_select->index()).as(); + auto input = getKnownConcreteValue(hir_alias_select->in()->as()) + .as(); + int64_t axis = hir_alias_select->axis(); + bind(hir_alias_select->out(), input.select(axis, index)); +} + void HostIrEvaluator::unhandled(Statement* stmt) { NVF_ERROR(stmt->isA(), stmt, " must be an Expr"); auto* expr = stmt->as(); diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index 89ac5119681..3f147b7801b 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -142,6 +142,7 @@ class HostIrEvaluator final : public OptOutDispatch { void handle(BinaryOp* binary_op) override; void handle(ReductionOp* reduction_op) override; void handle(ShareMemHandles* share_mem_handles) override; + 
void handle(HirAliasSelect* hir_alias_select) override; void unhandled(Statement* stmt) override; c10::cuda::CUDAStream getCUDAStream(Stream* stream); diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp index 9e1386d0d3d..bf3d5cef9eb 100644 --- a/csrc/host_ir/host_ir.cpp +++ b/csrc/host_ir/host_ir.cpp @@ -355,6 +355,51 @@ std::string ShareMemHandles::toInlineString(int indent_size) const { NVF_THROW("Cannot be printed inline"); } +HirAliasSelect::HirAliasSelect( + IrBuilderPasskey passkey, + TensorView* in, + TensorView* out, + int64_t axis, + Val* index) + : Expr(passkey, {in, index}, {}, {}) { + NVF_ERROR(passkey.ir_container_ != nullptr); + NVF_ERROR( + passkey.ir_container_->isA(), + this, + "must be registered in a HostIrContainer"); + NVF_ERROR( + static_cast(in->getLogicalDomain().size()) > axis, + "Select axis ", + axis, + " is out of bounds for tensor ", + in->toString(), + " with ", + in->getLogicalDomain().size(), + " dimensions"); + // "out" is not added as an output because the current op doesn't "define" it, + // but rather sets its allocation. Since "out" will be used in another + // producing expression, this avoids unnecessary cyclic dependencies. This + // ressembles how kir::Allocate treats its allocated TensorView. 
+ addAttribute(out); + addDataAttribute(axis); +} + +NVFUSER_DEFINE_CLONE_AND_CREATE(HirAliasSelect) + +std::string HirAliasSelect::toString(int indent_size) const { + std::stringstream ss; + indent(ss, indent_size) << out()->toString() << "\n"; + indent_size++; + indent(ss, indent_size) << " = HirAliasSelect( " << in()->toString() + << ", axis = " << in()->getLogicalDomain().at(axis()) + << ", index = " << index()->toString() << " )\n"; + return ss.str(); +} + +std::string HirAliasSelect::toInlineString(int indent_size) const { + NVF_THROW("Cannot be printed inline"); +} + } // namespace hir } // namespace nvfuser diff --git a/csrc/host_ir/host_ir.h b/csrc/host_ir/host_ir.h index bad3a6ef722..d267d23ab1f 100644 --- a/csrc/host_ir/host_ir.h +++ b/csrc/host_ir/host_ir.h @@ -351,6 +351,49 @@ class ShareMemHandles : public Expr { } }; +// This op mimicks the semantics of SelectOp but is used in HIR non-SSA context +// to index into a TensorView, returning an alias "slice" of the original +// TensorView. 
+class HirAliasSelect : public Expr { + public: + using Expr::Expr; + HirAliasSelect( + IrBuilderPasskey passkey, + TensorView* in, + TensorView* out, + int64_t axis, + Val* index); + + HirAliasSelect(const HirAliasSelect& other) = delete; + HirAliasSelect& operator=(const HirAliasSelect& other) = delete; + HirAliasSelect(HirAliasSelect&& other) = delete; + HirAliasSelect& operator=(HirAliasSelect&& other) = delete; + + NVFUSER_DECLARE_CLONE_AND_CREATE + + std::string toString(int indent_size = 0) const override; + std::string toInlineString(int indent_size = 0) const override; + const char* getOpString() const override { + return "hir::HirAliasSelect"; + } + + TensorView* in() const { + return inputs().at(0)->as(); + } + + TensorView* out() const { + return attributeVal(0)->as(); + } + + int64_t axis() const { + return attribute(1); + } + + Val* index() const { + return inputs().at(1); + } +}; + } // namespace hir } // namespace nvfuser diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index 633ebc83504..eb1291de57d 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -1487,6 +1487,44 @@ TEST_F(HirReductionOpTest, NonPreAllocatedOutputs) { << "Expected output: " << expected_out; } +using HirAliasSelectHostIrTest = NVFuserTest; + +TEST_F(HirAliasSelectHostIrTest, SelectingTensor) { + constexpr int64_t ndims = 2; + constexpr int64_t dim = 1; + constexpr int64_t index = 3; + const std::vector input_sizes = {32, 32}; + + ASSERT_LT(dim, ndims); + ASSERT_EQ(input_sizes.size(), ndims); + ASSERT_LT(index, input_sizes.at(dim)); + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + TensorView* in = makeContigTensor(ndims); + TensorView* out = makeContigTensor(ndims - 1); + auto* index_val = IrBuilder::create(index, DataType::Index); + auto* select_op = IrBuilder::create(in, out, dim, index_val); + + hic->addInput(in); + hic->addOutput(out); + hic->pushBackTopLevelExprs(select_op); + + HostIrEvaluator 
hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat); + auto in_aten = at::randn(input_sizes, options); + std::unordered_map concrete_input_buffers = { + {in, in_aten}}; + + auto out_aten = hie.runWithInput(concrete_input_buffers)[0].as(); + + // validate + auto ref_out = in_aten.select(dim, index); + EXPECT_TRUE(ref_out.equal(out_aten)); +} + } // namespace hir } // namespace nvfuser From e32653a383c8e1689d8a1a8d5dbf5fd1d409ea92 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 23 Apr 2025 14:18:05 -0700 Subject: [PATCH 29/38] replace SelectOp by HirAliasSelect in stream lowering --- csrc/host_ir/container.cpp | 10 ++-- csrc/host_ir/pass/stream_parallel_type.cpp | 56 ++++++++++++---------- csrc/ops/indexing.cpp | 10 +--- csrc/ops/indexing.h | 9 +--- csrc/ops/utils.cpp | 27 ++++------- csrc/ops/utils.h | 14 ++---- 6 files changed, 52 insertions(+), 74 deletions(-) diff --git a/csrc/host_ir/container.cpp b/csrc/host_ir/container.cpp index 83e668770fc..9fdcfa376a6 100644 --- a/csrc/host_ir/container.cpp +++ b/csrc/host_ir/container.cpp @@ -35,11 +35,13 @@ Stream* HostIrContainer::getDefaultStream() { std::ostream& HostIrContainer::print(std::ostream& os) const { IrMathPrinter op_exprs(os); op_exprs.handle(this); - os << "Aliases:{"; - for (const auto& alias : alias_) { - os << "\n " << alias.first << " -> " << alias.second; + if (alias_.size() > 0) { + os << "Aliases:{"; + for (const auto& alias : alias_) { + os << "\n " << alias.first << " -> " << alias.second; + } + os << "\n}\n"; } - os << "\n}\n"; return os; } diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp index 63ebc9fc42c..3d63290ef17 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -119,6 +119,30 @@ int64_t findStreamAxisIndex( return stream_id_logical_index; } +// Helper function to create a sliced version of a tensor for stream +// parallelization 
+hir::HirAliasSelect* createSlicedTensor( + TensorView* tensor, + int64_t stream_axis_index, + Val* index) { + auto dom = tensor->getLogicalDomain(); + + std::vector new_root; + new_root.reserve(dom.size() - 1); + + for (auto i : arange((int64_t)dom.size())) { + if (i != stream_axis_index) { + new_root.emplace_back(dom[i]->cloneWithoutRFactor()); + } + } + + auto td = IrBuilder::create( + new_root, TensorDomain::getContiguityFilledWith(new_root, true)); + auto out = IrBuilder::create(td, *tensor->getDataType()); + return IrBuilder::create( + tensor, out, stream_axis_index, index); +} + // Step 1: Group expressions into stream-parallel regions std::vector groupStreamParallelRegions( hir::HostIrContainer* hic, @@ -222,12 +246,9 @@ std::vector processForLoopBodies( // Create a sliced version of the input tensor for this stream // iterdomain - TensorView* input_j = select( - input, - input_stream_id_logical_index, - for_loop->index(), - /*keep_reduction_axis=*/true); - new_loop_body.push_back(input_j->definition()); + hir::HirAliasSelect* input_slicing = createSlicedTensor( + input, input_stream_id_logical_index, for_loop->index()); + new_loop_body.push_back(input_slicing); // Update all expressions that use this input to use the sliced version for (auto it_running_expr = current_loop_body.begin(); @@ -238,7 +259,7 @@ std::vector processForLoopBodies( ir_utils::filterByType(running_expr->inputs())) { if (running_input == input) { *it_running_expr = ir_utils::replaceValInExprInputs( - running_expr, input, input_j); + running_expr, input, input_slicing->out()); } } } @@ -256,17 +277,14 @@ std::vector processForLoopBodies( } // Create a sliced version of the output tensor for this stream axis - TensorView* output_j = select( - output, - output_stream_id_logical_index, - for_loop->index(), - /*keep_reduction_axis=*/true); + hir::HirAliasSelect* output_slicing = createSlicedTensor( + output, output_stream_id_logical_index, for_loop->index()); // Allocate memory for the 
output tensor, and place the allocation IR // before the for-loop, at the top level new_top_level_exprs.push_back( IrBuilder::create(output, MemoryType::Global)); - new_loop_body.push_back(output_j->definition()); + new_loop_body.push_back(output_slicing); // Update all expressions that use this output to use the sliced version for (auto it_running_expr = current_loop_body.begin(); @@ -276,18 +294,8 @@ std::vector processForLoopBodies( for (auto* running_output : ir_utils::filterByType(running_expr->outputs())) { if (running_output == output) { - // Create an alias for the sliced output to maintain the original - // tensor's properties Alias is needed here to avoid that - // transferDefinitionToNewOutputs throws. Indeed, HIC does not - // make the SSA assumption, but the util functions we use (such as - // transferDefinitionToNewOutputs) do, therefore we need to create - // an alias for the sliced output to not create loops in the dag. - TensorView* output_j_alias = - ops::newValLike(output_j, output_j->dtype(), true) - ->as(); - hic->markAlias(output_j, output_j_alias); *it_running_expr = ir_utils::transferDefinitionToNewOutputs( - running_expr, {output_j_alias}); + running_expr, {output_slicing->out()}); } } } diff --git a/csrc/ops/indexing.cpp b/csrc/ops/indexing.cpp index 80c0ff84b85..5ff75065ff2 100644 --- a/csrc/ops/indexing.cpp +++ b/csrc/ops/indexing.cpp @@ -19,14 +19,8 @@ namespace nvfuser { -TensorView* select( - TensorView* tv, - int64_t dim, - Val* index, - bool keep_reduction_axis) { - auto dom = keep_reduction_axis - ? 
tv->getLogicalDomain() - : TensorDomain::noReductions(tv->getLogicalDomain()); +TensorView* select(TensorView* tv, int64_t dim, Val* index) { + auto dom = TensorDomain::noReductions(tv->getLogicalDomain()); NVF_CHECK(!dom.empty(), "select can not be applied to 0d tensor."); std::vector new_root; diff --git a/csrc/ops/indexing.h b/csrc/ops/indexing.h index 5e0410d95d5..c8152c33f82 100644 --- a/csrc/ops/indexing.h +++ b/csrc/ops/indexing.h @@ -15,14 +15,7 @@ namespace nvfuser { -// When keep_reduction_axis is true, all reduction axis are kept in the -// SelectOp's consumer. This is used in the context of HostIr where SelectOp is -// used to index into Stream-parallelized axes. -NVF_API TensorView* select( - TensorView* tv, - int64_t dim, - Val* index, - bool keep_reduction_axis = false); +NVF_API TensorView* select(TensorView* tv, int64_t dim, Val* index); // torch.index_select NVF_API TensorView* indexSelect( diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp index 5d32c22e212..8d3870d1a84 100644 --- a/csrc/ops/utils.cpp +++ b/csrc/ops/utils.cpp @@ -432,9 +432,7 @@ IterDomain* newOutputIterDomain( #pragma GCC diagnostic pop #endif -std::vector newOutputDomain( - const std::vector& vals, - bool keep_reduction_axis) { +std::vector newOutputDomain(const std::vector& vals) { std::vector tvs; for (auto val : vals) { if (auto* tv = dynamic_cast(val)) { @@ -445,20 +443,14 @@ std::vector newOutputDomain( !tvs.empty(), "Tried to create new output TensorView but received empty list."); - auto getLogicalDomain = - [keep_reduction_axis](TensorView* tv) -> std::vector { - return keep_reduction_axis - ? 
tv->getLogicalDomain() - : TensorDomain::noReductions(tv->getLogicalDomain()); - }; - - std::vector out_domain(getLogicalDomain(tvs[0]).size(), nullptr); + std::vector out_domain( + TensorDomain::noReductions(tvs[0]->getLogicalDomain()).size(), nullptr); for (const auto dim_i : arange(out_domain.size())) { std::vector input_ids; input_ids.reserve(tvs.size()); for (auto* tv : tvs) { - auto dom = getLogicalDomain(tv); + auto dom = TensorDomain::noReductions(tv->getLogicalDomain()); input_ids.emplace_back(dom[dim_i]); } out_domain[dim_i] = newOutputIterDomain(input_ids); @@ -466,11 +458,8 @@ std::vector newOutputDomain( return out_domain; } -TensorView* newOutputTV( - const std::vector& vals, - DataType dtype, - bool keep_reduction_axis) { - auto out_domain = newOutputDomain(vals, keep_reduction_axis); +TensorView* newOutputTV(const std::vector& vals, DataType dtype) { + auto out_domain = newOutputDomain(vals); auto* new_out = IrBuilder::create( IrBuilder::create( out_domain, TensorDomain::getContiguityFilledWith(out_domain, true)), @@ -513,12 +502,12 @@ std::vector maybeBroadcast(const std::vector& vals) { return out_vals; } -Val* newValLike(Val* val, DataType dtype, bool keep_reduction_axis) { +Val* newValLike(Val* val, DataType dtype) { NVF_CHECK( dtype != DataType::Null, "Invalid datatype provided for new value."); if (val->isA()) { - return newOutputTV({val}, dtype, keep_reduction_axis); + return newOutputTV({val}, dtype); } return newScalar(ValType::Others, dtype); diff --git a/csrc/ops/utils.h b/csrc/ops/utils.h index 1a2abda03fc..94d6391cf45 100644 --- a/csrc/ops/utils.h +++ b/csrc/ops/utils.h @@ -99,21 +99,13 @@ IterDomain* newOutputIterDomain( // output tensorview, e.g., for BinaryOp. `vals` can contain scalars, e.g, when // creating the output TensorView for `tv0+scalar`. This is for convenience and // scalars will be ignored. 
-std::vector newOutputDomain( - const std::vector& vals, - bool keep_reduction_axis = false); +std::vector newOutputDomain(const std::vector& vals); -TensorView* newOutputTV( - const std::vector& vals, - DataType dtype, - bool keep_reduction_axis = false); +TensorView* newOutputTV(const std::vector& vals, DataType dtype); std::vector maybeBroadcast(const std::vector& vals); -NVF_API Val* newValLike( - Val* val, - DataType dtype, - bool keep_reduction_axis = false); +NVF_API Val* newValLike(Val* val, DataType dtype); // returns the minimum init value for reduction: // -inf for floating type; From a50b53c90e744cf469779dcccdc613c6af68958f Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 23 Apr 2025 14:48:36 -0700 Subject: [PATCH 30/38] add cache for tensor slicing --- csrc/host_ir/pass/stream_parallel_type.cpp | 92 ++++++++++++++++------ 1 file changed, 68 insertions(+), 24 deletions(-) diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp index 3d63290ef17..6999b0c4ca5 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -119,29 +119,67 @@ int64_t findStreamAxisIndex( return stream_id_logical_index; } -// Helper function to create a sliced version of a tensor for stream -// parallelization -hir::HirAliasSelect* createSlicedTensor( - TensorView* tensor, - int64_t stream_axis_index, - Val* index) { - auto dom = tensor->getLogicalDomain(); - - std::vector new_root; - new_root.reserve(dom.size() - 1); - - for (auto i : arange((int64_t)dom.size())) { - if (i != stream_axis_index) { - new_root.emplace_back(dom[i]->cloneWithoutRFactor()); +// Cache for tensor slicing operations in stream parallelization. +// This cache stores previously created sliced versions of tensors to avoid +// redundant slicing operations. 
A sliced tensor is created by removing a +// specific axis (stream axis) from the tensor's domain and creating a new +// tensor that represents a slice of the original tensor at a given index. +// The cache key is a tuple of (original tensor, axis index to remove, slice +// index). +struct TensorSlicingCache { + // Type aliases + using Key = std::tuple; + + // Custom hash function for the tuple used as cache key + struct Hash { + size_t operator()(const Key& t) const { + auto [tv, idx, val] = t; + return std::hash{}(tv) ^ std::hash{}(idx) ^ + std::hash{}(val); } + }; + + // Map type for storing cached sliced tensors + using Map = std::unordered_map; + + // Get the expr producing the indexed version of a tensor. If the expr already + // exists in the cache, returns the cached version. Otherwise, creates a new + // expr, producing a tensor "selected" on its dimension `stream_axis_index` at + // index `index`. Returns a pair of (expr, is_new) where is_new indicates + // whether the expr was newly created. 
+ std::pair get( + TensorView* tensor, + int64_t stream_axis_index, + Val* index) { + auto key = std::make_tuple(tensor, stream_axis_index, index); + auto it = cache_.find(key); + if (it != cache_.end()) { + return {it->second, false}; + } + + auto dom = tensor->getLogicalDomain(); + std::vector new_root; + new_root.reserve(dom.size() - 1); + + for (auto i : arange((int64_t)dom.size())) { + if (i != stream_axis_index) { + new_root.emplace_back(dom[i]->cloneWithoutRFactor()); + } + } + + auto td = IrBuilder::create( + new_root, TensorDomain::getContiguityFilledWith(new_root, true)); + auto out = IrBuilder::create(td, *tensor->getDataType()); + auto result = IrBuilder::create( + tensor, out, stream_axis_index, index); + + cache_[key] = result; + return {result, true}; } - auto td = IrBuilder::create( - new_root, TensorDomain::getContiguityFilledWith(new_root, true)); - auto out = IrBuilder::create(td, *tensor->getDataType()); - return IrBuilder::create( - tensor, out, stream_axis_index, index); -} + private: + Map cache_; // Storage for cached sliced tensors +}; // Step 1: Group expressions into stream-parallel regions std::vector groupStreamParallelRegions( @@ -214,6 +252,8 @@ std::vector processForLoopBodies( const IdModel& id_model, std::vector top_level_exprs) { std::vector new_top_level_exprs; + // Create a cache for tensor indexing + TensorSlicingCache tensor_slicing_cache; // Process each top-level expression for (auto top_level_expr : top_level_exprs) { @@ -246,9 +286,11 @@ std::vector processForLoopBodies( // Create a sliced version of the input tensor for this stream // iterdomain - hir::HirAliasSelect* input_slicing = createSlicedTensor( + auto [input_slicing, is_new] = tensor_slicing_cache.get( input, input_stream_id_logical_index, for_loop->index()); - new_loop_body.push_back(input_slicing); + if (is_new) { + new_loop_body.push_back(input_slicing); + } // Update all expressions that use this input to use the sliced version for (auto it_running_expr = 
current_loop_body.begin(); @@ -277,14 +319,16 @@ std::vector processForLoopBodies( } // Create a sliced version of the output tensor for this stream axis - hir::HirAliasSelect* output_slicing = createSlicedTensor( + auto [output_slicing, is_new] = tensor_slicing_cache.get( output, output_stream_id_logical_index, for_loop->index()); + if (is_new) { + new_loop_body.push_back(output_slicing); + } // Allocate memory for the output tensor, and place the allocation IR // before the for-loop, at the top level new_top_level_exprs.push_back( IrBuilder::create(output, MemoryType::Global)); - new_loop_body.push_back(output_slicing); // Update all expressions that use this output to use the sliced version for (auto it_running_expr = current_loop_body.begin(); From d01c5a27f64db79e568827e25730d9ab6b84cfa4 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 23 Apr 2025 15:58:29 -0700 Subject: [PATCH 31/38] separate out tensor allocation logic --- csrc/host_ir/pass/stream_parallel_type.cpp | 45 +++++++++++++++++----- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp index 6999b0c4ca5..7532c145ba1 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -246,7 +246,36 @@ std::vector groupStreamParallelRegions( return new_top_level_exprs; } -// Step 2: Process for-loop bodies by slicing tensors +// Helper function to add allocations for tensors that need them +std::vector addTensorAllocations( + std::vector top_level_exprs, + const IdModel& id_model) { + std::vector new_top_level_exprs; + + for (auto* expr : top_level_exprs) { + if (expr->isA()) { + // add allocations for tensors produced in the loop that have a stream axes + auto* for_loop = expr->as(); + for (auto* body_expr : for_loop->body().exprs()) { + for (auto* output : ir_utils::filterByType(body_expr->outputs())) { + if (findStreamAxisIndex(output, for_loop->iterDomain(), 
id_model) != -1) { + new_top_level_exprs.push_back( + IrBuilder::create(output, MemoryType::Global)); + } + } + } + } + new_top_level_exprs.push_back(expr); + } + + // Add all original expressions + new_top_level_exprs.insert( + new_top_level_exprs.end(), top_level_exprs.begin(), top_level_exprs.end()); + + return new_top_level_exprs; +} + +// Step 3: Process for-loop bodies by slicing tensors std::vector processForLoopBodies( hir::HostIrContainer* hic, const IdModel& id_model, @@ -325,11 +354,6 @@ std::vector processForLoopBodies( new_loop_body.push_back(output_slicing); } - // Allocate memory for the output tensor, and place the allocation IR - // before the for-loop, at the top level - new_top_level_exprs.push_back( - IrBuilder::create(output, MemoryType::Global)); - // Update all expressions that use this output to use the sliced version for (auto it_running_expr = current_loop_body.begin(); it_running_expr != current_loop_body.end(); @@ -424,7 +448,7 @@ std::vector addStreamManagement(std::vector top_level_exprs) { // 1. Identifying stream-parallelized axes in tensor operations // 2. Grouping compatible operations into stream-parallel for-loops // 3. Setting up proper stream synchronization and management -// +// 4. 
Adding allocations for tensors that need them // The pass ensures that: // - Input tensors don't have stream axes // - Only one stream axis exists per tensor @@ -462,11 +486,14 @@ void StreamParallelType::runPass(Fusion* fusion) { std::vector top_level_exprs = groupStreamParallelRegions(hic, id_model); - // Step 2: Process for-loop bodies by slicing tensors + // Step 2: Add allocations for tensors that need them + top_level_exprs = addTensorAllocations(std::move(top_level_exprs), id_model); + + // Step 3: Process for-loop bodies by slicing tensors top_level_exprs = processForLoopBodies(hic, id_model, std::move(top_level_exprs)); - // Step 3: Add stream management and synchronization + // Step 4: Add stream management and synchronization top_level_exprs = addStreamManagement(std::move(top_level_exprs)); // Update the container's top-level expressions From 25b7695b2db13c85020845fdca4bb1e91f7e8359 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 23 Apr 2025 16:12:23 -0700 Subject: [PATCH 32/38] minor cleanup --- csrc/host_ir/pass/stream_parallel_type.cpp | 147 ++++++--------------- 1 file changed, 42 insertions(+), 105 deletions(-) diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp index 7532c145ba1..8bea4c3430f 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -183,12 +183,11 @@ struct TensorSlicingCache { // Step 1: Group expressions into stream-parallel regions std::vector groupStreamParallelRegions( - hir::HostIrContainer* hic, + const std::vector& top_level_exprs, const IdModel& id_model) { std::vector new_top_level_exprs; - // Process each top-level expression - for (auto expr : hic->topLevelExprs()) { + for (auto* expr : top_level_exprs) { // Skip expressions with no outputs if (expr->outputs().size() == 0) { new_top_level_exprs.push_back(expr); @@ -229,9 +228,9 @@ std::vector groupStreamParallelRegions( auto* for_loop = IrBuilder::create( stream_axis, 
/*index=*/NamedScalar::getParallelIndex(ParallelType::Stream), - /*start=*/hic->zeroVal(), + /*start=*/FusionGuard::getCurFusion()->zeroVal(), /*stop=*/stream_axis->extent(), - /*step=*/hic->oneVal(), + /*step=*/FusionGuard::getCurFusion()->oneVal(), /*vectorize=*/false, /*vectorize_shift=*/nullptr, /*unroll_required=*/false, @@ -254,11 +253,14 @@ std::vector addTensorAllocations( for (auto* expr : top_level_exprs) { if (expr->isA()) { - // add allocations for tensors produced in the loop that have a stream axes + // add allocations for tensors produced in the loop that have a stream + // axis auto* for_loop = expr->as(); for (auto* body_expr : for_loop->body().exprs()) { - for (auto* output : ir_utils::filterByType(body_expr->outputs())) { - if (findStreamAxisIndex(output, for_loop->iterDomain(), id_model) != -1) { + for (auto* output : + ir_utils::filterByType(body_expr->outputs())) { + if (findStreamAxisIndex(output, for_loop->iterDomain(), id_model) != + -1) { new_top_level_exprs.push_back( IrBuilder::create(output, MemoryType::Global)); } @@ -268,131 +270,68 @@ std::vector groupStreamParallelRegions( new_top_level_exprs.push_back(expr); } - // Add all original expressions - new_top_level_exprs.insert( - new_top_level_exprs.end(), top_level_exprs.begin(), top_level_exprs.end()); - return new_top_level_exprs; } // Step 3: Process for-loop bodies by slicing tensors std::vector processForLoopBodies( - hir::HostIrContainer* hic, - const IdModel& id_model, - std::vector top_level_exprs) { - std::vector new_top_level_exprs; - // Create a cache for tensor indexing + std::vector top_level_exprs, + const IdModel& id_model) { TensorSlicingCache tensor_slicing_cache; - // Process each top-level expression - for (auto top_level_expr : top_level_exprs) { - // Skip non-for-loop expressions - if (!top_level_expr->isA()) { - new_top_level_exprs.push_back(top_level_expr); + for (auto* expr : top_level_exprs) { + if (!expr->isA()) { continue; } - auto* for_loop = top_level_expr->as(); 
+ auto* for_loop = expr->as(); std::vector new_loop_body; - std::vector current_loop_body = for_loop->body().exprs(); - - // Process each expression in the loop body - for (auto it_expr = current_loop_body.begin(); - it_expr != current_loop_body.end(); - ++it_expr) { - Expr* expr = *it_expr; - - // Process input tensors that might have stream axes - for (auto* input : ir_utils::filterByType(expr->inputs())) { - // Find if this input has a stream axis - int64_t input_stream_id_logical_index = - findStreamAxisIndex(input, for_loop->iterDomain(), id_model); - - // Skip if no stream axis found - if (input_stream_id_logical_index == -1) { - continue; - } - // Create a sliced version of the input tensor for this stream - // iterdomain - auto [input_slicing, is_new] = tensor_slicing_cache.get( - input, input_stream_id_logical_index, for_loop->index()); + // Lambda to process a tensor in a for-loop body + auto processTensor = [&](Expr*& expr, TensorView* tensor) { + if (auto stream_idx = + findStreamAxisIndex(tensor, for_loop->iterDomain(), id_model); + stream_idx != -1) { + auto [slicing, is_new] = + tensor_slicing_cache.get(tensor, stream_idx, for_loop->index()); if (is_new) { - new_loop_body.push_back(input_slicing); + new_loop_body.push_back(slicing); } - - // Update all expressions that use this input to use the sliced version - for (auto it_running_expr = current_loop_body.begin(); - it_running_expr != current_loop_body.end(); - ++it_running_expr) { - Expr* running_expr = *it_running_expr; - for (auto* running_input : - ir_utils::filterByType(running_expr->inputs())) { - if (running_input == input) { - *it_running_expr = ir_utils::replaceValInExprInputs( - running_expr, input, input_slicing->out()); - } - } + expr = ir_utils::replaceValInExprInputs(expr, tensor, slicing->out()); + if (expr->outputs().size() > 0 && expr->outputs()[0] == tensor) { + expr = + ir_utils::transferDefinitionToNewOutputs(expr, {slicing->out()}); } } + }; - // Process output tensors that 
might have stream axes - for (auto* output : ir_utils::filterByType(expr->outputs())) { - // Find if this output has a stream axis - int64_t output_stream_id_logical_index = - findStreamAxisIndex(output, for_loop->iterDomain(), id_model); - - // Skip if no stream axis found - if (output_stream_id_logical_index == -1) { - continue; - } - - // Create a sliced version of the output tensor for this stream axis - auto [output_slicing, is_new] = tensor_slicing_cache.get( - output, output_stream_id_logical_index, for_loop->index()); - if (is_new) { - new_loop_body.push_back(output_slicing); - } - - // Update all expressions that use this output to use the sliced version - for (auto it_running_expr = current_loop_body.begin(); - it_running_expr != current_loop_body.end(); - ++it_running_expr) { - Expr* running_expr = *it_running_expr; - for (auto* running_output : - ir_utils::filterByType(running_expr->outputs())) { - if (running_output == output) { - *it_running_expr = ir_utils::transferDefinitionToNewOutputs( - running_expr, {output_slicing->out()}); - } - } - } + for (auto* body_expr : for_loop->body().exprs()) { + for (auto* input : + ir_utils::filterByType(body_expr->inputs())) { + processTensor(body_expr, input); } - - // Add the original expression to the new loop body - new_loop_body.push_back(*it_expr); + for (auto* output : + ir_utils::filterByType(body_expr->outputs())) { + processTensor(body_expr, output); + } + new_loop_body.push_back(body_expr); } - // Update the for-loop body with all the processed expressions for_loop->body().clear(); for (auto* expr : new_loop_body) { for_loop->body().push_back(expr); } - new_top_level_exprs.push_back(top_level_expr); } - return new_top_level_exprs; + return top_level_exprs; } -// Step 3: Add stream management and synchronization +// Step 4: Add stream management and synchronization std::vector addStreamManagement(std::vector top_level_exprs) { - std::vector new_top_level_exprs; - // Process each top-level expression for 
(auto* top_level_expr : top_level_exprs) { // Skip non-for-loop expressions if (!top_level_expr->isA()) { - new_top_level_exprs.push_back(top_level_expr); continue; } @@ -434,10 +373,9 @@ std::vector addStreamManagement(std::vector top_level_exprs) { for (auto* expr : new_loop_body) { for_loop->body().push_back(expr); } - new_top_level_exprs.push_back(top_level_expr); } - return new_top_level_exprs; + return top_level_exprs; } } // anonymous namespace @@ -484,14 +422,13 @@ void StreamParallelType::runPass(Fusion* fusion) { // Step 1: Group expressions into stream-parallel regions std::vector top_level_exprs = - groupStreamParallelRegions(hic, id_model); + groupStreamParallelRegions(hic->topLevelExprs(), id_model); // Step 2: Add allocations for tensors that need them top_level_exprs = addTensorAllocations(std::move(top_level_exprs), id_model); // Step 3: Process for-loop bodies by slicing tensors - top_level_exprs = - processForLoopBodies(hic, id_model, std::move(top_level_exprs)); + top_level_exprs = processForLoopBodies(std::move(top_level_exprs), id_model); // Step 4: Add stream management and synchronization top_level_exprs = addStreamManagement(std::move(top_level_exprs)); From 6b479deca59efed1cc9afae42447756bb260c019 Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 24 Apr 2025 06:19:59 -0700 Subject: [PATCH 33/38] lower as HIR only set without permute --- csrc/host_ir/executor.cpp | 18 ++++++++---------- csrc/host_ir/lower.cpp | 37 +++++++++++++++++++++++++++---------- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 12cf344e549..3a3c0921d2a 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -632,21 +632,19 @@ void HostIrEvaluator::handle(LinearOp* linear) { } void HostIrEvaluator::handle(LoadStoreOp* load_store_op) { + NVF_ERROR( + load_store_op->opType() == LoadStoreOpType::Set, + "LoadStoreOp must be a Set"); NVF_ERROR( load_store_op->out()->isA(), "out 
must be a TensorView"); auto* out_tv = load_store_op->out()->as(); auto in_tensor = getKnownConcreteValue(load_store_op->in()).as(); - // If output has root domain, compute and apply permutation - if (out_tv->hasRoot()) { - auto permutation = ir_utils::computePermutation( - out_tv->getRootDomain(), out_tv->getLogicalDomain()); - NVF_ERROR( - permutation.has_value(), - "The logical domain of a Set.Permute is supposed to be a permutation of the root domain: ", - out_tv->toString()); - in_tensor = in_tensor.permute(*permutation).contiguous(); - } + // If output has root domain, it means that the set op is a permute, which we + // don't support currently + NVF_ERROR( + !out_tv->hasRoot(), "the set op ", load_store_op, " must not be a permute"); + if (!isKnown(load_store_op->out())) { bind(load_store_op->out(), in_tensor); } else { diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 308e1399872..fd14096b190 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -615,16 +615,33 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( } bool HostIrLower::isLowerableAsStandaloneHostOp(Expr* expr) { - return expr->isOneOf< - MatmulOp, - SliceOp, - SelectOp, - LinearOp, - LoadStoreOp, - BinaryOp, - ReductionOp, - Communication, - P2PCommunication>(); + if (expr->isOneOf< + MatmulOp, + SliceOp, + SelectOp, + LinearOp, + BinaryOp, + ReductionOp, + Communication, + P2PCommunication>()) { + return true; + } + + // Lower as standalone op "set" ops, i.e., LoadStoreOp of "Set" type with no + // permute + if (expr->isA()) { + auto* load_store = expr->as(); + if (load_store->opType() == LoadStoreOpType::Set && + load_store->out()->isA()) { + auto* tv = load_store->out()->as(); + // If the output tensor has no root, it means it has no permute + if (!tv->hasRoot()) { + return true; + } + } + } + + return false; } bool HostIrLower::shouldMergeSegmentedGroups( From b2a76e94c68c738043ac3251611299b720e85c02 Mon Sep 17 00:00:00 2001 From: snordmann 
Date: Thu, 24 Apr 2025 06:33:09 -0700 Subject: [PATCH 34/38] add comment --- csrc/host_ir/lower.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index c36fae09e0a..626a7e67e28 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -720,6 +720,9 @@ std::unique_ptr HostIrLower::lower( } for (auto tv : hic->allTvs()) { + // set all host tensors to global memory type. This must be the case by + // definition of a host tensor, and setting the memory type to global is + // also required to avoid Allocate HIR nodes to throw tv->setMemoryType(MemoryType::Global); } From 7f7caf5177936b7387d4e443768788584de2702a Mon Sep 17 00:00:00 2001 From: snordmann Date: Sun, 27 Apr 2025 04:17:41 -0700 Subject: [PATCH 35/38] change namespace of the optimization pass to hir --- csrc/host_ir/lower.cpp | 2 +- csrc/host_ir/pass/stream_parallel_type.cpp | 4 +-- csrc/host_ir/pass/stream_parallel_type.h | 8 +++--- csrc/python_frontend/fusion_definition.cpp | 2 +- tests/cpp/test_host_ir_stream_lowering.cpp | 30 +++++++++++----------- tests/cpp/test_multidevice_host_ir.cpp | 4 +-- 6 files changed, 25 insertions(+), 25 deletions(-) diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 227b76eb597..313970f703c 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -769,7 +769,7 @@ std::unique_ptr HostIrLower::lower( } hic->resetTopLevelExprs(new_top_level_exprs); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); return hic; diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp index 8bea4c3430f..d7bfa0f090a 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -18,7 +18,7 @@ #include #include -namespace nvfuser::preseg_passes { +namespace nvfuser::hir { namespace { @@ -437,4 +437,4 @@ void StreamParallelType::runPass(Fusion* fusion) { 
hic->resetTopLevelExprs(top_level_exprs); } -} // namespace nvfuser::preseg_passes +} // namespace nvfuser::hir diff --git a/csrc/host_ir/pass/stream_parallel_type.h b/csrc/host_ir/pass/stream_parallel_type.h index 9c0c39efe87..f389dbe1ff7 100644 --- a/csrc/host_ir/pass/stream_parallel_type.h +++ b/csrc/host_ir/pass/stream_parallel_type.h @@ -10,7 +10,7 @@ #include #include -namespace nvfuser::preseg_passes { +namespace nvfuser::hir { // A pass used in HostIrLower that takes a HostIrContainer as input, reads the // TensorView's ParallelType::Stream, and modify the the HostIrContainer's top @@ -22,8 +22,8 @@ namespace nvfuser::preseg_passes { // An illustration of the pass can be found in the tests // `test_host_ir_stream_lowering.cpp` // with the option `NVFUSER_DUMP=host_ir`. -class StreamParallelType : public OptimizationPass { - friend class OptimizationPass; +class StreamParallelType : public preseg_passes::OptimizationPass { + friend class preseg_passes::OptimizationPass; protected: static void runPass(Fusion* fusion); @@ -32,4 +32,4 @@ class StreamParallelType : public OptimizationPass { } }; -} // namespace nvfuser::preseg_passes +} // namespace nvfuser::hir diff --git a/csrc/python_frontend/fusion_definition.cpp b/csrc/python_frontend/fusion_definition.cpp index d6e552032b1..fd3e714f7cb 100644 --- a/csrc/python_frontend/fusion_definition.cpp +++ b/csrc/python_frontend/fusion_definition.cpp @@ -455,7 +455,7 @@ std::pair> FusionDefinition:: params.lower.communicator_backend = backend_type_; // Disable StreamParallelType pass temporarily as proper stream lowering // gets implemented - preseg_passes::OptimizationPassGuard + preseg_passes::OptimizationPassGuard guard(false); scheds->multi_device_executor = std::make_unique( std::make_unique(*scheds->preschedFusion()), diff --git a/tests/cpp/test_host_ir_stream_lowering.cpp b/tests/cpp/test_host_ir_stream_lowering.cpp index f6d74caea87..e03fccb34e0 100644 --- a/tests/cpp/test_host_ir_stream_lowering.cpp +++ 
b/tests/cpp/test_host_ir_stream_lowering.cpp @@ -36,7 +36,7 @@ TEST_F(HirLowerStreamTest, InputsAreNotStreamParallelized) { tv->axis(0)->parallelize(ParallelType::Stream); EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - preseg_passes::StreamParallelType>::runPass(hic.get())); + StreamParallelType>::runPass(hic.get())); } TEST_F(HirLowerStreamTest, Split) { @@ -51,7 +51,7 @@ TEST_F(HirLowerStreamTest, Split) { tv1->axis(0)->parallelize(ParallelType::Stream); EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - preseg_passes::StreamParallelType>::runPass(hic.get())); + StreamParallelType>::runPass(hic.get())); } TEST_F(HirLowerStreamTest, Merge) { @@ -66,7 +66,7 @@ TEST_F(HirLowerStreamTest, Merge) { tv1->axis(0)->parallelize(ParallelType::Stream); EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - preseg_passes::StreamParallelType>::runPass(hic.get())); + StreamParallelType>::runPass(hic.get())); } TEST_F(HirLowerStreamTest, SingleSetOp) { @@ -81,7 +81,7 @@ TEST_F(HirLowerStreamTest, SingleSetOp) { tv1->setMemoryType(MemoryType::Global); tv1->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); @@ -111,7 +111,7 @@ TEST_F(HirLowerStreamTest, SingleSetOpNonOutermost) { tv1->setMemoryType(MemoryType::Global); tv1->axis(1)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); @@ -144,7 +144,7 @@ TEST_F(HirLowerStreamTest, SingleBinaryOp) { tv2->setMemoryType(MemoryType::Global); tv2->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); @@ -180,7 +180,7 @@ TEST_F(HirLowerStreamTest, TwoSetOps) { tv1->axis(0)->parallelize(ParallelType::Stream); 
tv2->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 3); @@ -218,7 +218,7 @@ TEST_F(HirLowerStreamTest, ThreeSetOpsWithDisjointsForLoops) { tv1->axis(0)->parallelize(ParallelType::Stream); tv3->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 5); @@ -252,7 +252,7 @@ TEST_F(HirLowerStreamTest, ReductionUnsupported) { tv1->axis(0)->parallelize(ParallelType::Stream); EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - preseg_passes::StreamParallelType>::runPass(hic.get())); + StreamParallelType>::runPass(hic.get())); } TEST_F(HirLowerStreamTest, Reduction) { @@ -267,7 +267,7 @@ TEST_F(HirLowerStreamTest, Reduction) { tv1->setMemoryType(MemoryType::Global); tv1->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); @@ -301,7 +301,7 @@ TEST_F(HirLowerStreamTest, Matmul_M) { c->setMemoryType(MemoryType::Global); c->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); @@ -338,7 +338,7 @@ TEST_F(HirLowerStreamTest, BatchedMatmul) { c->setMemoryType(MemoryType::Global); c->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); @@ -375,7 +375,7 @@ TEST_F(HirLowerStreamTest, Matmul_N) { c->setMemoryType(MemoryType::Global); c->axis(1)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); @@ -413,7 
+413,7 @@ TEST_F(HirLowerStreamTest, Matmul_K) { c->axis(-1)->parallelize(ParallelType::Stream); EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - preseg_passes::StreamParallelType>::runPass(hic.get())); + StreamParallelType>::runPass(hic.get())); } // We don's support PostOnStream because it does not support well pre-allocated @@ -461,7 +461,7 @@ TEST_F(HirLowerStreamTest, DoNotSupportPostOnStream) { output->axis(-1)->parallelize(ParallelType::Stream); EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - preseg_passes::StreamParallelType>::runPass(hic.get())); + StreamParallelType>::runPass(hic.get())); } } // namespace hir diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp index 7b233bc47db..6932a40fe5c 100644 --- a/tests/cpp/test_multidevice_host_ir.cpp +++ b/tests/cpp/test_multidevice_host_ir.cpp @@ -365,7 +365,7 @@ using OverlapDistributedMatmulTest = MultiDeviceTest; TEST_F(OverlapDistributedMatmulTest, AG_matmul) { // Disable StreamParallelType pass temporarily as proper stream lowering gets // implemented - preseg_passes::OptimizationPassGuard guard( + preseg_passes::OptimizationPassGuard guard( false); constexpr int64_t M = 32768; @@ -424,7 +424,7 @@ TEST_F(OverlapDistributedMatmulTest, AG_matmul) { TEST_F(OverlapDistributedMatmulTest, AG_linear) { // Disable StreamParallelType pass tempor - preseg_passes::OptimizationPassGuard guard( + preseg_passes::OptimizationPassGuard guard( false); constexpr int64_t M = 32768; From 7777fe0038c0c1568f5f8abc91b20618bb360e63 Mon Sep 17 00:00:00 2001 From: snordmann Date: Sun, 27 Apr 2025 07:27:05 -0700 Subject: [PATCH 36/38] lint --- csrc/host_ir/executor.cpp | 2 +- csrc/host_ir/lower.cpp | 3 +- csrc/host_ir/pass/stream_parallel_type.h | 3 +- python/python_frontend/fusion_definition.cpp | 4 +- tests/cpp/test_host_ir_stream_lowering.cpp | 51 ++++++++------------ tests/cpp/test_multidevice_host_ir.cpp | 6 +-- 6 files changed, 29 insertions(+), 40 deletions(-) diff --git 
a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 96a462a9444..e089fec32cf 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -771,7 +771,7 @@ void HostIrEvaluator::handle(ReductionOp* reduction_op) { } } switch (reduction_op->getReductionOpType()) { - case BinaryOpType::Add: + case BinaryOpType::Add: at::sum_out(output, input, reduction_axes); return; case BinaryOpType::Max: diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 313970f703c..ca9bb80ae4e 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -769,8 +769,7 @@ std::unique_ptr HostIrLower::lower( } hic->resetTopLevelExprs(new_top_level_exprs); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); return hic; } diff --git a/csrc/host_ir/pass/stream_parallel_type.h b/csrc/host_ir/pass/stream_parallel_type.h index f389dbe1ff7..8b5f138ad7e 100644 --- a/csrc/host_ir/pass/stream_parallel_type.h +++ b/csrc/host_ir/pass/stream_parallel_type.h @@ -22,7 +22,8 @@ namespace nvfuser::hir { // An illustration of the pass can be found in the tests // `test_host_ir_stream_lowering.cpp` // with the option `NVFUSER_DUMP=host_ir`. 
-class StreamParallelType : public preseg_passes::OptimizationPass { +class StreamParallelType + : public preseg_passes::OptimizationPass { friend class preseg_passes::OptimizationPass; protected: diff --git a/python/python_frontend/fusion_definition.cpp b/python/python_frontend/fusion_definition.cpp index fd3e714f7cb..b77947f1415 100644 --- a/python/python_frontend/fusion_definition.cpp +++ b/python/python_frontend/fusion_definition.cpp @@ -455,8 +455,8 @@ std::pair> FusionDefinition:: params.lower.communicator_backend = backend_type_; // Disable StreamParallelType pass temporarily as proper stream lowering // gets implemented - preseg_passes::OptimizationPassGuard - guard(false); + preseg_passes::OptimizationPassGuard guard( + false); scheds->multi_device_executor = std::make_unique( std::make_unique(*scheds->preschedFusion()), Communicator::getInstance(), diff --git a/tests/cpp/test_host_ir_stream_lowering.cpp b/tests/cpp/test_host_ir_stream_lowering.cpp index e03fccb34e0..b77df002bc6 100644 --- a/tests/cpp/test_host_ir_stream_lowering.cpp +++ b/tests/cpp/test_host_ir_stream_lowering.cpp @@ -35,8 +35,8 @@ TEST_F(HirLowerStreamTest, InputsAreNotStreamParallelized) { hic->addInput(tv); tv->axis(0)->parallelize(ParallelType::Stream); - EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - StreamParallelType>::runPass(hic.get())); + EXPECT_ANY_THROW( + preseg_passes::OptimizationPass::runPass(hic.get())); } TEST_F(HirLowerStreamTest, Split) { @@ -50,8 +50,8 @@ TEST_F(HirLowerStreamTest, Split) { tv1->split(0, 2); tv1->axis(0)->parallelize(ParallelType::Stream); - EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - StreamParallelType>::runPass(hic.get())); + EXPECT_ANY_THROW( + preseg_passes::OptimizationPass::runPass(hic.get())); } TEST_F(HirLowerStreamTest, Merge) { @@ -65,8 +65,8 @@ TEST_F(HirLowerStreamTest, Merge) { tv1->merge(0, 1); tv1->axis(0)->parallelize(ParallelType::Stream); - EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - 
StreamParallelType>::runPass(hic.get())); + EXPECT_ANY_THROW( + preseg_passes::OptimizationPass::runPass(hic.get())); } TEST_F(HirLowerStreamTest, SingleSetOp) { @@ -81,8 +81,7 @@ TEST_F(HirLowerStreamTest, SingleSetOp) { tv1->setMemoryType(MemoryType::Global); tv1->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -111,8 +110,7 @@ TEST_F(HirLowerStreamTest, SingleSetOpNonOutermost) { tv1->setMemoryType(MemoryType::Global); tv1->axis(1)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -144,8 +142,7 @@ TEST_F(HirLowerStreamTest, SingleBinaryOp) { tv2->setMemoryType(MemoryType::Global); tv2->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -180,8 +177,7 @@ TEST_F(HirLowerStreamTest, TwoSetOps) { tv1->axis(0)->parallelize(ParallelType::Stream); tv2->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 3); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -218,8 +214,7 @@ TEST_F(HirLowerStreamTest, ThreeSetOpsWithDisjointsForLoops) { tv1->axis(0)->parallelize(ParallelType::Stream); tv3->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 5); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -251,8 
+246,8 @@ TEST_F(HirLowerStreamTest, ReductionUnsupported) { tv1->setMemoryType(MemoryType::Global); tv1->axis(0)->parallelize(ParallelType::Stream); - EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - StreamParallelType>::runPass(hic.get())); + EXPECT_ANY_THROW( + preseg_passes::OptimizationPass::runPass(hic.get())); } TEST_F(HirLowerStreamTest, Reduction) { @@ -267,8 +262,7 @@ TEST_F(HirLowerStreamTest, Reduction) { tv1->setMemoryType(MemoryType::Global); tv1->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -301,8 +295,7 @@ TEST_F(HirLowerStreamTest, Matmul_M) { c->setMemoryType(MemoryType::Global); c->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -338,8 +331,7 @@ TEST_F(HirLowerStreamTest, BatchedMatmul) { c->setMemoryType(MemoryType::Global); c->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -375,8 +367,7 @@ TEST_F(HirLowerStreamTest, Matmul_N) { c->setMemoryType(MemoryType::Global); c->axis(1)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -412,8 +403,8 @@ TEST_F(HirLowerStreamTest, Matmul_K) { c->setMemoryType(MemoryType::Global); c->axis(-1)->parallelize(ParallelType::Stream); - EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - 
StreamParallelType>::runPass(hic.get())); + EXPECT_ANY_THROW( + preseg_passes::OptimizationPass::runPass(hic.get())); } // We don's support PostOnStream because it does not support well pre-allocated @@ -460,8 +451,8 @@ TEST_F(HirLowerStreamTest, DoNotSupportPostOnStream) { output->axis(-1)->parallelize(ParallelType::Stream); - EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - StreamParallelType>::runPass(hic.get())); + EXPECT_ANY_THROW( + preseg_passes::OptimizationPass::runPass(hic.get())); } } // namespace hir diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp index 6932a40fe5c..db53f7f114d 100644 --- a/tests/cpp/test_multidevice_host_ir.cpp +++ b/tests/cpp/test_multidevice_host_ir.cpp @@ -365,8 +365,7 @@ using OverlapDistributedMatmulTest = MultiDeviceTest; TEST_F(OverlapDistributedMatmulTest, AG_matmul) { // Disable StreamParallelType pass temporarily as proper stream lowering gets // implemented - preseg_passes::OptimizationPassGuard guard( - false); + preseg_passes::OptimizationPassGuard guard(false); constexpr int64_t M = 32768; constexpr int64_t K = 32768; @@ -424,8 +423,7 @@ TEST_F(OverlapDistributedMatmulTest, AG_matmul) { TEST_F(OverlapDistributedMatmulTest, AG_linear) { // Disable StreamParallelType pass tempor - preseg_passes::OptimizationPassGuard guard( - false); + preseg_passes::OptimizationPassGuard guard(false); constexpr int64_t M = 32768; constexpr int64_t K = 32768; From e517bc31fad5cd39f4d84e5d813f469f6ba90a26 Mon Sep 17 00:00:00 2001 From: snordmann Date: Sun, 27 Apr 2025 07:36:08 -0700 Subject: [PATCH 37/38] fix merge --- csrc/host_ir/executor.cpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index e089fec32cf..2f2cf9e7b92 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -705,8 +705,6 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { bind(tv, tensor); } -<<<<<<< HEAD -======= void 
HostIrEvaluator::handle(HirAliasSelect* hir_alias_select) { auto index = expr_evaluator_.evaluate(hir_alias_select->index()).as(); @@ -716,7 +714,6 @@ void HostIrEvaluator::handle(HirAliasSelect* hir_alias_select) { bind(hir_alias_select->out(), input.select(axis, index)); } ->>>>>>> bfc7ba836400aa349fab473fa04bab204e9c5601 void HostIrEvaluator::handle(BinaryOp* binary_op) { if (!isKnown(binary_op->outputs().at(0))) { return unhandled(binary_op); @@ -789,18 +786,6 @@ void HostIrEvaluator::handle(ReductionOp* reduction_op) { } } -<<<<<<< HEAD -void HostIrEvaluator::handle(HirAliasSelect* hir_alias_select) { - auto index = - expr_evaluator_.evaluate(hir_alias_select->index()).as(); - auto input = getKnownConcreteValue(hir_alias_select->in()->as()) - .as(); - int64_t axis = hir_alias_select->axis(); - bind(hir_alias_select->out(), input.select(axis, index)); -} - -======= ->>>>>>> bfc7ba836400aa349fab473fa04bab204e9c5601 void HostIrEvaluator::unhandled(Statement* stmt) { NVF_ERROR(stmt->isA(), stmt, " must be an Expr"); auto* expr = stmt->as(); From 35ff4dab62b187e3842086ff20a223af267c1a50 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 28 Apr 2025 09:58:44 +0300 Subject: [PATCH 38/38] empty commit to trigger the CI