From fa8455a19a98cea693e0cbcd8f363afe6e6a2786 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Fri, 10 May 2024 11:54:42 -0700 Subject: [PATCH 01/13] LoopPromotionBuilder class --- CMakeLists.txt | 1 + csrc/id_model/id_model.cpp | 5 +- csrc/id_model/id_model.h | 2 + csrc/id_model/loop_promotion.cpp | 159 +++++++++++++++++++++++++++++++ csrc/id_model/loop_promotion.h | 39 ++++++++ 5 files changed, 205 insertions(+), 1 deletion(-) create mode 100644 csrc/id_model/loop_promotion.cpp create mode 100644 csrc/id_model/loop_promotion.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 04a7996933e..f0dfb955f31 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,6 +135,7 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/host_ir/executor.cpp ${NVFUSER_SRCS_DIR}/host_ir/host_ir.cpp ${NVFUSER_SRCS_DIR}/id_model/id_model.cpp + ${NVFUSER_SRCS_DIR}/id_model/loop_promotion.cpp ${NVFUSER_SRCS_DIR}/id_model/to_string.cpp ${NVFUSER_SRCS_DIR}/id_model/transform_replay.cpp ${NVFUSER_SRCS_DIR}/id_model/validation_utils.cpp diff --git a/csrc/id_model/id_model.cpp b/csrc/id_model/id_model.cpp index d69e1620531..58521ee1a9c 100644 --- a/csrc/id_model/id_model.cpp +++ b/csrc/id_model/id_model.cpp @@ -6,6 +6,7 @@ */ // clang-format on #include +#include #include #include #include @@ -580,6 +581,8 @@ void IdModel::buildLoopGraph() { validateLoopGraphHasNoSelfMappedLeafDomains(); idGraph(IdMappingMode::LOOP).validateConsistency(); + + auto loop_promotion_map2 = LoopPromotionMapBuilder::get(*this, inlining_info); } std::unordered_map IdModel::buildLoopPromotionMap( @@ -703,7 +706,7 @@ std::unordered_map IdModel::buildLoopPromotionMap( // Insert the updated Step-3 results into the Step-5 resutls. Note // that this insertion does not overwrite the existing mappings. 
- final_iel_promotion_map.insert( + final_loop_promotion_map.insert( loop_promotion_map.begin(), loop_promotion_map.end()); sanityCheckLoopPromotionMap(final_loop_promotion_map); diff --git a/csrc/id_model/id_model.h b/csrc/id_model/id_model.h index 8255e6a8000..167f0007561 100644 --- a/csrc/id_model/id_model.h +++ b/csrc/id_model/id_model.h @@ -20,6 +20,7 @@ namespace nvfuser { class ValGraph; +class LoopPromotionMapBuilder; struct StatefulInliningInfo { // All producer ids within (including dependencies of) inlined leaf domains, @@ -95,6 +96,7 @@ StatefulInliningInfo buildStatefulInliningInfo( // Subgraph of the permissive graph. Maps only CA and their // dependent domains. class IdModel : public PolymorphicBase { + friend class LoopPromotionMapBuilder; public: // Sometimes fusion inputs or outputs are disconnected from expressions, in // those cases we still may want to send in some additional tensor views from diff --git a/csrc/id_model/loop_promotion.cpp b/csrc/id_model/loop_promotion.cpp new file mode 100644 index 00000000000..4874df0c7cb --- /dev/null +++ b/csrc/id_model/loop_promotion.cpp @@ -0,0 +1,159 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include +#include +#include + +namespace nvfuser { + +LoopPromotionMapBuilder::LoopPromotionMapBuilder( + IdModel& id_model, + const StatefulInliningInfo& inlining_info) + : id_model_(id_model), inlining_info_(inlining_info) {} + +void LoopPromotionMapBuilder::build() { + auto& loop_graph = id_model_.idGraph(IdMappingMode::LOOP); + + std::cerr << nvfuser::idGroupsString(loop_graph); + std::cerr << "Size: " << inlining_info_.ordered_p_ca_ids.size() << std::endl; +#if 0 + // Make an intersection of the exact and loop map. This will group together + // entries in each loop group that are exact with each other. 
This provides a + // better graph to do promotion and replays. + // + // It's tempting to use the intersection of the almost exact and loop, but we + // need to model broadcast promotion, and if we have two tensors like: + // + // T1[i0, b1] = T0[i0] + // T2[i0, b2] = T0[i0] + // Then resolution of: + // T4 = T1[i0, b1] + T3[i0, i1] + // T6 = T2[i0, b2] + T5[i0, i2] + // + // Then merge(0, 1) with all tensors except for T0 + // + // The almost exact map will map i0, i0*b1, and i0*b2 together, but b1 and b2 + // are being resolved to i1 and i2 respectively. So we want to have separate + // entries so we can have an easy to process promotion map. + // + // Loop is a permissive like map, it could have many entries, use the exact + // map as the one we iterate on to reduce complexity as it hopefully has + // smaller groups and this algorithm scales with the number of groups * + // (number of entries in groups ^ 2) + // + // iel stands for Intersection of the Exact and Loop graphs. + ValGraph iel_graph = buildIntersection( + id_model_.idGraph(IdMappingMode::EXACT), id_model_.idGraph(IdMappingMode::LOOP), false); + + // Step 1: Build a map of the IEL groups of root broadcast domains + // to resolving domains. + std::unordered_map iel_promotion_map = + buildInlineRootResolutionMap(iel_graph, inlining_info_); + + // Step 2: Propagate the root promotions to intermediate and leaf groups. + // At this point, the promotion may not be final as the analysis is + // localized to IEL groups. The map is used in the next step to + // build mappings of the loop groups. + propagatePromotionsInIELGraph(iel_graph, iel_promotion_map); + + // Step 3: Determine the promotion of each loop graph based on the + // IEL promotion map. 
For each loop group, examine all the IEL + // promotions and find the most representative one that captures all + // the dependent input domains of the loop group + std::unordered_map loop_promotion_map = + projectIELPromotionToLoopGraph( + iel_graph, + iel_promotion_map, + idGraph(IdMappingMode::LOOP), + inlining_info); + + // At this point, most of loop groups should have correct promoted + // IDs. However, non-inlined loop groups may miss promotion that + // should be propagated from parent ID groups, e.g., iS50 of T2 in + // Indexing19. Its parent ID loop group is promoted, but the loop + // group of iS50 is not found yet. + + // Step 4: In order to fully propagate the loop graph promotions, first + // propagate them to the IEL groups, which are then used to + // propagate back to the loop groups in Step 5. Unlike Step 2, the + // initial IEL promotion map is empty and is populated with the loop + // promotion map as we traverse down the IEL graph. + std::unordered_map final_iel_promotion_map; + propagatePromotionsInIELGraph( + iel_graph, + final_iel_promotion_map, + idGraph(IdMappingMode::LOOP), + loop_promotion_map); + + // Step 5: Find the final promotion of each loop group based on the + // final IEL promotion map + auto final_loop_promotion_map = projectIELPromotionToLoopGraph( + iel_graph, + final_iel_promotion_map, + idGraph(IdMappingMode::LOOP), + inlining_info); + + // The promotion map produced in Step 5 only includes those are + // further propagated at Step 4, so the correct mappings produced at + // Step 3 may not be included in the Step-5 results. Any Step-3 mappings + // that are not found in the Step-5 results are already valid + // results, so merge them into the Step-5 results. + // + // For example, in the below case, nothing will be propated at Step + // 4. 
+ // + // t0: [i0] + // t1: [i1, i2] + // t2 = broadcast(t0, {true, false}) + // t3 = t2 + t1 + // + // t2: [b3, i4] + // t3: [i5, i6] + // + // t3->merge(0) + // propagate-and-inline-most + // + // t0: [i0] ca_pos(1) + // t1: [i1*i2] ca_pos(1) + // t2: [b3*i4] ca_pos(1) + // t3: [i5*i6] + // + // In this case, all domains will be grouped together and there will + // be just a single group in the Loop graph: + // + // - {i0, i1, i2, b3, i4, i5, i6, i1*i2, b3*i4, i5*i6} + // + // Step 3 will identify i5*i6 is the promotion domain. Since all + // domains are promoted to i5*i6, there will be no propagation in + // Step 4 (i.e., loop_promote_inputs will be false). Since the + // result of Step 4 is empty, the Step 5 result will also be empty, + // but that just means there's no change is necessary from the Step + // 3 results. + + // Update the Step-3 map to the latest LOOP graph + loop_promotion_map = + updateValGroupIdMap(loop_promotion_map, idGraph(IdMappingMode::LOOP)); + + // Insert the updated Step-3 results into the Step-5 resutls. Note + // that this insertion does not overwrite the existing mappings. + final_loop_promotion_map.insert( + loop_promotion_map.begin(), loop_promotion_map.end()); + + sanityCheckLoopPromotionMap(final_loop_promotion_map); +#endif +} + +std::unordered_map LoopPromotionMapBuilder::get( + IdModel& id_model, + const StatefulInliningInfo& inlining_info) { + LoopPromotionMapBuilder builder(id_model, inlining_info); + builder.build(); + return builder.loop_promotion_map_; +} + +} // namespace nvfuser diff --git a/csrc/id_model/loop_promotion.h b/csrc/id_model/loop_promotion.h new file mode 100644 index 00000000000..dfc9812fe09 --- /dev/null +++ b/csrc/id_model/loop_promotion.h @@ -0,0 +1,39 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once + +#include + +namespace nvfuser { + +class IdModel; +struct StatefulInliningInfo; + +class LoopPromotionMapBuilder { + public: + // Build a map of loop groups to IterDomains that represent actual + // loops. The map is built based on the broadcast resolution with + // root domains between inlined producer and consumer tensors. + static std::unordered_map get( + IdModel& id_model, + const StatefulInliningInfo& inlining_info); + + private: + LoopPromotionMapBuilder( + IdModel& id_model, + const StatefulInliningInfo& inlining_info); + + void build(); + + private: + IdModel& id_model_; + const StatefulInliningInfo& inlining_info_; + std::unordered_map loop_promotion_map_; +}; + +} // namespace nvfuser From c05fd851e2b3f96abb3e1b02178b8f199bc4fa48 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Fri, 10 May 2024 12:58:29 -0700 Subject: [PATCH 02/13] Copied all loop promotion code --- csrc/id_model/id_model.cpp | 6 +- csrc/id_model/id_model.h | 27 +- csrc/id_model/loop_promotion.cpp | 665 ++++++++++++++++++++++++++++++- csrc/id_model/loop_promotion.h | 84 ++++ 4 files changed, 752 insertions(+), 30 deletions(-) diff --git a/csrc/id_model/id_model.cpp b/csrc/id_model/id_model.cpp index 58521ee1a9c..ff5d3fd5cb7 100644 --- a/csrc/id_model/id_model.cpp +++ b/csrc/id_model/id_model.cpp @@ -226,7 +226,7 @@ std::string IdModel::toString() const { return ss.str(); } -ValGraph IdModel::initializeIdGraph(bool propagate_through_exprs) { +ValGraph IdModel::initializeIdGraph(bool propagate_through_exprs) const { ValGraph id_graph(propagate_through_exprs); // To deterministically initialize the graph, the order of adding @@ -716,7 +716,7 @@ std::unordered_map IdModel::buildLoopPromotionMap( std::unordered_map IdModel::buildInlineRootResolutionMap( const ValGraph& iel_graph, - const StatefulInliningInfo& info) { + const StatefulInliningInfo& info) const { std::unordered_map iel_promotion_map; // This 
should probably work just on terminating inputs, as we shouldn't be @@ -918,7 +918,7 @@ void IdModel::maybeBuildGraph(IdMappingMode mode) { ValGraph IdModel::buildIntersection( const ValGraph& graph0, const ValGraph& graph1, - bool propagate_exprs) { + bool propagate_exprs) const { ValGraph intersection = initializeIdGraph(propagate_exprs); for (const ValGroup& group0 : graph0.disjointValSets().disjointSets()) { auto set_size = group0->size(); diff --git a/csrc/id_model/id_model.h b/csrc/id_model/id_model.h index 167f0007561..c58c17531b0 100644 --- a/csrc/id_model/id_model.h +++ b/csrc/id_model/id_model.h @@ -96,7 +96,6 @@ StatefulInliningInfo buildStatefulInliningInfo( // Subgraph of the permissive graph. Maps only CA and their // dependent domains. class IdModel : public PolymorphicBase { - friend class LoopPromotionMapBuilder; public: // Sometimes fusion inputs or outputs are disconnected from expressions, in // those cases we still may want to send in some additional tensor views from @@ -127,6 +126,16 @@ class IdModel : public PolymorphicBase { const ValGraph& idGraph(IdMappingMode mode) const; ValGraph& idGraph(IdMappingMode mode); + const std::unordered_map>& idUses() + const { + return id_uses_; + } + + const std::unordered_map>& + idDefinitions() const { + return id_definitions_; + } + // TODO: Seems a bit unfortunate that this isn't IterDomain local information. const std::unordered_set& viewRfactorIds() const { return view_rfactor_ids_; @@ -165,19 +174,24 @@ class IdModel : public PolymorphicBase { // Iterates over all IterDomains in id_definitions_ and calls initializeVal on // a new ValGraph and returns it. - ValGraph initializeIdGraph(bool propagate_through_exprs = true); + ValGraph initializeIdGraph(bool propagate_through_exprs = true) const; // Returns an IdGraph with all Id's mapped that are mapped both in graph0 and // graph1. 
ValGraph buildIntersection( const ValGraph& graph0, const ValGraph& graph1, - bool propagate_exprs = true); + bool propagate_exprs = true) const; const std::unordered_map& loopPromotionMap() const { return loop_promotion_map_; } + // Replay Expr but with the inputs provided. ValGraphs will be updated + // for all maps that have entries, adding the output iter domains of the + // replayed expression and adding potential mappings through the expression. + Expr* addReplayAs(std::vector new_inputs, Expr* expr); + protected: // Fills id_uses_ and id_definitions_ for all IterDomains active in the // fusion. @@ -197,7 +211,7 @@ class IdModel : public PolymorphicBase { // IterDomain picked from its IEL group. std::unordered_map buildInlineRootResolutionMap( const ValGraph& iel_graph, - const StatefulInliningInfo& info); + const StatefulInliningInfo& info) const; // Helper function for building loop promotion map. // @@ -284,11 +298,6 @@ class IdModel : public PolymorphicBase { // tensor. void validateLoopGraphHasNoSelfMappedLeafDomains() const; - // Replay Expr but with the inputs provided. ValGraphs will be updated - // for all maps that have entries, adding the output iter domains of the - // replayed expression and adding potential mappings through the expression. 
- Expr* addReplayAs(std::vector new_inputs, Expr* expr); - protected: // All tensor expressions that this model analyzes std::vector tv_exprs_; diff --git a/csrc/id_model/loop_promotion.cpp b/csrc/id_model/loop_promotion.cpp index 4874df0c7cb..73af2c3ee08 100644 --- a/csrc/id_model/loop_promotion.cpp +++ b/csrc/id_model/loop_promotion.cpp @@ -8,6 +8,8 @@ #include #include #include +#include +#include namespace nvfuser { @@ -16,12 +18,15 @@ LoopPromotionMapBuilder::LoopPromotionMapBuilder( const StatefulInliningInfo& inlining_info) : id_model_(id_model), inlining_info_(inlining_info) {} -void LoopPromotionMapBuilder::build() { - auto& loop_graph = id_model_.idGraph(IdMappingMode::LOOP); +ValGraph& LoopPromotionMapBuilder::idGraph(IdMappingMode mode) { + return id_model_.idGraph(mode); +} + +const ValGraph& LoopPromotionMapBuilder::idGraph(IdMappingMode mode) const { + return id_model_.idGraph(mode); +} - std::cerr << nvfuser::idGroupsString(loop_graph); - std::cerr << "Size: " << inlining_info_.ordered_p_ca_ids.size() << std::endl; -#if 0 +void LoopPromotionMapBuilder::build() { // Make an intersection of the exact and loop map. This will group together // entries in each loop group that are exact with each other. This provides a // better graph to do promotion and replays. @@ -47,8 +52,8 @@ void LoopPromotionMapBuilder::build() { // (number of entries in groups ^ 2) // // iel stands for Intersection of the Exact and Loop graphs. - ValGraph iel_graph = buildIntersection( - id_model_.idGraph(IdMappingMode::EXACT), id_model_.idGraph(IdMappingMode::LOOP), false); + ValGraph iel_graph = id_model_.buildIntersection( + idGraph(IdMappingMode::EXACT), idGraph(IdMappingMode::LOOP), false); // Step 1: Build a map of the IEL groups of root broadcast domains // to resolving domains. @@ -65,12 +70,12 @@ void LoopPromotionMapBuilder::build() { // IEL promotion map. 
For each loop group, examine all the IEL // promotions and find the most representative one that captures all // the dependent input domains of the loop group - std::unordered_map loop_promotion_map = + std::unordered_map initial_loop_promotion_map = projectIELPromotionToLoopGraph( iel_graph, iel_promotion_map, idGraph(IdMappingMode::LOOP), - inlining_info); + inlining_info_); // At this point, most of loop groups should have correct promoted // IDs. However, non-inlined loop groups may miss promotion that @@ -88,15 +93,15 @@ void LoopPromotionMapBuilder::build() { iel_graph, final_iel_promotion_map, idGraph(IdMappingMode::LOOP), - loop_promotion_map); + initial_loop_promotion_map); // Step 5: Find the final promotion of each loop group based on the // final IEL promotion map - auto final_loop_promotion_map = projectIELPromotionToLoopGraph( + loop_promotion_map_ = projectIELPromotionToLoopGraph( iel_graph, final_iel_promotion_map, idGraph(IdMappingMode::LOOP), - inlining_info); + inlining_info_); // The promotion map produced in Step 5 only includes those are // further propagated at Step 4, so the correct mappings produced at @@ -136,16 +141,640 @@ void LoopPromotionMapBuilder::build() { // 3 results. // Update the Step-3 map to the latest LOOP graph - loop_promotion_map = - updateValGroupIdMap(loop_promotion_map, idGraph(IdMappingMode::LOOP)); + initial_loop_promotion_map = updateValGroupIdMap( + initial_loop_promotion_map, idGraph(IdMappingMode::LOOP)); // Insert the updated Step-3 results into the Step-5 resutls. Note // that this insertion does not overwrite the existing mappings. 
- final_loop_promotion_map.insert( - loop_promotion_map.begin(), loop_promotion_map.end()); + loop_promotion_map_.insert( + initial_loop_promotion_map.begin(), initial_loop_promotion_map.end()); + + sanityCheckLoopPromotionMap(loop_promotion_map_); +} + +std::unordered_map LoopPromotionMapBuilder:: + buildInlineRootResolutionMap( + const ValGraph& iel_graph, + const StatefulInliningInfo& info) const { + std::unordered_map iel_promotion_map; + + // This should probably work just on terminating inputs, as we shouldn't be + // able to modify a broadcast domain between root and rfactor which would be + // required to resolve a non input broadcast domain. But for now leaving it as + // traversal on all broadcast groups. + // + + // We first visit all broadcast root domains. If a broadcast is + // resovled, see if it's promoted. Note that a domain be resolved to + // a domain that may not be loop mapped, yet it can still be + // promoted. In other words, there can be a domain that is exactly + // mapped with the resolving domain *and* is mapped with the + // broadcast domain by the loop map. The algorihm here is: + // + // 1. For a broadcast domain, find the domain that the broadcast is + // resolved to. + // 2. If the resolving domain is also loop-mapped with the + // broadcast, that is the promotion domain, but the resolving + // domain may not be loop mapped as mentioned above. Instead, + // find all loop-mapped domains with the broadcast domain and + // pick one that is exactly mapped with the resolving domain + // + // Note again this process is only done for root domains. 
Once we + // find promotion relationships for root domains, we propagate the + // mappings to derived domains + for (const ValGroup& iel_group : iel_graph.disjointValSets().disjointSets()) { + NVF_ERROR(!iel_group->empty()); + + IterDomain* iel_group_id = iel_group->front()->as(); + + if (!iel_group_id->isBroadcast()) { + continue; + } + + // Collect all the exact groups of the resolutions of the broadcast id's + ValGroups resolved_exact_groups; + for (Val* bcast_id : *iel_group) { + if (auto p2c_root_broadcast_resolution_map_it = + info.p2c_root_broadcast_resolution_map.find( + bcast_id->as()); + p2c_root_broadcast_resolution_map_it != + info.p2c_root_broadcast_resolution_map.end()) { + resolved_exact_groups.pushBack( + idGraph(IdMappingMode::EXACT) + .toGroups(p2c_root_broadcast_resolution_map_it->second)); + } + } + + if (resolved_exact_groups.empty()) { + // No resolution + continue; + } + + // resolved_exact_groups is a list of IDs that resolves the + // broadcast. We only care those that are also in the same loop + // group, and there must be just one or none. Otherwise, the + // resolution is ambiguous. + + // Collect all the exact groups in the loop set containing this iel_group + const ValGroup& loop_group = + idGraph(IdMappingMode::LOOP).toGroup(iel_group_id); + ValGroups loop_covered_exact_groups = + idGraph(IdMappingMode::EXACT).toGroups(*loop_group); + + // The intersection of the exact groups that the broadcast domains can be + // broadcasted to, and those that exist within the same loop groop are is + // the promotion needed for this iel_group. The promotion should + // be none or unique. + ValGroups loop_exact_resolved_intersection = + resolved_exact_groups.computeIntersect(loop_covered_exact_groups); + + if (loop_exact_resolved_intersection.empty()) { + // No promotion + continue; + } + + if (loop_exact_resolved_intersection.size() > 1) { + // Ambiguous promotion. This should not happen. 
+ std::stringstream err_msg; + err_msg + << "Invalid multiple broadcast resolution within shared loops detected, group:\n " + << iel_group->toString() << "\nIs being broadcasted to:"; + for (const ValGroup& entry : loop_exact_resolved_intersection) { + err_msg << "\n " << entry->toString(); + } + NVF_ERROR(false, err_msg.str()); + } + + const ValGroup& exact_resolution_group = + loop_exact_resolved_intersection.front(); + + // Within the loop group, find the IDs that the broadcast IDs are + // resolved to + VectorOfUniqueEntries resolved_ids = + exact_resolution_group->computeIntersect(*loop_group); + + NVF_ERROR(!resolved_ids.empty()); + + // All the IDs in resolved_ids are mapped with both of the exact + // and loop graphs, so any of them can be used as an IEL promotion + // ID. Just to make it extra clear, look for corresponding + // groups in the IEL graph and make sure there's only one such group. + ValGroups promoted_iel_groups = iel_graph.toGroups(resolved_ids); + + NVF_ERROR(!promoted_iel_groups.empty()); + + if (promoted_iel_groups.size() > 1) { + std::stringstream err_msg; + err_msg + << "Invalid multiple broadcast resolution within shared loops detected, group:\n " + << iel_group->toString() << "\nIs being broadcasted to:"; + for (const ValGroup& entry : promoted_iel_groups) { + err_msg << "\n " << entry->toString(); + } + NVF_ERROR(false, err_msg.str()); + } + + iel_promotion_map[iel_group] = + promoted_iel_groups.front()->front()->as(); + } + + return iel_promotion_map; +} + +namespace { + +// Check if there's an equivalent expression as iel_expr that uses +// maybe_promoted_inputs. This is used to avoid redundantly replaying +// expressions. +// NOTE: This is currently overly conservative and some +// opportunities for reuse are lost, althought it doesn't affect +// the correctness of the analysis. 
+Expr* findMatchingExpr( + const ExprGroup& iel_expr, + const ValGraph& iel_graph, + const std::vector& maybe_promoted_inputs, + const ValGraph& loop_graph) { + // If any of domains in maybe_promoted_inputs is not found in + // iel_graph, it means the domain is just replayed and by definition + // has no mapping with any existing domain, which means there's no + // matching expr. + if (std::any_of( + maybe_promoted_inputs.begin(), + maybe_promoted_inputs.end(), + [&](IterDomain* maybe_promoted_input) -> bool { + return !iel_graph.hasGroup(maybe_promoted_input); + })) { + return nullptr; + } + + // Grab all eligible uses of the promoted inputs. + // Note that any eligible matching expr should be a use of all + // inputs in maybe_promoted_input_uses, no matter it's promoted or + // not. So it isn't necessary to look at all of + // maybe_promoted_input_uses but just need to grab one. + NVF_ERROR(!maybe_promoted_inputs.empty()); + ExprGroups maybe_promoted_input_uses = + iel_graph.getUses(iel_graph.toGroup(maybe_promoted_inputs.front())); + + if (maybe_promoted_input_uses.empty()) { + return nullptr; + } + + // Look for exprs that have inputs that are mapped in the IEL + // graph with the (promoted) inputs of iel_expr. + for (const ExprGroup& maybe_promoted_input_use_group : + maybe_promoted_input_uses) { + NVF_ERROR(!maybe_promoted_input_use_group->empty()); + // maybe_promoted_inputs may include non-promoted inputs as + // well, so maybe_promoted_input_uses may include the original + // iel_expr itself. Since there must at least be a promoted input, + // iel_expr itself should not be an expr group we are looking for. 
+ if (iel_expr == maybe_promoted_input_use_group) { + continue; + } + Expr* maybe_promoted_input_use = maybe_promoted_input_use_group->front(); + if (!iel_expr->front()->sameOp(maybe_promoted_input_use)) { + continue; + } + // Check if all inputs are mapped + NVF_ERROR( + maybe_promoted_inputs.size() == + maybe_promoted_input_use->inputs().size()); + bool all_inputs_match = true; + for (const auto inp_i : c10::irange(maybe_promoted_inputs.size())) { + all_inputs_match = all_inputs_match && + iel_graph.disjointValSets().strictAreMapped( + maybe_promoted_inputs[inp_i], + maybe_promoted_input_use->inputs().at(inp_i)); + } + if (!all_inputs_match) { + continue; + } + + // We always want to find promotions within the same loop + // groups since we are looking for domains that represent actual + // loops. Note that that's guaranteed when a new domain is + // replayed instead of reusing an existing domain. + if (!loop_graph.disjointExprSets().permissiveAreMapped( + iel_expr->front(), maybe_promoted_input_use_group->front())) { + continue; + } + // This is just an extra sanity check. Make sure all exprs in + // the use group are mapped + NVF_ERROR( + std::all_of( + maybe_promoted_input_use_group->vector().begin(), + maybe_promoted_input_use_group->vector().end(), + [&](Expr* iel_use) { + return loop_graph.disjointExprSets().permissiveAreMapped( + iel_expr->front(), iel_use); + }), + "Not all mapped: ", + nvfuser::toString(iel_expr), + "\n", + nvfuser::toString(maybe_promoted_input_use_group)); + + return maybe_promoted_input_use; + } + + return nullptr; +} + +// When propagating loop promotions from inputs to outputs of an IEL +// expr, we can't blindly apply loop promotion when all of the input +// domains are loop mapped with the outputs. +// +// i.e. 
if we have the inlined domains from: +// Inputs: +// T0[i0] +// T1[i0, i1] +// +// T2[i0, b2] = broadcast(T0) +// T3[i0, i1] = T2 + T1 +// +// {T1, T2, T3}->merge(0, 1) +// inlineMost +// +// The inlined loop group would consist of: +// +// {i0, i1, b2, i0*b2, i0*i1} +// +// Note that all these domains would have promotion to i0*i1 at the +// end of Step 3. When the IEL expression of merge(i0, i1) is visited by +// propagatePromotionsInIELGraph again, the promotion to i0*i1 of both +// inputs would be propagated to its output, resulting in promotion of +// i0*i1 to (i0*i1)*(i0*i1), which is not the correct propagation. +// +// Therefore only promote i0*b1 to i0*i1, or i0*i1 to i0*i1 (i.e. don't +// promote an input to any transformation within the loop group). +// +// So if we have an iel_expr make sure its inputs and outputs are not in +// the same loop group. +bool hasUniqueInputLoopGroups( + const ExprGroup& iel_expr, + const ValGraph& iel_graph, + const ValGraph& loop_graph) { + const std::vector iel_inp_groups = iel_graph.inputGroups(iel_expr); + + const std::vector iel_out_groups = iel_graph.outputGroups(iel_expr); + + ValGroups inp_loop_groups; + for (const ValGroup& iel_inp_group : iel_inp_groups) { + inp_loop_groups.pushBack(loop_graph.toGroup(iel_inp_group->front())); + } + ValGroups out_loop_groups; + for (const ValGroup& iel_out_group : iel_out_groups) { + out_loop_groups.pushBack(loop_graph.toGroup(iel_out_group->front())); + } + + // Check if input groups that are not included in the output group set + return !inp_loop_groups.computeSubtract(out_loop_groups).empty(); +} + +} // namespace + +void LoopPromotionMapBuilder::propagatePromotionsInIELGraph( + const ValGraph& iel_graph, + std::unordered_map& iel_promotion_map, + const ValGraph& loop_graph, + const std::unordered_map& loop_graph_promotion_map) { + // In order to make this traversal work, the traversal order must be + // topologically sorted. 
+ ValGraphStmtSort iel_stmt_sort(iel_graph); + + for (const ExprGroup& iel_expr : iel_stmt_sort.exprs()) { + NVF_ERROR(!iel_expr->empty()); + const std::vector iel_inp_groups = + iel_graph.inputGroups(iel_expr); + + // Check if any inputs need promotion indicating this expr group needs to + // be replayed with promoted inputs + bool an_input_was_promoted = false; + std::vector maybe_promoted_inputs; + maybe_promoted_inputs.reserve(iel_inp_groups.size()); + + // Propagate loop graph promotion only when the inputs and outputs are + // not in the same loop group. + const bool loop_promote_inputs = !loop_graph_promotion_map.empty() && + hasUniqueInputLoopGroups(iel_expr, iel_graph, loop_graph); + + for (const ValGroup& iel_inp_group : iel_inp_groups) { + // Assumed all inputs are IterDomains + NVF_ERROR(iel_inp_group->front()->isA()); + + // Propagate IEL promotions when available. + if (auto inp_promo_it = iel_promotion_map.find(iel_inp_group); + inp_promo_it != iel_promotion_map.end()) { + maybe_promoted_inputs.push_back(inp_promo_it->second); + an_input_was_promoted = true; + continue; + } + + // Promote loops based on the loop promotion map. If the loop promotion + // map should be used and has an entry we should use that promotion. + if (loop_promote_inputs) { + const ValGroup& loop_copy_group = + loop_graph.toGroup(iel_inp_group->front()); + auto inp_loop_promo_it = loop_graph_promotion_map.find(loop_copy_group); + if (inp_loop_promo_it != loop_graph_promotion_map.end()) { + maybe_promoted_inputs.push_back(inp_loop_promo_it->second); + an_input_was_promoted = true; + continue; + } + } + + // No promotion found. 
Just use the non-promoted domain + maybe_promoted_inputs.push_back(iel_inp_group->front()->as()); + } + + if (!an_input_was_promoted) { + // No inputs need promotion so just continue + continue; + } + + Expr* promoted_expr = findMatchingExpr( + iel_expr, + iel_graph, + maybe_promoted_inputs, + idGraph(IdMappingMode::LOOP)); + + bool replayed = false; + + if (!promoted_expr) { + promoted_expr = + id_model_.addReplayAs(maybe_promoted_inputs, iel_expr->front()); + replayed = true; + } + + // Mark outputs as having a promoted iter domain + std::vector out_groups = iel_graph.outputGroups(iel_expr); + NVF_ERROR(promoted_expr->outputs().size() == out_groups.size()); + NVF_ERROR( + ir_utils::filterByType(promoted_expr->outputs()).size() == + out_groups.size(), + "Unexpected non IterDomain outputs found: ", + promoted_expr->toString()); + + for (const auto i : c10::irange(out_groups.size())) { + // Promote if necessary, if the output is already in the same exact map + // it doesn't need a promotion. + if (idGraph(IdMappingMode::EXACT) + .disjointValSets() + .strictAreMapped( + promoted_expr->output(i), out_groups[i]->front())) { + continue; + } + iel_promotion_map[out_groups[i]] = + promoted_expr->output(i)->as(); + // Explicitly map loop map since expr propagation doesn't happen + if (replayed) { + idGraph(IdMappingMode::LOOP) + .mapVals(iel_expr->front()->output(i), promoted_expr->output(i)); + } + } + } +} + +void LoopPromotionMapBuilder::propagatePromotionsInIELGraph( + const ValGraph& iel_graph, + std::unordered_map& iel_promotion_map) { + propagatePromotionsInIELGraph( + iel_graph, iel_promotion_map, idGraph(IdMappingMode::LOOP), {}); +} + +namespace { + +// Returns for each ValGroup in provided IdGraph what the input ValGroups are +// traversing on definitions. Ignoring broadcast ValGroups and resetting inputs +// at RFactor ValGroups. 
+std::unordered_map computeCoveredGroups( + const ValGraph& graph, + const std::unordered_set& view_rfactor_ids) { + // Map from an exact iter domain group, to all the exact iter domain groups it + // covers + std::unordered_map covered_ids; + + for (const ValGroup& id_group : graph.disjointValSets().disjointSets()) { + // Initialize inputs + const ExprGroups& id_group_defs = graph.getDefinitions(id_group); + if (id_group_defs.empty()) { + covered_ids[id_group] = {id_group}; + } + + // Initialize rfactor groups + if (std::any_of(id_group->begin(), id_group->end(), [&](Val* id) { + return view_rfactor_ids.find(id->as()) != + view_rfactor_ids.end(); + })) { + covered_ids[id_group] = {id_group}; + } + + // Initialize broadcast groups to empty since broadcast domains + // don't matter for indexing + if (std::any_of(id_group->begin(), id_group->end(), [&](Val* id) { + return id->as()->isBroadcast(); + })) { + covered_ids[id_group] = {}; + } + } + + ValGraphStmtSort exact_stmt_sort(graph); + + for (const ExprGroup& exact_expr : exact_stmt_sort.exprs()) { + std::vector input_groups = graph.inputGroups(exact_expr); + + ValGroups covered; + for (const ValGroup& inp_group : input_groups) { + covered.pushBack(covered_ids.at(inp_group)); + } + + for (const ValGroup& output_group : graph.outputGroups(exact_expr)) { + // Don't overwrite initialized cases due to rfactor markings. + if (covered_ids.find(output_group) == covered_ids.end()) { + covered_ids[output_group] = covered; + } + } + } + + return covered_ids; +} + +}; // namespace + +std::unordered_map LoopPromotionMapBuilder:: + projectIELPromotionToLoopGraph( + const ValGraph& iel_graph, + const std::unordered_map& iel_promotion_map, + const ValGraph& loop_graph, + const StatefulInliningInfo& inlining_info) { + const std::unordered_map exact_covered_ids = + computeCoveredGroups( + idGraph(IdMappingMode::EXACT), id_model_.viewRfactorIds()); + + // Grab terminal iter domain in the loop groups. 
+ const VectorOfUniqueEntries terminal_loop_ids = + computeTerminalLoopIds(inlining_info); + + std::unordered_map loop_promotion_map; + + for (const ValGroup& loop_group : + loop_graph.disjointValSets().disjointSets()) { + IterDomain* promotion_id = findPromotionOfLoopGroup( + loop_group, + iel_graph, + iel_promotion_map, + exact_covered_ids, + terminal_loop_ids); + if (promotion_id) { + loop_promotion_map[loop_group] = promotion_id; + } + } + + return loop_promotion_map; +} + +IterDomain* LoopPromotionMapBuilder::findPromotionOfLoopGroup( + const ValGroup& loop_group, + const ValGraph& iel_graph, + const std::unordered_map& iel_promotion_map, + const std::unordered_map& exact_covered_ids, + const VectorOfUniqueEntries& terminal_loop_ids) { + const ValGraph& exact_graph = idGraph(IdMappingMode::EXACT); + + // Grab all the (potentially promoted) terminal iter domains in this group. + // Save the exact group and the iter domain in this vector. + std::vector> exact_promoted_terminal_ids; + for (auto loop_id : *loop_group) { + // If not a terminal id in the group skip + if (!terminal_loop_ids.has(loop_id->as())) { + continue; + } + + // Grab the iel entry. There can be iter domains that were added + // after the IEL graph was built. All the promotion information is + // associated with the domains that exist in the original graph, + // so the new domains can be simply ignored. + if (!iel_graph.hasGroup(loop_id)) { + continue; + } + + const ValGroup& iel_group = iel_graph.toGroup(loop_id); + + // Does it still need iel_promotion_map? The loop group already has + // the replayed domains, so we should be able to find it. + auto iel_promo_it = iel_promotion_map.find(iel_group); + if (iel_promo_it == iel_promotion_map.end()) { + // If this terminal ID doesn't have a promotion associated with it, save + // the terminal ID. 
+ exact_promoted_terminal_ids.emplace_back( + exact_graph.toGroup(loop_id), loop_id->as()); + } else { + // If this terminal ID has a promotion, grab the promoted ID. + exact_promoted_terminal_ids.emplace_back( + exact_graph.toGroup(iel_promo_it->second), iel_promo_it->second); + } + } + + // All the exact groups of the iter domains in the loop group + ValGroups exact_groups = exact_graph.toGroups(*loop_group); + + // All exact groups covered by all iter domains in this loop group + ValGroups loop_group_covered_ids; + for (const ValGroup& exact_group : exact_groups) { + auto covered_it = exact_covered_ids.find(exact_group); + NVF_ERROR(covered_it != exact_covered_ids.end()); + loop_group_covered_ids.pushBack(covered_it->second); + } + + // Check if any of the candidate Iter Domains we collected cover all the + // exact groups of loop_group_covered_ids. If so, that's the correct + // promoted iter domain of this group. + for (const auto& entry : exact_promoted_terminal_ids) { + const ValGroup& terminal_id_group = entry.first; + IterDomain* terminal_id = entry.second; + auto covered_it = exact_covered_ids.find(terminal_id_group); + NVF_ERROR(covered_it != exact_covered_ids.end()); + if (loop_group_covered_ids.computeSubtract(covered_it->second).empty()) { + return terminal_id; + } + } + + return nullptr; +} + +VectorOfUniqueEntries LoopPromotionMapBuilder:: + computeTerminalLoopIds(const StatefulInliningInfo& info) { + VectorOfUniqueEntries terminal_loop_ids; + for (const ValGroup& group : + idGraph(IdMappingMode::LOOP).disjointValSets().disjointSets()) { + if (group->size() == 1) { + terminal_loop_ids.pushBack(group->front()->as()); + } + + // Don't select producer iter domains + for (auto loop_id : *group) { + if (info.p2c_ca_permissive_maps.find(loop_id->as()) != + info.p2c_ca_permissive_maps.end()) { + continue; + } + + // It's terminal if there's no use group + auto uses_it = id_model_.idUses().find(loop_id->as()); + if (uses_it == id_model_.idUses().end() || 
uses_it->second.empty()) { + terminal_loop_ids.pushBack(loop_id->as()); + continue; + } + + // If there's an output group that is not in the same group, + // then it's a terminal ID + bool all_outs_in_loop_group = true; + for (auto use : uses_it->second) { + if (std::any_of( + use->outputs().begin(), + use->outputs().end(), + [&](Val* out) -> bool { + return group != idGraph(IdMappingMode::LOOP).toGroup(out); + })) { + all_outs_in_loop_group = false; + break; + } + } + + if (!all_outs_in_loop_group) { + terminal_loop_ids.pushBack(loop_id->as()); + } + } + } + return terminal_loop_ids; +} - sanityCheckLoopPromotionMap(final_loop_promotion_map); -#endif +void LoopPromotionMapBuilder::sanityCheckLoopPromotionMap( + const std::unordered_map& loop_promotion_map) const { + const auto& loop_graph = idGraph(IdMappingMode::LOOP); + for (const ValGroup& loop_group : + loop_graph.disjointValSets().disjointSets()) { + // Non-leaf loop groups are not guaranteed to have valid + // promotions. See for example FusionRepro1713, where root domains + // are all grouped together but there's no valid promotion. + if (loop_graph.hasUses(loop_group)) { + continue; + } + // Make sure the loop group is promoted to a domain that is mapped + // in the LOOP graph + auto promotion_it = loop_promotion_map.find(loop_group); + NVF_ERROR( + promotion_it != loop_promotion_map.end(), + "Loop promotion not found for ", + nvfuser::toString(loop_group)); + IterDomain* promotion = promotion_it->second; + // Make sure the promotion domain is also loop-mapped + NVF_ERROR( + loop_group->has(promotion), + "Loop promotion not loop-mapped. Loop group: ", + nvfuser::toString(loop_group), + ". 
Promotion domain: ", + promotion->name()); + } } std::unordered_map LoopPromotionMapBuilder::get( diff --git a/csrc/id_model/loop_promotion.h b/csrc/id_model/loop_promotion.h index dfc9812fe09..c2f647d81a6 100644 --- a/csrc/id_model/loop_promotion.h +++ b/csrc/id_model/loop_promotion.h @@ -30,6 +30,90 @@ class LoopPromotionMapBuilder { void build(); + ValGraph& idGraph(IdMappingMode mode); + const ValGraph& idGraph(IdMappingMode mode) const; + + std::unordered_map buildInlineRootResolutionMap( + const ValGraph& iel_graph, + const StatefulInliningInfo& info) const; + + // Helper function for building loop promotion map. + // + // Propagate promotion mappings from root IEL groups to intermediate + // and leaf IEL groups by traversing IEL exprs. For each expr, if an + // input is promoted, the output needs to be promoted too. If + // there's already an equivalent expr that uses the promoted inputs, + // create a mapping from the outputs of the IEL expr to the outputs + // of the equivalent expr. We only consider exprs that are mapped + // in the loop graph as we are looking for domains that represent + // the actual loops of the input and output domains of the IEL + // expr. If no such expr is found, the IEL expr is replayed with the + // promoted inputs. + // + // This is used twice when building the promotion map. The first time + // it is used there's no loop graph promotion yet, so only the IEL + // promotions are propagated. In that case, loop_graph_promotion_map + // should be just empty. + // + // Propagation uses iel_promotion_map and + // loop_graph_promotion_map. If both are available for an IEL group, + // the former has the precedence. 
This is because when this function + // is used for step 4, the given iel_promotion_map starts as an + // empty map and gets populated during this propagation, so any + // mapping in the map is guaranteed to be the correct final mapping, + // whereas the loop graph may have invalid mappings for partially + // inlined domains. + void propagatePromotionsInIELGraph( + const ValGraph& iel_graph, + std::unordered_map& iel_promotion_map, + const ValGraph& loop_graph, + const std::unordered_map& loop_promotion_map); + + // Same as the other propagatePromotionsInIELGraph but without loop + // graph map. This is used for step 2, where there's no loop + // graph map yet. + void propagatePromotionsInIELGraph( + const ValGraph& iel_graph, + std::unordered_map& iel_promotion_map); + + // Given an IEL promotion map, identify the mapping of each loop + // group. The promotion must represent all the domains in each loop + // group. If a valid representative promotion is not found for a + // loop group, no mapping is added for the group. + std::unordered_map projectIELPromotionToLoopGraph( + const ValGraph& iel_graph, + const std::unordered_map& iel_promotion_map, + const ValGraph& loop_graph, + const StatefulInliningInfo& inlining_info); + + // Find a promoted iter domain of a given loop group that covers all + // the exact groups representative of the resolved transformations + // within the loop group. Specifically, we examine each IEL group of + // the loop group, and if an IEL group has a promotion, we consider it as a + // candidate of the promotion of this loop group. If not, we include a + // domain of the IEL group as a candidate too. Once all candidates are + // obtained, we pick one that covers all the exact domains (cf. 
concrete + // domains in ComputeAtMap) + IterDomain* findPromotionOfLoopGroup( + const ValGroup& loop_group, + const ValGraph& iel_graph, + const std::unordered_map& iel_promotion_map, + const std::unordered_map& exact_covered_ids, + const VectorOfUniqueEntries& terminal_loop_ids); + + // Terminal loop ids are iteration domains in each loop group that: + // 1) Don't have an entry in p2c_ca_permissive_maps, which would mean a + // consumer TV's iter domain maps to this domain in a way that that domain + // is also in the same loop group + // 2) Don't have a direct IterDomain consumer within the group + VectorOfUniqueEntries computeTerminalLoopIds( + const StatefulInliningInfo& info); + + // Basic consistency check of the given loop promotion map + void sanityCheckLoopPromotionMap( + const std::unordered_map& loop_promotion_map) + const; + private: IdModel& id_model_; const StatefulInliningInfo& inlining_info_; From 31d28e570f30f10b86b0a211bd61420eee3d3210 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Fri, 10 May 2024 12:59:43 -0700 Subject: [PATCH 03/13] enable idmodel --- csrc/device_lower/lower2device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index 76181bd72b4..f988ce65c4f 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -391,7 +391,7 @@ void GpuLower::analysis(Fusion* fusion) { // functionality should be affected. 
New IterDomains may be created, // so it is expected that generated code may use diffrent variable // names - if (isOptionEnabled(EnableOption::IdModel)) { + if (true || isOptionEnabled(EnableOption::IdModel)) { IdModel id_model(fusion_); } From d0bdc8ede7e9737af9234f09bf4382c6c541cc71 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Fri, 10 May 2024 13:02:56 -0700 Subject: [PATCH 04/13] Switch to the new builder --- csrc/id_model/id_model.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/csrc/id_model/id_model.cpp b/csrc/id_model/id_model.cpp index ff5d3fd5cb7..d3d8446f96e 100644 --- a/csrc/id_model/id_model.cpp +++ b/csrc/id_model/id_model.cpp @@ -574,15 +574,13 @@ void IdModel::buildLoopGraph() { validateLoopGraphHasNoSelfMappedLeafDomains(); - loop_promotion_map_ = buildLoopPromotionMap(inlining_info); + loop_promotion_map_ = LoopPromotionMapBuilder::get(*this, inlining_info); // New domains are added. Make sure there's still no self mapping in // the leaf domains validateLoopGraphHasNoSelfMappedLeafDomains(); idGraph(IdMappingMode::LOOP).validateConsistency(); - - auto loop_promotion_map2 = LoopPromotionMapBuilder::get(*this, inlining_info); } std::unordered_map IdModel::buildLoopPromotionMap( From aa9b414e3f0ba2674821dca1ed8b637355ad0bd9 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Fri, 10 May 2024 13:08:48 -0700 Subject: [PATCH 05/13] const --- csrc/id_model/loop_promotion.cpp | 6 +++--- csrc/id_model/loop_promotion.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/csrc/id_model/loop_promotion.cpp b/csrc/id_model/loop_promotion.cpp index 73af2c3ee08..68491e1a9c5 100644 --- a/csrc/id_model/loop_promotion.cpp +++ b/csrc/id_model/loop_promotion.cpp @@ -606,7 +606,7 @@ std::unordered_map LoopPromotionMapBuilder:: const ValGraph& iel_graph, const std::unordered_map& iel_promotion_map, const ValGraph& loop_graph, - const StatefulInliningInfo& inlining_info) { + const StatefulInliningInfo& inlining_info) 
const { const std::unordered_map exact_covered_ids = computeCoveredGroups( idGraph(IdMappingMode::EXACT), id_model_.viewRfactorIds()); @@ -638,7 +638,7 @@ IterDomain* LoopPromotionMapBuilder::findPromotionOfLoopGroup( const ValGraph& iel_graph, const std::unordered_map& iel_promotion_map, const std::unordered_map& exact_covered_ids, - const VectorOfUniqueEntries& terminal_loop_ids) { + const VectorOfUniqueEntries& terminal_loop_ids) const { const ValGraph& exact_graph = idGraph(IdMappingMode::EXACT); // Grab all the (potentially promoted) terminal iter domains in this group. @@ -703,7 +703,7 @@ IterDomain* LoopPromotionMapBuilder::findPromotionOfLoopGroup( } VectorOfUniqueEntries LoopPromotionMapBuilder:: - computeTerminalLoopIds(const StatefulInliningInfo& info) { + computeTerminalLoopIds(const StatefulInliningInfo& info) const { VectorOfUniqueEntries terminal_loop_ids; for (const ValGroup& group : idGraph(IdMappingMode::LOOP).disjointValSets().disjointSets()) { diff --git a/csrc/id_model/loop_promotion.h b/csrc/id_model/loop_promotion.h index c2f647d81a6..bbde115a623 100644 --- a/csrc/id_model/loop_promotion.h +++ b/csrc/id_model/loop_promotion.h @@ -84,7 +84,7 @@ class LoopPromotionMapBuilder { const ValGraph& iel_graph, const std::unordered_map& iel_promotion_map, const ValGraph& loop_graph, - const StatefulInliningInfo& inlining_info); + const StatefulInliningInfo& inlining_info) const; // Find a promoted iter domain of a given loop group that covers all // the exact groups representative of the resolved transformations @@ -99,7 +99,7 @@ class LoopPromotionMapBuilder { const ValGraph& iel_graph, const std::unordered_map& iel_promotion_map, const std::unordered_map& exact_covered_ids, - const VectorOfUniqueEntries& terminal_loop_ids); + const VectorOfUniqueEntries& terminal_loop_ids) const; // Terminal loop ids are iteration domains in each loop group that: // 1) Don't have an entry in p2c_ca_permissive_maps, which would mean a @@ -107,7 +107,7 @@ class 
LoopPromotionMapBuilder { // is also in the same loop group // 2) Don't have a direct IterDomain consumer within the group VectorOfUniqueEntries computeTerminalLoopIds( - const StatefulInliningInfo& info); + const StatefulInliningInfo& info) const; // Basic consistency check of the given loop promotion map void sanityCheckLoopPromotionMap( From 7aef29d2d73ea668568a1071c16818317b604cb7 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Fri, 10 May 2024 13:15:23 -0700 Subject: [PATCH 06/13] cleanup --- csrc/id_model/loop_promotion.cpp | 13 +++++++------ csrc/id_model/loop_promotion.h | 3 +-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/csrc/id_model/loop_promotion.cpp b/csrc/id_model/loop_promotion.cpp index 68491e1a9c5..6ea37b7f5f6 100644 --- a/csrc/id_model/loop_promotion.cpp +++ b/csrc/id_model/loop_promotion.cpp @@ -26,7 +26,7 @@ const ValGraph& LoopPromotionMapBuilder::idGraph(IdMappingMode mode) const { return id_model_.idGraph(mode); } -void LoopPromotionMapBuilder::build() { +std::unordered_map LoopPromotionMapBuilder::build() { // Make an intersection of the exact and loop map. This will group together // entries in each loop group that are exact with each other. This provides a // better graph to do promotion and replays. @@ -97,7 +97,7 @@ void LoopPromotionMapBuilder::build() { // Step 5: Find the final promotion of each loop group based on the // final IEL promotion map - loop_promotion_map_ = projectIELPromotionToLoopGraph( + auto final_loop_promotion_map = projectIELPromotionToLoopGraph( iel_graph, final_iel_promotion_map, idGraph(IdMappingMode::LOOP), @@ -146,10 +146,12 @@ void LoopPromotionMapBuilder::build() { // Insert the updated Step-3 results into the Step-5 resutls. Note // that this insertion does not overwrite the existing mappings. 
- loop_promotion_map_.insert( + final_loop_promotion_map.insert( initial_loop_promotion_map.begin(), initial_loop_promotion_map.end()); - sanityCheckLoopPromotionMap(loop_promotion_map_); + sanityCheckLoopPromotionMap(final_loop_promotion_map); + + return final_loop_promotion_map; } std::unordered_map LoopPromotionMapBuilder:: @@ -781,8 +783,7 @@ std::unordered_map LoopPromotionMapBuilder::get( IdModel& id_model, const StatefulInliningInfo& inlining_info) { LoopPromotionMapBuilder builder(id_model, inlining_info); - builder.build(); - return builder.loop_promotion_map_; + return builder.build(); } } // namespace nvfuser diff --git a/csrc/id_model/loop_promotion.h b/csrc/id_model/loop_promotion.h index bbde115a623..a16b8220fb9 100644 --- a/csrc/id_model/loop_promotion.h +++ b/csrc/id_model/loop_promotion.h @@ -28,7 +28,7 @@ class LoopPromotionMapBuilder { IdModel& id_model, const StatefulInliningInfo& inlining_info); - void build(); + std::unordered_map build(); ValGraph& idGraph(IdMappingMode mode); const ValGraph& idGraph(IdMappingMode mode) const; @@ -117,7 +117,6 @@ class LoopPromotionMapBuilder { private: IdModel& id_model_; const StatefulInliningInfo& inlining_info_; - std::unordered_map loop_promotion_map_; }; } // namespace nvfuser From 24a0c8ce33f9be2278f60ffb6d6705374bac759e Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Fri, 10 May 2024 13:25:13 -0700 Subject: [PATCH 07/13] Remove loop promotion code from IdModel --- csrc/id_model/id_model.cpp | 751 ------------------------------- csrc/id_model/id_model.h | 85 ---- csrc/id_model/loop_promotion.cpp | 6 +- 3 files changed, 3 insertions(+), 839 deletions(-) diff --git a/csrc/id_model/id_model.cpp b/csrc/id_model/id_model.cpp index d3d8446f96e..b436e1fc1f1 100644 --- a/csrc/id_model/id_model.cpp +++ b/csrc/id_model/id_model.cpp @@ -583,263 +583,6 @@ void IdModel::buildLoopGraph() { idGraph(IdMappingMode::LOOP).validateConsistency(); } -std::unordered_map IdModel::buildLoopPromotionMap( - const 
StatefulInliningInfo& inlining_info) { - // Make an intersection of the exact and loop map. This will group together - // entries in each loop group that are exact with each other. This provides a - // better graph to do promotion and replays. - // - // It's tempting to use the intersection of the almost exact and loop, but we - // need to model broadcast promotion, and if we have two tensors like: - // - // T1[i0, b1] = T0[i0] - // T2[i0, b2] = T0[i0] - // Then resolution of: - // T4 = T1[i0, b1] + T3[i0, i1] - // T6 = T2[i0, b2] + T5[i0, i2] - // - // Then merge(0, 1) with all tensors except for T0 - // - // The almost exact map will map i0, i0*b1, and i0*b2 together, but b1 and b2 - // are being resolved to i1 and i2 respectively. So we want to have separate - // entries so we can have an easy to process promotion map. - // - // Loop is a permissive like map, it could have many entries, use the exact - // map as the one we iterate on to reduce complexity as it hopefully has - // smaller groups and this algorithm scales with the number of groups * - // (number of entries in groups ^ 2) - // - // iel stands for Intersection of the Exact and Loop graphs. - ValGraph iel_graph = buildIntersection( - idGraph(IdMappingMode::EXACT), idGraph(IdMappingMode::LOOP), false); - - // Step 1: Build a map of the IEL groups of root broadcast domains - // to resolving domains. - std::unordered_map iel_promotion_map = - buildInlineRootResolutionMap(iel_graph, inlining_info); - - // Step 2: Propagate the root promotions to intermediate and leaf groups. - // At this point, the promotion may not be final as the analysis is - // localized to IEL groups. The map is used in the next step to - // build mappings of the loop groups. - propagatePromotionsInIELGraph(iel_graph, iel_promotion_map); - - // Step 3: Determine the promotion of each loop graph based on the - // IEL promotion map. 
For each loop group, examine all the IEL - // promotions and find the most representative one that captures all - // the dependent input domains of the loop group - std::unordered_map loop_promotion_map = - projectIELPromotionToLoopGraph( - iel_graph, - iel_promotion_map, - idGraph(IdMappingMode::LOOP), - inlining_info); - - // At this point, most of loop groups should have correct promoted - // IDs. However, non-inlined loop groups may miss promotion that - // should be propagated from parent ID groups, e.g., iS50 of T2 in - // Indexing19. Its parent ID loop group is promoted, but the loop - // group of iS50 is not found yet. - - // Step 4: In order to fully propagate the loop graph promotions, first - // propagate them to the IEL groups, which are then used to - // propagate back to the loop groups in Step 5. Unlike Step 2, the - // initial IEL promotion map is empty and is populated with the loop - // promotion map as we traverse down the IEL graph. - std::unordered_map final_iel_promotion_map; - propagatePromotionsInIELGraph( - iel_graph, - final_iel_promotion_map, - idGraph(IdMappingMode::LOOP), - loop_promotion_map); - - // Step 5: Find the final promotion of each loop group based on the - // final IEL promotion map - auto final_loop_promotion_map = projectIELPromotionToLoopGraph( - iel_graph, - final_iel_promotion_map, - idGraph(IdMappingMode::LOOP), - inlining_info); - - // The promotion map produced in Step 5 only includes those are - // further propagated at Step 4, so the correct mappings produced at - // Step 3 may not be included in the Step-5 results. Any Step-3 mappings - // that are not found in the Step-5 results are already valid - // results, so merge them into the Step-5 results. - // - // For example, in the below case, nothing will be propated at Step - // 4. 
- // - // t0: [i0] - // t1: [i1, i2] - // t2 = broadcast(t0, {true, false}) - // t3 = t2 + t1 - // - // t2: [b3, i4] - // t3: [i5, i6] - // - // t3->merge(0) - // propagate-and-inline-most - // - // t0: [i0] ca_pos(1) - // t1: [i1*i2] ca_pos(1) - // t2: [b3*i4] ca_pos(1) - // t3: [i5*i6] - // - // In this case, all domains will be grouped together and there will - // be just a single group in the Loop graph: - // - // - {i0, i1, i2, b3, i4, i5, i6, i1*i2, b3*i4, i5*i6} - // - // Step 3 will identify i5*i6 is the promotion domain. Since all - // domains are promoted to i5*i6, there will be no propagation in - // Step 4 (i.e., loop_promote_inputs will be false). Since the - // result of Step 4 is empty, the Step 5 result will also be empty, - // but that just means there's no change is necessary from the Step - // 3 results. - - // Update the Step-3 map to the latest LOOP graph - loop_promotion_map = - updateValGroupIdMap(loop_promotion_map, idGraph(IdMappingMode::LOOP)); - - // Insert the updated Step-3 results into the Step-5 resutls. Note - // that this insertion does not overwrite the existing mappings. - final_loop_promotion_map.insert( - loop_promotion_map.begin(), loop_promotion_map.end()); - - sanityCheckLoopPromotionMap(final_loop_promotion_map); - - return final_loop_promotion_map; -} - -std::unordered_map IdModel::buildInlineRootResolutionMap( - const ValGraph& iel_graph, - const StatefulInliningInfo& info) const { - std::unordered_map iel_promotion_map; - - // This should probably work just on terminating inputs, as we shouldn't be - // able to modify a broadcast domain between root and rfactor which would be - // required to resolve a non input broadcast domain. But for now leaving it as - // traversal on all broadcast groups. - // - - // We first visit all broadcast root domains. If a broadcast is - // resovled, see if it's promoted. Note that a domain be resolved to - // a domain that may not be loop mapped, yet it can still be - // promoted. 
In other words, there can be a domain that is exactly - // mapped with the resolving domain *and* is mapped with the - // broadcast domain by the loop map. The algorihm here is: - // - // 1. For a broadcast domain, find the domain that the broadcast is - // resolved to. - // 2. If the resolving domain is also loop-mapped with the - // broadcast, that is the promotion domain, but the resolving - // domain may not be loop mapped as mentioned above. Instead, - // find all loop-mapped domains with the broadcast domain and - // pick one that is exactly mapped with the resolving domain - // - // Note again this process is only done for root domains. Once we - // find promotion relationships for root domains, we propagate the - // mappings to derived domains - for (const ValGroup& iel_group : iel_graph.disjointValSets().disjointSets()) { - NVF_ERROR(!iel_group->empty()); - - IterDomain* iel_group_id = iel_group->front()->as(); - - if (!iel_group_id->isBroadcast()) { - continue; - } - - // Collect all the exact groups of the resolutions of the broadcast id's - ValGroups resolved_exact_groups; - for (Val* bcast_id : *iel_group) { - if (auto p2c_root_broadcast_resolution_map_it = - info.p2c_root_broadcast_resolution_map.find( - bcast_id->as()); - p2c_root_broadcast_resolution_map_it != - info.p2c_root_broadcast_resolution_map.end()) { - resolved_exact_groups.pushBack( - idGraph(IdMappingMode::EXACT) - .toGroups(p2c_root_broadcast_resolution_map_it->second)); - } - } - - if (resolved_exact_groups.empty()) { - // No resolution - continue; - } - - // resolved_exact_groups is a list of IDs that resolves the - // broadcast. We only care those that are also in the same loop - // group, and there must be just one or none. Otherwise, the - // resolution is ambiguous. 
- - // Collect all the exact groups in the loop set containing this iel_group - const ValGroup& loop_group = - idGraph(IdMappingMode::LOOP).toGroup(iel_group_id); - ValGroups loop_covered_exact_groups = - idGraph(IdMappingMode::EXACT).toGroups(*loop_group); - - // The intersection of the exact groups that the broadcast domains can be - // broadcasted to, and those that exist within the same loop groop are is - // the promotion needed for this iel_group. The promotion should - // be none or unique. - ValGroups loop_exact_resolved_intersection = - resolved_exact_groups.computeIntersect(loop_covered_exact_groups); - - if (loop_exact_resolved_intersection.empty()) { - // No promotion - continue; - } - - if (loop_exact_resolved_intersection.size() > 1) { - // Ambiguous promotion. This should not happen. - std::stringstream err_msg; - err_msg - << "Invalid multiple broadcast resolution within shared loops detected, group:\n " - << iel_group->toString() << "\nIs being broadcasted to:"; - for (const ValGroup& entry : loop_exact_resolved_intersection) { - err_msg << "\n " << entry->toString(); - } - NVF_ERROR(false, err_msg.str()); - } - - const ValGroup& exact_resolution_group = - loop_exact_resolved_intersection.front(); - - // Within the loop group, find the IDs that the broadcast IDs are - // resolved to - VectorOfUniqueEntries resolved_ids = - exact_resolution_group->computeIntersect(*loop_group); - - NVF_ERROR(!resolved_ids.empty()); - - // All the IDs in resolved_ids are mapped with both of the exact - // and loop graphs, so any of them can be used as an IEL promotion - // ID. Just to make it extra clear, look for corresponding - // groups in the IEL graph and make sure there's only one such group. 
- ValGroups promoted_iel_groups = iel_graph.toGroups(resolved_ids); - - NVF_ERROR(!promoted_iel_groups.empty()); - - if (promoted_iel_groups.size() > 1) { - std::stringstream err_msg; - err_msg - << "Invalid multiple broadcast resolution within shared loops detected, group:\n " - << iel_group->toString() << "\nIs being broadcasted to:"; - for (const ValGroup& entry : promoted_iel_groups) { - err_msg << "\n " << entry->toString(); - } - NVF_ERROR(false, err_msg.str()); - } - - iel_promotion_map[iel_group] = - promoted_iel_groups.front()->front()->as(); - } - - return iel_promotion_map; -} - void IdModel::buildAllGraphs() { if (tvs_.empty()) { return; @@ -935,266 +678,6 @@ ValGraph IdModel::buildIntersection( return intersection; } -namespace { - -// Check if there's an equivalent expression as iel_expr that uses -// maybe_promoted_inputs. This is used to avoid redundantly replaying -// expressions. -// NOTE: This is currently overly conservative and some -// opportunities for reuse are lost, althought it doesn't affect -// the correctness of the analysis. -Expr* findMatchingExpr( - const ExprGroup& iel_expr, - const ValGraph& iel_graph, - const std::vector& maybe_promoted_inputs, - const ValGraph& loop_graph) { - // If any of domains in maybe_promoted_inputs is not found in - // iel_graph, it means the domain is just replayed and by definition - // has no mapping with any existing domain, which means there's no - // matching expr. - if (std::any_of( - maybe_promoted_inputs.begin(), - maybe_promoted_inputs.end(), - [&](IterDomain* maybe_promoted_input) -> bool { - return !iel_graph.hasGroup(maybe_promoted_input); - })) { - return nullptr; - } - - // Grab all eligible uses of the promoted inputs. - // Note that any eligible matching expr should be a use of all - // inputs in maybe_promoted_input_uses, no matter it's promoted or - // not. So it isn't necessary to look at all of - // maybe_promoted_input_uses but just need to grab one. 
- NVF_ERROR(!maybe_promoted_inputs.empty()); - ExprGroups maybe_promoted_input_uses = - iel_graph.getUses(iel_graph.toGroup(maybe_promoted_inputs.front())); - - if (maybe_promoted_input_uses.empty()) { - return nullptr; - } - - // Look for exprs that have inputs that are mapped in the IEL - // graph with the (promoted) inputs of iel_expr. - for (const ExprGroup& maybe_promoted_input_use_group : - maybe_promoted_input_uses) { - NVF_ERROR(!maybe_promoted_input_use_group->empty()); - // maybe_promoted_inputs may include non-promoted inputs as - // well, so maybe_promoted_input_uses may include the original - // iel_expr itself. Since there must at least be a promoted input, - // iel_expr itself should not be an expr group we are looking for. - if (iel_expr == maybe_promoted_input_use_group) { - continue; - } - Expr* maybe_promoted_input_use = maybe_promoted_input_use_group->front(); - if (!iel_expr->front()->sameOp(maybe_promoted_input_use)) { - continue; - } - // Check if all inputs are mapped - NVF_ERROR( - maybe_promoted_inputs.size() == - maybe_promoted_input_use->inputs().size()); - bool all_inputs_match = true; - for (const auto inp_i : c10::irange(maybe_promoted_inputs.size())) { - all_inputs_match = all_inputs_match && - iel_graph.disjointValSets().strictAreMapped( - maybe_promoted_inputs[inp_i], - maybe_promoted_input_use->inputs().at(inp_i)); - } - if (!all_inputs_match) { - continue; - } - - // We always want to find promotions within the same loop - // groups since we are looking for domains that represent actual - // loops. Note that that's guaranteed when a new domain is - // replayed instead of reusing an existing domain. - if (!loop_graph.disjointExprSets().permissiveAreMapped( - iel_expr->front(), maybe_promoted_input_use_group->front())) { - continue; - } - // This is just an extra sanity check. 
Make sure all exprs in - // the use group are mapped - NVF_ERROR( - std::all_of( - maybe_promoted_input_use_group->vector().begin(), - maybe_promoted_input_use_group->vector().end(), - [&](Expr* iel_use) { - return loop_graph.disjointExprSets().permissiveAreMapped( - iel_expr->front(), iel_use); - }), - "Not all mapped: ", - nvfuser::toString(iel_expr), - "\n", - nvfuser::toString(maybe_promoted_input_use_group)); - - return maybe_promoted_input_use; - } - - return nullptr; -} - -// When propagating loop promotions from inputs to outputs of an IEL -// expr, we can't blindly apply loop promotion when all of the input -// domains are loop mapped with the outputs. -// -// i.e. if we have the inlined domains from: -// Inputs: -// T0[i0] -// T1[i0, i1] -// -// T2[i0, b2] = broadcast(T0) -// T3[i0, i1] = T2 + T1 -// -// {T1, T2, T3}->merge(0, 1) -// inlineMost -// -// The inlined loop group would consist of: -// -// {i0, i1, b2, i0*b2, i0*i1} -// -// Note that all these domains would have promotion to i0*i1 at the -// end of Step 3. When the IEL expression of merge(i0, i1) is visited by -// propagatePromotionsInIELGraph again, the promotion to i0*i1 of both -// inputs would be propagated to its output, resulting in promotion of -// i0*i1 to (i0*i1)*(i0*i1), which is not the correct propagation. -// -// Therefore only promote i0*b1 to i0*i1, or i0*i1 to i0*i1 (i.e. don't -// promote an input to any transformation within the loop group). -// -// So if we have an iel_expr make sure its inputs and outputs are not in -// the same loop group. 
-bool hasUniqueInputLoopGroups( - const ExprGroup& iel_expr, - const ValGraph& iel_graph, - const ValGraph& loop_graph) { - const std::vector iel_inp_groups = iel_graph.inputGroups(iel_expr); - - const std::vector iel_out_groups = iel_graph.outputGroups(iel_expr); - - ValGroups inp_loop_groups; - for (const ValGroup& iel_inp_group : iel_inp_groups) { - inp_loop_groups.pushBack(loop_graph.toGroup(iel_inp_group->front())); - } - ValGroups out_loop_groups; - for (const ValGroup& iel_out_group : iel_out_groups) { - out_loop_groups.pushBack(loop_graph.toGroup(iel_out_group->front())); - } - - // Check if input groups that are not included in the output group set - return !inp_loop_groups.computeSubtract(out_loop_groups).empty(); -} - -} // namespace - -void IdModel::propagatePromotionsInIELGraph( - const ValGraph& iel_graph, - std::unordered_map& iel_promotion_map, - const ValGraph& loop_graph, - const std::unordered_map& loop_graph_promotion_map) { - // In order to make this traversal work, the traversal order must be - // topologically sorted. - ValGraphStmtSort iel_stmt_sort(iel_graph); - - for (const ExprGroup& iel_expr : iel_stmt_sort.exprs()) { - NVF_ERROR(!iel_expr->empty()); - const std::vector iel_inp_groups = - iel_graph.inputGroups(iel_expr); - - // Check if any inputs need promotion indicating this expr group needs to - // be replayed with promoted inputs - bool an_input_was_promoted = false; - std::vector maybe_promoted_inputs; - maybe_promoted_inputs.reserve(iel_inp_groups.size()); - - // Propagate loop graph promotion only when the inputs and outputs are - // not in the same loop group. - const bool loop_promote_inputs = !loop_graph_promotion_map.empty() && - hasUniqueInputLoopGroups(iel_expr, iel_graph, loop_graph); - - for (const ValGroup& iel_inp_group : iel_inp_groups) { - // Assumed all inputs are IterDomains - NVF_ERROR(iel_inp_group->front()->isA()); - - // Propagate IEL promotions when available. 
- if (auto inp_promo_it = iel_promotion_map.find(iel_inp_group); - inp_promo_it != iel_promotion_map.end()) { - maybe_promoted_inputs.push_back(inp_promo_it->second); - an_input_was_promoted = true; - continue; - } - - // Promote loops based on the loop promotion map. If the loop promotion - // map should be used and has an entry we should use that promotion. - if (loop_promote_inputs) { - const ValGroup& loop_copy_group = - loop_graph.toGroup(iel_inp_group->front()); - auto inp_loop_promo_it = loop_graph_promotion_map.find(loop_copy_group); - if (inp_loop_promo_it != loop_graph_promotion_map.end()) { - maybe_promoted_inputs.push_back(inp_loop_promo_it->second); - an_input_was_promoted = true; - continue; - } - } - - // No promotion found. Just use the non-promoted domain - maybe_promoted_inputs.push_back(iel_inp_group->front()->as()); - } - - if (!an_input_was_promoted) { - // No inputs need promotion so just continue - continue; - } - - Expr* promoted_expr = findMatchingExpr( - iel_expr, - iel_graph, - maybe_promoted_inputs, - idGraph(IdMappingMode::LOOP)); - - bool replayed = false; - - if (!promoted_expr) { - promoted_expr = addReplayAs(maybe_promoted_inputs, iel_expr->front()); - replayed = true; - } - - // Mark outputs as having a promoted iter domain - std::vector out_groups = iel_graph.outputGroups(iel_expr); - NVF_ERROR(promoted_expr->outputs().size() == out_groups.size()); - NVF_ERROR( - ir_utils::filterByType(promoted_expr->outputs()).size() == - out_groups.size(), - "Unexpected non IterDomain outputs found: ", - promoted_expr->toString()); - - for (const auto i : c10::irange(out_groups.size())) { - // Promote if necessary, if the output is already in the same exact map - // it doesn't need a promotion. 
- if (idGraph(IdMappingMode::EXACT) - .disjointValSets() - .strictAreMapped( - promoted_expr->output(i), out_groups[i]->front())) { - continue; - } - iel_promotion_map[out_groups[i]] = - promoted_expr->output(i)->as(); - // Explicitly map loop map since expr propagation doesn't happen - if (replayed) { - idGraph(IdMappingMode::LOOP) - .mapVals(iel_expr->front()->output(i), promoted_expr->output(i)); - } - } - } -} - -void IdModel::propagatePromotionsInIELGraph( - const ValGraph& iel_graph, - std::unordered_map& iel_promotion_map) { - propagatePromotionsInIELGraph( - iel_graph, iel_promotion_map, idGraph(IdMappingMode::LOOP), {}); -} - // Replay Expr but with the inputs provided. Expr* IdModel::addReplayAs(std::vector new_inputs, Expr* expr) { // Figure out which graphs are already initialized to make sure we add the new @@ -1311,240 +794,6 @@ Expr* IdModel::addReplayAs(std::vector new_inputs, Expr* expr) { return replay; } -namespace { - -// Returns for each ValGroup in provided IdGraph what the input ValGroups are -// traversing on definitions. Ignoring broadcast ValGroups and resetting inputs -// at RFactor ValGroups. 
-std::unordered_map computeCoveredGroups( - const ValGraph& graph, - const std::unordered_set& view_rfactor_ids) { - // Map from an exact iter domain group, to all the exact iter domain groups it - // covers - std::unordered_map covered_ids; - - for (const ValGroup& id_group : graph.disjointValSets().disjointSets()) { - // Initialize inputs - const ExprGroups& id_group_defs = graph.getDefinitions(id_group); - if (id_group_defs.empty()) { - covered_ids[id_group] = {id_group}; - } - - // Initialize rfactor groups - if (std::any_of(id_group->begin(), id_group->end(), [&](Val* id) { - return view_rfactor_ids.find(id->as()) != - view_rfactor_ids.end(); - })) { - covered_ids[id_group] = {id_group}; - } - - // Initialize broadcast groups to empty since broadcast domains - // don't matter for indexing - if (std::any_of(id_group->begin(), id_group->end(), [&](Val* id) { - return id->as()->isBroadcast(); - })) { - covered_ids[id_group] = {}; - } - } - - ValGraphStmtSort exact_stmt_sort(graph); - - for (const ExprGroup& exact_expr : exact_stmt_sort.exprs()) { - std::vector input_groups = graph.inputGroups(exact_expr); - - ValGroups covered; - for (const ValGroup& inp_group : input_groups) { - covered.pushBack(covered_ids.at(inp_group)); - } - - for (const ValGroup& output_group : graph.outputGroups(exact_expr)) { - // Don't overwrite initialized cases due to rfactor markings. - if (covered_ids.find(output_group) == covered_ids.end()) { - covered_ids[output_group] = covered; - } - } - } - - return covered_ids; -} - -}; // namespace - -std::unordered_map IdModel:: - projectIELPromotionToLoopGraph( - const ValGraph& iel_graph, - const std::unordered_map& iel_promotion_map, - const ValGraph& loop_graph, - const StatefulInliningInfo& inlining_info) { - const std::unordered_map exact_covered_ids = - computeCoveredGroups(idGraph(IdMappingMode::EXACT), view_rfactor_ids_); - - // Grab terminal iter domain in the loop groups. 
- const VectorOfUniqueEntries terminal_loop_ids = - computeTerminalLoopIds(inlining_info); - - std::unordered_map loop_promotion_map; - - for (const ValGroup& loop_group : - loop_graph.disjointValSets().disjointSets()) { - IterDomain* promotion_id = findPromotionOfLoopGroup( - loop_group, - iel_graph, - iel_promotion_map, - exact_covered_ids, - terminal_loop_ids); - if (promotion_id) { - loop_promotion_map[loop_group] = promotion_id; - } - } - - return loop_promotion_map; -} - -IterDomain* IdModel::findPromotionOfLoopGroup( - const ValGroup& loop_group, - const ValGraph& iel_graph, - const std::unordered_map& iel_promotion_map, - const std::unordered_map& exact_covered_ids, - const VectorOfUniqueEntries& terminal_loop_ids) { - const ValGraph& exact_graph = idGraph(IdMappingMode::EXACT); - - // Grab all the (potentially promoted) terminal iter domains in this group. - // Save the exact group and the iter domain in this vector. - std::vector> exact_promoted_terminal_ids; - for (auto loop_id : *loop_group) { - // If not a terminal id in the group skip - if (!terminal_loop_ids.has(loop_id->as())) { - continue; - } - - // Grab the iel entry. There can be iter domains that were added - // after the IEL graph was built. All the promotion information is - // associated with the domains that exist in the original graph, - // so the new domains can be simply ignored. - if (!iel_graph.hasGroup(loop_id)) { - continue; - } - - const ValGroup& iel_group = iel_graph.toGroup(loop_id); - - // Does it still need iel_promotion_map? The loop group already has - // the replayed domains, so we should be able to find it. - auto iel_promo_it = iel_promotion_map.find(iel_group); - if (iel_promo_it == iel_promotion_map.end()) { - // If this terminal ID doesn't have a promotion associated with it, save - // the terminal ID. - exact_promoted_terminal_ids.emplace_back( - exact_graph.toGroup(loop_id), loop_id->as()); - } else { - // If this terminal ID has a promotion, grab the promoted ID. 
- exact_promoted_terminal_ids.emplace_back( - exact_graph.toGroup(iel_promo_it->second), iel_promo_it->second); - } - } - - // All the exact groups of the iter domains in the loop group - ValGroups exact_groups = exact_graph.toGroups(*loop_group); - - // All exact groups covered by all iter domains in this loop group - ValGroups loop_group_covered_ids; - for (const ValGroup& exact_group : exact_groups) { - auto covered_it = exact_covered_ids.find(exact_group); - NVF_ERROR(covered_it != exact_covered_ids.end()); - loop_group_covered_ids.pushBack(covered_it->second); - } - - // Check if any of the candidate Iter Domains we collected cover all the - // exact groups of loop_group_covered_ids. If so, that's the correct - // promoted iter domain of this group. - for (const auto& entry : exact_promoted_terminal_ids) { - const ValGroup& terminal_id_group = entry.first; - IterDomain* terminal_id = entry.second; - auto covered_it = exact_covered_ids.find(terminal_id_group); - NVF_ERROR(covered_it != exact_covered_ids.end()); - if (loop_group_covered_ids.computeSubtract(covered_it->second).empty()) { - return terminal_id; - } - } - - return nullptr; -} - -VectorOfUniqueEntries IdModel::computeTerminalLoopIds( - const StatefulInliningInfo& info) { - VectorOfUniqueEntries terminal_loop_ids; - for (const ValGroup& group : - idGraph(IdMappingMode::LOOP).disjointValSets().disjointSets()) { - if (group->size() == 1) { - terminal_loop_ids.pushBack(group->front()->as()); - } - - // Don't select producer iter domains - for (auto loop_id : *group) { - if (info.p2c_ca_permissive_maps.find(loop_id->as()) != - info.p2c_ca_permissive_maps.end()) { - continue; - } - - // It's terminal if there's no use group - auto uses_it = id_uses_.find(loop_id->as()); - if (uses_it == id_uses_.end() || uses_it->second.empty()) { - terminal_loop_ids.pushBack(loop_id->as()); - continue; - } - - // If there's an output group that is not in the same group, - // then it's a terminal ID - bool 
all_outs_in_loop_group = true; - for (auto use : uses_it->second) { - if (std::any_of( - use->outputs().begin(), - use->outputs().end(), - [&](Val* out) -> bool { - return group != idGraph(IdMappingMode::LOOP).toGroup(out); - })) { - all_outs_in_loop_group = false; - break; - } - } - - if (!all_outs_in_loop_group) { - terminal_loop_ids.pushBack(loop_id->as()); - } - } - } - return terminal_loop_ids; -} - -void IdModel::sanityCheckLoopPromotionMap( - const std::unordered_map& loop_promotion_map) const { - const auto& loop_graph = idGraph(IdMappingMode::LOOP); - for (const ValGroup& loop_group : - loop_graph.disjointValSets().disjointSets()) { - // Non-leaf loop groups are not guaranteed to have valid - // promotions. See for example FusionRepro1713, where root domains - // are all grouped together but there's no valid promotion. - if (loop_graph.hasUses(loop_group)) { - continue; - } - // Make sure the loop group is promoted to a domain that is mapped - // in the LOOP graph - auto promotion_it = loop_promotion_map.find(loop_group); - NVF_ERROR( - promotion_it != loop_promotion_map.end(), - "Loop promotion not found for ", - nvfuser::toString(loop_group)); - IterDomain* promotion = promotion_it->second; - // Make sure the promotion domain is also loop-mapped - NVF_ERROR( - loop_group->has(promotion), - "Loop promotion not loop-mapped. Loop group: ", - nvfuser::toString(loop_group), - ". 
Promotion domain: ", - promotion->name()); - } -} - void IdModel::validateLoopGraphHasNoSelfMappedLeafDomains() const { for (auto tv : tvs_) { auto self_mappped_leaf_pair = diff --git a/csrc/id_model/id_model.h b/csrc/id_model/id_model.h index c58c17531b0..64db01c0064 100644 --- a/csrc/id_model/id_model.h +++ b/csrc/id_model/id_model.h @@ -20,7 +20,6 @@ namespace nvfuser { class ValGraph; -class LoopPromotionMapBuilder; struct StatefulInliningInfo { // All producer ids within (including dependencies of) inlined leaf domains, @@ -206,93 +205,9 @@ class IdModel : public PolymorphicBase { std::unordered_map buildLoopPromotionMap( const StatefulInliningInfo& info); - // Helper function for buildLoopPromotionMap. Returns a map of - // root broadcast ValGroups in the IEL graph to a representative - // IterDomain picked from its IEL group. - std::unordered_map buildInlineRootResolutionMap( - const ValGraph& iel_graph, - const StatefulInliningInfo& info) const; - - // Helper function for building loop promotion map. - // - // Propagate promotion mappings from root IEL groups to intermediate - // and leaf IEL groups by traversing IEL exprs. For each expr, if an - // input is promoted, the output needs to be promoted too. If - // there's already an equivalent expr that uses the promoted inputs, - // create a mapping from the outputs of the IEL expr to the outputs - // of the equivalent expr. We only consider exprs that are mapped - // in the loop graph as we are looking for domains that represent - // the actual loops of the input and output domains of the IEL - // expr. If no such expr is found, the IEL expr is replayed with the - // promoted inputs. - // - // This is used twice when building the promotion map. The first time - // it is used there's no loop graph promotion yet, so only the IEL - // promotions are propagated. In that case, loop_graph_promotion_map - // should be just empty. - // - // Propagation uses iel_promotion_map and - // loop_graph_promotion_map. 
If both are available for an IEL group, - // the former has the precedence. This is because when this function - // is used for step 4, the given iel_promotion_map starts as an - // empty map and gets populated during this propagation, so any - // mapping in the map is guaranteed to be the correct final mapping, - // whereas the loop graph may have invalid mappings for partially - // inlined domains. - void propagatePromotionsInIELGraph( - const ValGraph& iel_graph, - std::unordered_map& iel_promotion_map, - const ValGraph& loop_graph, - const std::unordered_map& loop_promotion_map); - - // Same as the other propagatePromotionsInIELGraph but without loop - // graph map. This is used for step 2, where there's no loop - // graph map yet. - void propagatePromotionsInIELGraph( - const ValGraph& iel_graph, - std::unordered_map& iel_promotion_map); - - // Given an IEL promotion map, identify the mapping of each loop - // group. The promotion must represent all the domains in each loop - // group. If a valid representative promotion is not found for a - // loop group, no mapping is added for the group. - std::unordered_map projectIELPromotionToLoopGraph( - const ValGraph& iel_graph, - const std::unordered_map& iel_promotion_map, - const ValGraph& loop_graph, - const StatefulInliningInfo& inlining_info); - - // Find a promoted iter domain of a given loop group that covers all - // the exact groups representative of the resolved transformations - // within the loop group. Specifically, we examine each IEL group of - // the loop group, and if an IEL group has a promotion, we consider it as a - // candidate of the promotion of this loop group. If not, we include a - // domain of the IEL group as a candidate too. Once all candidates are - // obtained, we pick one that covers all the exact domains (cf. 
concrete - // domains in ComputeAtMap) - IterDomain* findPromotionOfLoopGroup( - const ValGroup& loop_group, - const ValGraph& iel_graph, - const std::unordered_map& iel_promotion_map, - const std::unordered_map& exact_covered_ids, - const VectorOfUniqueEntries& terminal_loop_ids); - - // Terminal loop ids are iteration domains in each loop group that: - // 1) Don't have an entry in p2c_ca_permissive_maps, which would mean a - // consumer TV's iter domain maps to this domain in a way that that domain - // is also in the same loop group - // 2) Don't have a direct IterDomain consumer within the group - VectorOfUniqueEntries computeTerminalLoopIds( - const StatefulInliningInfo& info); - // Errors if self mapping occurs void assertNoSelfMapping(); - // Basic consistency check of the given loop promotion map - void sanityCheckLoopPromotionMap( - const std::unordered_map& loop_promotion_map) - const; - // Loop graph represents the loop structure of the given fusion, so // there must not be any mapping between the leaf domains of each // tensor. diff --git a/csrc/id_model/loop_promotion.cpp b/csrc/id_model/loop_promotion.cpp index 6ea37b7f5f6..fe9b494371d 100644 --- a/csrc/id_model/loop_promotion.cpp +++ b/csrc/id_model/loop_promotion.cpp @@ -70,7 +70,7 @@ std::unordered_map LoopPromotionMapBuilder::build() { // IEL promotion map. For each loop group, examine all the IEL // promotions and find the most representative one that captures all // the dependent input domains of the loop group - std::unordered_map initial_loop_promotion_map = + const std::unordered_map initial_loop_promotion_map = projectIELPromotionToLoopGraph( iel_graph, iel_promotion_map, @@ -141,13 +141,13 @@ std::unordered_map LoopPromotionMapBuilder::build() { // 3 results. 
// Update the Step-3 map to the latest LOOP graph - initial_loop_promotion_map = updateValGroupIdMap( + const auto updated_initial_loop_promotion_map = updateValGroupIdMap( initial_loop_promotion_map, idGraph(IdMappingMode::LOOP)); // Insert the updated Step-3 results into the Step-5 resutls. Note // that this insertion does not overwrite the existing mappings. final_loop_promotion_map.insert( - initial_loop_promotion_map.begin(), initial_loop_promotion_map.end()); + updated_initial_loop_promotion_map.begin(), updated_initial_loop_promotion_map.end()); sanityCheckLoopPromotionMap(final_loop_promotion_map); From c7c04b16a89c32c6ee60fd8805986e4398f42bf5 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Fri, 10 May 2024 14:53:57 -0700 Subject: [PATCH 08/13] replace tester with callback --- csrc/id_model/id_model.cpp | 14 ++-- csrc/id_model/id_model.h | 11 +++- csrc/id_model/loop_promotion.cpp | 32 +++++++-- csrc/id_model/loop_promotion.h | 23 ++++++- tests/cpp/test_id_model.cpp | 107 +++++++++++++------------------ 5 files changed, 112 insertions(+), 75 deletions(-) diff --git a/csrc/id_model/id_model.cpp b/csrc/id_model/id_model.cpp index b436e1fc1f1..3eacaea16f2 100644 --- a/csrc/id_model/id_model.cpp +++ b/csrc/id_model/id_model.cpp @@ -75,8 +75,9 @@ IdModel::IdModel( const std::vector& exprs, const std::vector& additional_tvs, bool build_graphs, - bool allow_self_mapping) - : allow_self_mapping_(allow_self_mapping) { + bool allow_self_mapping, + LoopPromotionMapBuilderCallback* loop_promotion_map_builder_callback) + : allow_self_mapping_(allow_self_mapping), loop_promotion_map_builder_callback_(loop_promotion_map_builder_callback) { std::copy_if( exprs.begin(), exprs.end(), @@ -103,8 +104,11 @@ IdModel::IdModel( Fusion* fusion, bool build_graphs, bool allow_self_mapping, - bool validate) - : allow_self_mapping_(allow_self_mapping), validate_(validate) { + bool validate, + LoopPromotionMapBuilderCallback* loop_promotion_map_builder_callback) + : 
allow_self_mapping_(allow_self_mapping), + validate_(validate), + loop_promotion_map_builder_callback_(loop_promotion_map_builder_callback) { auto all_exprs = fusion->exprs(); std::copy_if( all_exprs.begin(), @@ -574,7 +578,7 @@ void IdModel::buildLoopGraph() { validateLoopGraphHasNoSelfMappedLeafDomains(); - loop_promotion_map_ = LoopPromotionMapBuilder::get(*this, inlining_info); + loop_promotion_map_ = LoopPromotionMapBuilder::get(*this, inlining_info, loop_promotion_map_builder_callback_); // New domains are added. Make sure there's still no self mapping in // the leaf domains diff --git a/csrc/id_model/id_model.h b/csrc/id_model/id_model.h index 64db01c0064..a2565dd94e4 100644 --- a/csrc/id_model/id_model.h +++ b/csrc/id_model/id_model.h @@ -20,6 +20,7 @@ namespace nvfuser { class ValGraph; +class LoopPromotionMapBuilderCallback; struct StatefulInliningInfo { // All producer ids within (including dependencies of) inlined leaf domains, @@ -106,7 +107,8 @@ class IdModel : public PolymorphicBase { const std::vector& exprs, const std::vector& additional_tvs = {}, bool build_graphs = true, - bool allow_self_mapping = false); + bool allow_self_mapping = false, + LoopPromotionMapBuilderCallback* loop_promotion_map_builder_callback = nullptr); // Same as the above constructor with fusion->exprs() excpet fusion may have // some dangling inputs/outputs that are expected to have IterDomain entries @@ -118,7 +120,8 @@ class IdModel : public PolymorphicBase { Fusion* fusion, bool build_graphs = true, bool allow_self_mapping = false, - bool validate = true); + bool validate = true, + LoopPromotionMapBuilderCallback* loop_promotion_map_builder_callback = nullptr); // Returns iter domain graph of provided mode. The graph must have // been already built. 
@@ -227,6 +230,10 @@ class IdModel : public PolymorphicBase { // If true, validate graphs by comparing them with ComputeAtMap bool validate_ = false; + // Optional callback for the loop promotion map builder for + // debugging and testing + LoopPromotionMapBuilderCallback* loop_promotion_map_builder_callback_ = nullptr; + // By default, the permissive graph should map compliment domains as // well. See the design doc for more details bool permissive_graph_map_compliment_ids_ = true; diff --git a/csrc/id_model/loop_promotion.cpp b/csrc/id_model/loop_promotion.cpp index fe9b494371d..7fb16ce4fcd 100644 --- a/csrc/id_model/loop_promotion.cpp +++ b/csrc/id_model/loop_promotion.cpp @@ -15,8 +15,9 @@ namespace nvfuser { LoopPromotionMapBuilder::LoopPromotionMapBuilder( IdModel& id_model, - const StatefulInliningInfo& inlining_info) - : id_model_(id_model), inlining_info_(inlining_info) {} + const StatefulInliningInfo& inlining_info, + LoopPromotionMapBuilderCallback* callback) + : id_model_(id_model), inlining_info_(inlining_info), callback_(callback) {} ValGraph& LoopPromotionMapBuilder::idGraph(IdMappingMode mode) { return id_model_.idGraph(mode); @@ -52,7 +53,7 @@ std::unordered_map LoopPromotionMapBuilder::build() { // (number of entries in groups ^ 2) // // iel stands for Intersection of the Exact and Loop graphs. - ValGraph iel_graph = id_model_.buildIntersection( + const ValGraph iel_graph = id_model_.buildIntersection( idGraph(IdMappingMode::EXACT), idGraph(IdMappingMode::LOOP), false); // Step 1: Build a map of the IEL groups of root broadcast domains @@ -60,12 +61,20 @@ std::unordered_map LoopPromotionMapBuilder::build() { std::unordered_map iel_promotion_map = buildInlineRootResolutionMap(iel_graph, inlining_info_); + if (callback_) { + callback_->postStep1(iel_promotion_map, iel_graph); + } + // Step 2: Propagate the root promotions to intermediate and leaf groups. // At this point, the promotion may not be final as the analysis is // localized to IEL groups. 
The map is used in the next step to // build mappings of the loop groups. propagatePromotionsInIELGraph(iel_graph, iel_promotion_map); + if (callback_) { + callback_->postStep2(iel_promotion_map, iel_graph); + } + // Step 3: Determine the promotion of each loop graph based on the // IEL promotion map. For each loop group, examine all the IEL // promotions and find the most representative one that captures all @@ -77,6 +86,10 @@ std::unordered_map LoopPromotionMapBuilder::build() { idGraph(IdMappingMode::LOOP), inlining_info_); + if (callback_) { + callback_->postStep3(initial_loop_promotion_map); + } + // At this point, most of loop groups should have correct promoted // IDs. However, non-inlined loop groups may miss promotion that // should be propagated from parent ID groups, e.g., iS50 of T2 in @@ -95,6 +108,10 @@ std::unordered_map LoopPromotionMapBuilder::build() { idGraph(IdMappingMode::LOOP), initial_loop_promotion_map); + if (callback_) { + callback_->postStep4(final_iel_promotion_map, iel_graph); + } + // Step 5: Find the final promotion of each loop group based on the // final IEL promotion map auto final_loop_promotion_map = projectIELPromotionToLoopGraph( @@ -151,6 +168,10 @@ std::unordered_map LoopPromotionMapBuilder::build() { sanityCheckLoopPromotionMap(final_loop_promotion_map); + if (callback_) { + callback_->postStep5(final_loop_promotion_map); + } + return final_loop_promotion_map; } @@ -781,8 +802,9 @@ void LoopPromotionMapBuilder::sanityCheckLoopPromotionMap( std::unordered_map LoopPromotionMapBuilder::get( IdModel& id_model, - const StatefulInliningInfo& inlining_info) { - LoopPromotionMapBuilder builder(id_model, inlining_info); + const StatefulInliningInfo& inlining_info, + LoopPromotionMapBuilderCallback* callback) { + LoopPromotionMapBuilder builder(id_model, inlining_info, callback); return builder.build(); } diff --git a/csrc/id_model/loop_promotion.h b/csrc/id_model/loop_promotion.h index a16b8220fb9..30d3a9e5b6e 100644 --- 
a/csrc/id_model/loop_promotion.h +++ b/csrc/id_model/loop_promotion.h @@ -14,6 +14,22 @@ namespace nvfuser { class IdModel; struct StatefulInliningInfo; +class LoopPromotionMapBuilderCallback { + public: + virtual ~LoopPromotionMapBuilderCallback() = default; + + virtual void postStep1( + const std::unordered_map& iel_root_resolution_map, + const ValGraph& iel_graph) {} + virtual void postStep2( + const std::unordered_map& iel_promotion_map, + const ValGraph& iel_graph) {} + virtual void postStep3(const std::unordered_map& loop_promotion_map) {} + virtual void postStep4(const std::unordered_map& iel_promotion_map, + const ValGraph& iel_graph) {} + virtual void postStep5(const std::unordered_map& loop_promotion_map) {} +}; + class LoopPromotionMapBuilder { public: // Build a map of loop groups to IterDomains that represent actual @@ -21,12 +37,14 @@ class LoopPromotionMapBuilder { // root domains between inlined producer and consumer tensors. static std::unordered_map get( IdModel& id_model, - const StatefulInliningInfo& inlining_info); + const StatefulInliningInfo& inlining_info, + LoopPromotionMapBuilderCallback* callback = nullptr); private: LoopPromotionMapBuilder( IdModel& id_model, - const StatefulInliningInfo& inlining_info); + const StatefulInliningInfo& inlining_info, + LoopPromotionMapBuilderCallback* callback = nullptr); std::unordered_map build(); @@ -117,6 +135,7 @@ class LoopPromotionMapBuilder { private: IdModel& id_model_; const StatefulInliningInfo& inlining_info_; + LoopPromotionMapBuilderCallback* callback_ = nullptr; }; } // namespace nvfuser diff --git a/tests/cpp/test_id_model.cpp b/tests/cpp/test_id_model.cpp index 5033119f758..973dfb6d7ee 100644 --- a/tests/cpp/test_id_model.cpp +++ b/tests/cpp/test_id_model.cpp @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -118,74 +119,57 @@ IterDomain* getChildIdByName(IterDomain* id, StmtNameType name) { }; // Helper class to test IdModel -class IdModelTester : public IdModel 
{ +class IdModelTester : public LoopPromotionMapBuilderCallback { public: // Do not automatically build the graphs - IdModelTester(Fusion* fusion) : IdModel(fusion, /*build_graphs=*/false) { - // Make sure the depedent graphs are already built - maybeBuildGraph(IdMappingMode::EXACT); - maybeBuildGraph(IdMappingMode::PERMISSIVE); - - // Gather broadcast resolution and inlining information - const StatefulInliningInfo inlining_info = buildStatefulInliningInfo( - tv_exprs_, - idGraph(IdMappingMode::EXACT), - idGraph(IdMappingMode::PERMISSIVE)); - - initializeLoopGraph(inlining_info); - - validateLoopGraphHasNoSelfMappedLeafDomains(); - - iel_graph = buildIntersection( - idGraph(IdMappingMode::EXACT), idGraph(IdMappingMode::LOOP), false); + IdModelTester(Fusion* fusion) { + id_model = std::make_unique( + fusion, + /*build_graphs=*/false, + /*allow_self_mapping=*/false, + /*validate=*/true, + /*loop_promotion_map_builder_callback=*/this); + + // Only build the loop graph + id_model->buildLoopGraph(); + } + void postStep1( + const std::unordered_map& iel_root_resolution_map, + const ValGraph& iel_graph) override { + this->iel_graph = iel_graph; + // this->iel_graph is a copy of the original IEL graph. The given + // map is for the original graph and needs to be updated. 
s1_root_resolution_map = - buildInlineRootResolutionMap(iel_graph, inlining_info); + updateValGroupIdMap(iel_root_resolution_map, this->iel_graph); + } - s2_iel_promotion_map = s1_root_resolution_map; + void postStep2( + const std::unordered_map& iel_promotion_map, + const ValGraph& iel_graph) override { + s2_iel_promotion_map = + updateValGroupIdMap(iel_promotion_map, this->iel_graph); + } - propagatePromotionsInIELGraph(iel_graph, s2_iel_promotion_map); + void postStep3(const std::unordered_map& + loop_promotion_map) override { + s3_loop_graph = id_model->idGraph(IdMappingMode::LOOP); + s3_loop_promotion_map = + updateValGroupIdMap(loop_promotion_map, s3_loop_graph); + } - const auto s3_original_loop_promotion_map = projectIELPromotionToLoopGraph( - iel_graph, - s2_iel_promotion_map, - idGraph(IdMappingMode::LOOP), - inlining_info); + void postStep4( + const std::unordered_map& iel_promotion_map, + const ValGraph& iel_graph) override { + s4_iel_promotion_map = + updateValGroupIdMap(iel_promotion_map, this->iel_graph); + } - // Make a copy for validation as idGraph(IdMappingMode::LOOP) will - // be updated in the later steps - s3_loop_graph = idGraph(IdMappingMode::LOOP); - s3_loop_promotion_map = - updateValGroupIdMap(s3_original_loop_promotion_map, s3_loop_graph); - - // Note that s4_iel_promotion_map is an empty map at this - // point. 
It'll be populated with the Step-3 map - propagatePromotionsInIELGraph( - iel_graph, - s4_iel_promotion_map, - idGraph(IdMappingMode::LOOP), - s3_original_loop_promotion_map); - - // Step 5: Find the final promotion of each loop group based on the - // final IEL promotion map - s5_loop_promotion_map = projectIELPromotionToLoopGraph( - iel_graph, - s4_iel_promotion_map, - idGraph(IdMappingMode::LOOP), - inlining_info); - - auto updated_s3_loop_promotion_map = updateValGroupIdMap( - s3_loop_promotion_map, idGraph(IdMappingMode::LOOP)); - s5_loop_promotion_map.insert( - updated_s3_loop_promotion_map.begin(), - updated_s3_loop_promotion_map.end()); - - sanityCheckLoopPromotionMap(s5_loop_promotion_map); - validateLoopGraphHasNoSelfMappedLeafDomains(); - - s5_loop_graph = idGraph(IdMappingMode::LOOP); + void postStep5(const std::unordered_map& + loop_promotion_map) override { + s5_loop_graph = id_model->idGraph(IdMappingMode::LOOP); s5_loop_promotion_map = - updateValGroupIdMap(s5_loop_promotion_map, s5_loop_graph); + updateValGroupIdMap(loop_promotion_map, s5_loop_graph); } void print(std::ostream& os) const { @@ -211,6 +195,7 @@ class IdModelTester : public IdModel { } } + std::unique_ptr id_model; ValGraph iel_graph; std::unordered_map s1_root_resolution_map; std::unordered_map s2_iel_promotion_map; @@ -230,8 +215,8 @@ void validateIELResolution( const IdModelTester& tester, const std::unordered_map& iel_promotion_map) { const auto& iel_graph = tester.iel_graph; - const auto& exact_graph = tester.idGraph(IdMappingMode::EXACT); - const auto& loop_graph = tester.idGraph(IdMappingMode::LOOP); + const auto& exact_graph = tester.id_model->idGraph(IdMappingMode::EXACT); + const auto& loop_graph = tester.id_model->idGraph(IdMappingMode::LOOP); const auto& iel_group = iel_graph.toGroup(id); auto iel_promotion_map_it = iel_promotion_map.find(iel_group); From 2b605cd513dedb451c2aeba385ea11fdf24fd5d6 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Fri, 10 May 2024 14:55:42 
-0700 Subject: [PATCH 09/13] clang-format --- csrc/id_model/id_model.cpp | 10 +++++++--- csrc/id_model/id_model.h | 9 ++++++--- csrc/id_model/loop_promotion.cpp | 3 ++- csrc/id_model/loop_promotion.h | 11 +++++++---- 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/csrc/id_model/id_model.cpp b/csrc/id_model/id_model.cpp index 3eacaea16f2..bc63de3f452 100644 --- a/csrc/id_model/id_model.cpp +++ b/csrc/id_model/id_model.cpp @@ -77,7 +77,9 @@ IdModel::IdModel( bool build_graphs, bool allow_self_mapping, LoopPromotionMapBuilderCallback* loop_promotion_map_builder_callback) - : allow_self_mapping_(allow_self_mapping), loop_promotion_map_builder_callback_(loop_promotion_map_builder_callback) { + : allow_self_mapping_(allow_self_mapping), + loop_promotion_map_builder_callback_( + loop_promotion_map_builder_callback) { std::copy_if( exprs.begin(), exprs.end(), @@ -108,7 +110,8 @@ IdModel::IdModel( LoopPromotionMapBuilderCallback* loop_promotion_map_builder_callback) : allow_self_mapping_(allow_self_mapping), validate_(validate), - loop_promotion_map_builder_callback_(loop_promotion_map_builder_callback) { + loop_promotion_map_builder_callback_( + loop_promotion_map_builder_callback) { auto all_exprs = fusion->exprs(); std::copy_if( all_exprs.begin(), @@ -578,7 +581,8 @@ void IdModel::buildLoopGraph() { validateLoopGraphHasNoSelfMappedLeafDomains(); - loop_promotion_map_ = LoopPromotionMapBuilder::get(*this, inlining_info, loop_promotion_map_builder_callback_); + loop_promotion_map_ = LoopPromotionMapBuilder::get( + *this, inlining_info, loop_promotion_map_builder_callback_); // New domains are added. 
Make sure there's still no self mapping in // the leaf domains diff --git a/csrc/id_model/id_model.h b/csrc/id_model/id_model.h index a2565dd94e4..618599e634e 100644 --- a/csrc/id_model/id_model.h +++ b/csrc/id_model/id_model.h @@ -108,7 +108,8 @@ class IdModel : public PolymorphicBase { const std::vector& additional_tvs = {}, bool build_graphs = true, bool allow_self_mapping = false, - LoopPromotionMapBuilderCallback* loop_promotion_map_builder_callback = nullptr); + LoopPromotionMapBuilderCallback* loop_promotion_map_builder_callback = + nullptr); // Same as the above constructor with fusion->exprs() excpet fusion may have // some dangling inputs/outputs that are expected to have IterDomain entries @@ -121,7 +122,8 @@ class IdModel : public PolymorphicBase { bool build_graphs = true, bool allow_self_mapping = false, bool validate = true, - LoopPromotionMapBuilderCallback* loop_promotion_map_builder_callback = nullptr); + LoopPromotionMapBuilderCallback* loop_promotion_map_builder_callback = + nullptr); // Returns iter domain graph of provided mode. The graph must have // been already built. @@ -232,7 +234,8 @@ class IdModel : public PolymorphicBase { // Optional callback for the loop promotion map builder for // debugging and testing - LoopPromotionMapBuilderCallback* loop_promotion_map_builder_callback_ = nullptr; + LoopPromotionMapBuilderCallback* loop_promotion_map_builder_callback_ = + nullptr; // By default, the permissive graph should map compliment domains as // well. See the design doc for more details diff --git a/csrc/id_model/loop_promotion.cpp b/csrc/id_model/loop_promotion.cpp index 7fb16ce4fcd..26b9413a079 100644 --- a/csrc/id_model/loop_promotion.cpp +++ b/csrc/id_model/loop_promotion.cpp @@ -164,7 +164,8 @@ std::unordered_map LoopPromotionMapBuilder::build() { // Insert the updated Step-3 results into the Step-5 resutls. Note // that this insertion does not overwrite the existing mappings. 
final_loop_promotion_map.insert( - updated_initial_loop_promotion_map.begin(), updated_initial_loop_promotion_map.end()); + updated_initial_loop_promotion_map.begin(), + updated_initial_loop_promotion_map.end()); sanityCheckLoopPromotionMap(final_loop_promotion_map); diff --git a/csrc/id_model/loop_promotion.h b/csrc/id_model/loop_promotion.h index 30d3a9e5b6e..336823d0ad7 100644 --- a/csrc/id_model/loop_promotion.h +++ b/csrc/id_model/loop_promotion.h @@ -24,10 +24,13 @@ class LoopPromotionMapBuilderCallback { virtual void postStep2( const std::unordered_map& iel_promotion_map, const ValGraph& iel_graph) {} - virtual void postStep3(const std::unordered_map& loop_promotion_map) {} - virtual void postStep4(const std::unordered_map& iel_promotion_map, - const ValGraph& iel_graph) {} - virtual void postStep5(const std::unordered_map& loop_promotion_map) {} + virtual void postStep3( + const std::unordered_map& loop_promotion_map) {} + virtual void postStep4( + const std::unordered_map& iel_promotion_map, + const ValGraph& iel_graph) {} + virtual void postStep5( + const std::unordered_map& loop_promotion_map) {} }; class LoopPromotionMapBuilder { From 0f5ab07d5cebe4a8a1b9538288e18d51f170e8e5 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Fri, 10 May 2024 15:16:07 -0700 Subject: [PATCH 10/13] comment --- csrc/id_model/loop_promotion.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/csrc/id_model/loop_promotion.h b/csrc/id_model/loop_promotion.h index 336823d0ad7..ba3d7aae4de 100644 --- a/csrc/id_model/loop_promotion.h +++ b/csrc/id_model/loop_promotion.h @@ -14,21 +14,31 @@ namespace nvfuser { class IdModel; struct StatefulInliningInfo; +// Callback interface for LoopPromotionMapBuilder. 
Allow exposing the +// temporary maps for testing and debugging class LoopPromotionMapBuilderCallback { public: virtual ~LoopPromotionMapBuilderCallback() = default; + // Called after Step 1 with the root resolution map and the + // corresponding IEL graph virtual void postStep1( const std::unordered_map& iel_root_resolution_map, const ValGraph& iel_graph) {} + // Called after Step 2 with the IEL promotion map and the + // corresponding IEL graph virtual void postStep2( const std::unordered_map& iel_promotion_map, const ValGraph& iel_graph) {} + // Called after Step 3 with the loop promotion map virtual void postStep3( const std::unordered_map& loop_promotion_map) {} + // Called after Step 4 with the IEL promotion map and the + // corresponding IEL graph virtual void postStep4( const std::unordered_map& iel_promotion_map, const ValGraph& iel_graph) {} + // Called after Step 5 with the final loop promotion map virtual void postStep5( const std::unordered_map& loop_promotion_map) {} }; From c5b3d8710f6878ade7a14e31f45c39db0a2f1069 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Mon, 13 May 2024 16:33:38 -0700 Subject: [PATCH 11/13] repro --- tests/cpp/test_id_model.cpp | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/cpp/test_id_model.cpp b/tests/cpp/test_id_model.cpp index 973dfb6d7ee..05109416353 100644 --- a/tests/cpp/test_id_model.cpp +++ b/tests/cpp/test_id_model.cpp @@ -2131,4 +2131,34 @@ TEST_F(IdModelTest, PermutedDifferently) { EXPECT_TRUE(iterDomainsAreMapped(id_model, s1->axis(2), t1->axis(2))); } +// Make sure domains of sibling tensors are all mapped together in the +// LOOP graph even when those tensors are not inlined.
+TEST_F(IdModelTest, LoopGraphWithSibling) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + auto welford_out_tvs = Welford(tv0, {1}); + auto avg = welford_out_tvs.avg; + fusion.addOutput(avg); + + avg->split(-1, 4); + TransformPropagatorWithCheck propagator(avg); + MaxRootDomainInfoSpanningTree(avg).traverse(&propagator); + + IdModel id_model(&fusion); + + const auto& loop_graph = id_model.idGraph(IdMappingMode::LOOP); + + for (auto welford_out : {welford_out_tvs.var_sum, welford_out_tvs.n}) { + for (const auto i : c10::irange(avg->nDims())) { + ASSERT_TRUE(loop_graph.disjointValSets().strictAreMapped( + avg->axis(i), welford_out->axis(i))) + << "Unmapped siblings: " << avg->axis(i)->toString() << ", " + << welford_out->axis(i)->toString(); + } + } +} + } // namespace nvfuser From 5dc5a28bc6357bc162e82f2f8ad6282aae10dd65 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Mon, 13 May 2024 17:10:12 -0700 Subject: [PATCH 12/13] fix --- csrc/disjoint_set.h | 3 ++- csrc/id_model/id_model.cpp | 31 +++++++++++++++++++++++++++++++ csrc/id_model/id_model.h | 6 ++++++ tests/cpp/test_id_model.cpp | 8 +++++--- 4 files changed, 44 insertions(+), 4 deletions(-) diff --git a/csrc/disjoint_set.h b/csrc/disjoint_set.h index 8d916ea2edf..618f768fff1 100644 --- a/csrc/disjoint_set.h +++ b/csrc/disjoint_set.h @@ -87,7 +87,8 @@ class VectorOfUniqueEntries { } // Returns true if any node was added - bool pushBack(const std::vector& other) { + template + bool pushBack(const std::vector& other) { bool any_added = false; for (const auto& entry : other) { auto added = pushBack(entry); diff --git a/csrc/id_model/id_model.cpp b/csrc/id_model/id_model.cpp index bc63de3f452..8b5af75fd86 100644 --- a/csrc/id_model/id_model.cpp +++ b/csrc/id_model/id_model.cpp @@ -543,6 +543,26 @@ StatefulInliningInfo buildStatefulInliningInfo( } } } + + // Siblings should always be mapped + auto consumer_tvs = ir_utils::filterByType(expr->outputs()); 
+ if (consumer_tvs.size() > 1) { + auto all_consumer_ids = ir_utils::allIDsOf(consumer_tvs.vector().at(0)); + info.ordered_sibling_ids.pushBack( + {all_consumer_ids.begin(), all_consumer_ids.end()}); + for (const auto i : c10::irange(1, consumer_tvs.size())) { + auto consumer_tv_i = consumer_tvs.vector().at(i); + auto all_consumer_i_ids = ir_utils::allIDsOf(consumer_tv_i); + + auto sibling_map = + exact_graph.buildMapBetween(all_consumer_ids, all_consumer_i_ids); + + for (const auto& [c_id_1, c_ids] : sibling_map) { + NVF_ERROR(c_ids.size() == 1); + info.sibling_maps[c_id_1->as()].pushBack(c_ids); + } + } + } } return info; } @@ -565,6 +585,17 @@ void IdModel::initializeLoopGraph(const StatefulInliningInfo& info) { } } } + + // Similarly maps all sibling domains + for (IterDomain* id : info.ordered_sibling_ids) { + auto entry_it = info.sibling_maps.find(id); + if (entry_it != info.sibling_maps.end()) { + const VectorOfUniqueEntries& sibling_ids = entry_it->second; + for (Val* sibling_id : sibling_ids) { + idGraph(IdMappingMode::LOOP).mapVals(id, sibling_id); + } + } + } } void IdModel::buildLoopGraph() { diff --git a/csrc/id_model/id_model.h b/csrc/id_model/id_model.h index 618599e634e..5b60ff474c6 100644 --- a/csrc/id_model/id_model.h +++ b/csrc/id_model/id_model.h @@ -36,6 +36,12 @@ struct StatefulInliningInfo { // root domains std::unordered_map> p2c_root_broadcast_resolution_map; + + // All IDs of all first siblings + VectorOfUniqueEntries ordered_sibling_ids; + + // Mappings to other sibling IDs from ordered_sibling_ids + std::unordered_map> sibling_maps; }; StatefulInliningInfo buildStatefulInliningInfo( diff --git a/tests/cpp/test_id_model.cpp b/tests/cpp/test_id_model.cpp index 05109416353..a3d92f969f9 100644 --- a/tests/cpp/test_id_model.cpp +++ b/tests/cpp/test_id_model.cpp @@ -2137,18 +2137,20 @@ TEST_F(IdModelTest, LoopGraphWithSibling) { Fusion fusion; FusionGuard fg(&fusion); - auto tv0 = makeSymbolicTensor(2); + auto tv0 = makeSymbolicTensor(3); 
fusion.addInput(tv0); - auto welford_out_tvs = Welford(tv0, {1}); + auto welford_out_tvs = Welford(tv0, {2}); auto avg = welford_out_tvs.avg; fusion.addOutput(avg); + // Random scheduling avg->split(-1, 4); + avg->merge(0); + avg->split(0, 8); TransformPropagatorWithCheck propagator(avg); MaxRootDomainInfoSpanningTree(avg).traverse(&propagator); IdModel id_model(&fusion); - const auto& loop_graph = id_model.idGraph(IdMappingMode::LOOP); for (auto welford_out : {welford_out_tvs.var_sum, welford_out_tvs.n}) { From 7b63685b2536ccbaa21fb9bd12517958469e9e9c Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Tue, 21 May 2024 13:36:16 -0700 Subject: [PATCH 13/13] disable idmodel --- csrc/device_lower/lower2device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index f988ce65c4f..76181bd72b4 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -391,7 +391,7 @@ void GpuLower::analysis(Fusion* fusion) { // functionality should be affected. New IterDomains may be created, // so it is expected that generated code may use diffrent variable // names - if (true || isOptionEnabled(EnableOption::IdModel)) { + if (isOptionEnabled(EnableOption::IdModel)) { IdModel id_model(fusion_); }