From 0df07dca2ed66198072bb8b092fee081c0c50893 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Tue, 26 Mar 2024 08:46:56 -0700 Subject: [PATCH 1/7] Step 4 of the loop promotion analysis. The loop promotion map generated at Step 3 is propagated through the IEL graph. This is necessary for partially inlined domains. --- csrc/device_lower/lower2device.cpp | 2 +- csrc/id_model/id_model.cpp | 153 +++++++++- csrc/id_model/id_model.h | 39 ++- csrc/id_model/utils.h | 55 ++++ tests/cpp/test_id_model.cpp | 452 +++++++++++++++++++++++++---- 5 files changed, 640 insertions(+), 61 deletions(-) create mode 100644 csrc/id_model/utils.h diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index 76181bd72b4..f988ce65c4f 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -391,7 +391,7 @@ void GpuLower::analysis(Fusion* fusion) { // functionality should be affected. New IterDomains may be created, // so it is expected that generated code may use diffrent variable // names - if (isOptionEnabled(EnableOption::IdModel)) { + if (true || isOptionEnabled(EnableOption::IdModel)) { IdModel id_model(fusion_); } diff --git a/csrc/id_model/id_model.cpp b/csrc/id_model/id_model.cpp index f9a759c1f36..4a27e9c87b4 100644 --- a/csrc/id_model/id_model.cpp +++ b/csrc/id_model/id_model.cpp @@ -627,10 +627,29 @@ std::unordered_map IdModel::buildLoopPromotionMap( idGraph(IdMappingMode::LOOP), inlining_info); + // At this point, most of loop groups should have correct promoted + // IDs. However, non-inlined loop groups may miss promotion that + // should be propagated from parent ID groups, e.g., iS50 of T2 in + // Indexing19. Its parent ID loop group is promoted, but the loop + // group of iS50 is not found yet. + + // Step 4: In order to fully propagate the loop graph promotions, first + // propagate them to the IEL groups, which are then used to + // propagate back to the loop groups in Step 5. 
Unlike Step 2, the + // initial IEL promotion map is empty and is populated with the loop + // promotion map as we traverse down the IEL graph. + std::unordered_map final_iel_promotion_map; + propagatePromotionsInIELGraph( + iel_graph, + final_iel_promotion_map, + idGraph(IdMappingMode::LOOP), + loop_promotion_map, + true); + // This is not a right map to return but just a placeholder since // the loop promotion map is not yet completely merged. It will be // replaced by a proper map. - return loop_promotion_map; + return final_iel_promotion_map; } std::unordered_map IdModel::buildInlineRootResolutionMap( @@ -867,7 +886,9 @@ namespace { Expr* findMatchingExpr( const ExprGroup& iel_expr, const ValGraph& iel_graph, - const std::vector& maybe_promoted_inputs) { + const std::vector& maybe_promoted_inputs, + bool require_loop_mapped_promotion, + const ValGraph& loop_graph) { // If any of domains in maybe_promoted_inputs is not found in // iel_graph, it means the domain is just replayed and by definition // has no mapping with any existing domain, which means there's no @@ -925,17 +946,87 @@ Expr* findMatchingExpr( continue; } + // For the final loop promotion map, we want to find + // promotions within the same loop groups. Note that that's + // guaranteed when a new domain is replayed instead of reusing an + // existing domain. + if (require_loop_mapped_promotion) { + if (!loop_graph.disjointExprSets().permissiveAreMapped( + iel_expr->front(), maybe_promoted_input_use_group->front())) { + continue; + } + // This is just an extra sanity check. 
Make sure all exprs in + // the use group are mapped + NVF_ERROR( + std::all_of( + maybe_promoted_input_use_group->vector().begin(), + maybe_promoted_input_use_group->vector().end(), + [&](Expr* iel_use) { + return loop_graph.disjointExprSets().permissiveAreMapped( + iel_expr->front(), iel_use); + }), + "Not all mapped: ", + nvfuser::toString(iel_expr), + "\n", + nvfuser::toString(maybe_promoted_input_use_group)); + } + return maybe_promoted_input_use; } return nullptr; } +// When propagating loop promotions from inputs to outputs of an IEL +// expr, we can't blindly apply loop promotion when all of the input +// domains are loop mapped with the outputs. +// +// i.e. if we have the inlined domains from: +// T2[i0*i1] pa(1) = T0[i0*b1]ca(1) + T1[i0*i1]ca(1) +// The inlined loop group would be: +// +// i0, i1, b1, i0*i1, i0*b1 +// Then if we replayed the iel transformations they would be: +// merge(i0, i1) +// merge(i0, b1) +// +// So if we replayed them with loop promotion, then i0, i1, b1 would be +// promoted to i0*i1, and the merges would be replayed. +// +// Therefore only promote i0*b1 to i0*i1, or i0*i1 to i0*i1 (i.e. don't +// promote an input to any transformation within the loop group). +// +// So if we have an iel_expr make sure its inputs and outputs are not in +// the same loop group.
+bool hasUniqueInputLoopGroups( + const ExprGroup& iel_expr, + const ValGraph& iel_graph, + const ValGraph& loop_graph) { + const std::vector iel_inp_groups = iel_graph.inputGroups(iel_expr); + + const std::vector iel_out_groups = iel_graph.outputGroups(iel_expr); + + ValGroups inp_loop_groups; + for (const ValGroup& iel_inp_group : iel_inp_groups) { + inp_loop_groups.pushBack(loop_graph.toGroup(iel_inp_group->front())); + } + ValGroups out_loop_groups; + for (const ValGroup& iel_out_group : iel_out_groups) { + out_loop_groups.pushBack(loop_graph.toGroup(iel_out_group->front())); + } + + // Check if input groups that are not included in the output group set + return !inp_loop_groups.computeSubtract(out_loop_groups).empty(); +} + } // namespace void IdModel::propagatePromotionsInIELGraph( const ValGraph& iel_graph, - std::unordered_map& iel_promotion_map) { + std::unordered_map& iel_promotion_map, + const ValGraph& loop_graph, + const std::unordered_map& loop_graph_promotion_map, + bool require_loop_mapped_promotion) { // In order to make this traversal work, the traversal order must be // topologically sorted. ValGraphStmtSort iel_stmt_sort(iel_graph); @@ -951,6 +1042,11 @@ void IdModel::propagatePromotionsInIELGraph( std::vector maybe_promoted_inputs; maybe_promoted_inputs.reserve(iel_inp_groups.size()); + // Propagate loop graph promotion only when the inputs and outputs are + // not in the same loop group. + const bool loop_promote_inputs = !loop_graph_promotion_map.empty() && + hasUniqueInputLoopGroups(iel_expr, iel_graph, loop_graph); + for (const ValGroup& iel_inp_group : iel_inp_groups) { // Assumed all inputs are IterDomains NVF_ERROR(iel_inp_group->front()->isA()); @@ -963,6 +1059,19 @@ void IdModel::propagatePromotionsInIELGraph( continue; } + // Promote loops based on the loop promotion map. If the loop promotion + // map should be used and has an entry we should use that promotion. 
+ if (loop_promote_inputs) { + const ValGroup& loop_copy_group = + loop_graph.toGroup(iel_inp_group->front()); + auto inp_loop_promo_it = loop_graph_promotion_map.find(loop_copy_group); + if (inp_loop_promo_it != loop_graph_promotion_map.end()) { + maybe_promoted_inputs.push_back(inp_loop_promo_it->second); + an_input_was_promoted = true; + continue; + } + } + // No promotion found. Just use the non-promoted domain maybe_promoted_inputs.push_back(iel_inp_group->front()->as()); } @@ -972,8 +1081,12 @@ void IdModel::propagatePromotionsInIELGraph( continue; } - Expr* promoted_expr = - findMatchingExpr(iel_expr, iel_graph, maybe_promoted_inputs); + Expr* promoted_expr = findMatchingExpr( + iel_expr, + iel_graph, + maybe_promoted_inputs, + require_loop_mapped_promotion, + idGraph(IdMappingMode::LOOP)); bool replayed = false; @@ -1011,6 +1124,13 @@ void IdModel::propagatePromotionsInIELGraph( } } +void IdModel::propagatePromotionsInIELGraph( + const ValGraph& iel_graph, + std::unordered_map& iel_promotion_map) { + propagatePromotionsInIELGraph( + iel_graph, iel_promotion_map, idGraph(IdMappingMode::LOOP), {}, false); +} + // Replay Expr but with the inputs provided. Expr* IdModel::addReplayAs(std::vector new_inputs, Expr* expr) { // Figure out which graphs are already initialized to make sure we add the new @@ -1332,4 +1452,27 @@ VectorOfUniqueEntries IdModel::computeTerminalLoopIds( return terminal_loop_ids; } +std::unordered_map updateValGroupIdMap( + const std::unordered_map& stale_map, + ValGraph& new_graph) { + std::unordered_map new_map; + + for (const auto& [stale_group, mapped_id] : stale_map) { + const ValGroups& new_groups = new_graph.toGroups(*stale_group); + NVF_ERROR( + new_groups.size() == 1, + "\nUpdate map assumes that new graph is equivalent to old graph plus extra mappings.\n", + "i.e. 
all mappings in new_graph should exist in the graph stale_map was produced on.\n", + "old:", + nvfuser::toString(stale_group), + "new: ", + nvfuser::toString(new_groups)); + NVF_ERROR( + new_map.emplace(new_groups.front(), mapped_id).second, + "Expected only a single mapping but multiple entries detected for ", + nvfuser::toString(new_groups.front())); + } + return new_map; +} + } // namespace nvfuser diff --git a/csrc/id_model/id_model.h b/csrc/id_model/id_model.h index 792a6501be9..cb0c51ad4b9 100644 --- a/csrc/id_model/id_model.h +++ b/csrc/id_model/id_model.h @@ -169,7 +169,7 @@ class IdModel : public PolymorphicBase { // fusion. void buildIterDomainDefinitionsAndUses(); - /// Start loop map by grouping inlined iter domains + // Start loop map by grouping inlined iter domains void initializeLoopGraph(const StatefulInliningInfo& info); // Build a map of loop groups to IterDomains that represent actual @@ -192,7 +192,35 @@ class IdModel : public PolymorphicBase { // input is promoted, the output needs to be promoted too. If // there's already an equivalent expr that uses the promoted inputs, // create a mapping from the outputs of the IEL expr to the outputs - // of the equivalent expr. + // of the equivalent expr. When require_loop_mapped_promotion is + // true, the equivalent expr needs to be already loop mapped. If no + // such expr is found, the IEL expr is replayed with the promoted + // inputs. require_loop_mapped_promotion is true when this function + // is used for step 4. + // + // This is used twice when building the promotion map. The first time + // it is used there's no loop graph promotion yet, so only the IEL + // promotions are propagated. In that case, loop_graph_promotion_map + // should be just empty. + // + // Propagation uses iel_promotion_map and + // loop_graph_promotion_map. If both are available for an IEL group, + // the former takes precedence.
This is because when this function + // is used for step 4, the given iel_promotion_map starts as an + // empty map and gets populated during this propagation, so any + // mapping in the map is guaranteed to be the correct final mapping, + // whereas the loop graph may have invalid mappings for partially + // inlined domains. + void propagatePromotionsInIELGraph( + const ValGraph& iel_graph, + std::unordered_map& iel_promotion_map, + const ValGraph& loop_graph, + const std::unordered_map& loop_promotion_map, + bool require_loop_mapped_promotion); + + // Same as the other propagatePromotionsInIELGraph but without loop + // graph map. This is used for step 2, where there's no loop + // graph map yet. void propagatePromotionsInIELGraph( const ValGraph& iel_graph, std::unordered_map& iel_promotion_map); @@ -281,4 +309,11 @@ class IdModel : public PolymorphicBase { std::unordered_map loop_promotion_map_; }; +// A utility function to update a map of ValGroups to ID from an old +// ValGraph to a new ValGraph. The new graph must be a superset of the +// old graph. +std::unordered_map updateValGroupIdMap( + const std::unordered_map& stale_map, + ValGraph& new_graph); + } // namespace nvfuser diff --git a/csrc/id_model/utils.h b/csrc/id_model/utils.h new file mode 100644 index 00000000000..2d6327bf586 --- /dev/null +++ b/csrc/id_model/utils.h @@ -0,0 +1,55 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once + +#include + +#include +#include +#include + +#define VERBOSE() verbose(__LINE__) +#define WARN() warn(__LINE__) + +namespace nvfuser { + +// Temporary logging utility +class DebugStream { + public: + DebugStream() + : enabled_(getNvFuserEnv("ID_MODEL_VERBOSE")), out_(std::cerr) {} + + template + DebugStream& operator<<(const T& v) { + if (enabled_) { + out_ << v; + } + return *this; + } + + DebugStream& operator<<(std::ostream& (*endl)(std::ostream&)) { + if (enabled_) { + out_ << endl; + } + return *this; + } + + private: + bool enabled_ = false; + std::ostream& out_; +}; + +inline DebugStream verbose(int line) { + return DebugStream() << "[DEBUG@" << line << "] "; +} + +inline DebugStream warn(int line) { + return DebugStream() << "[WARN@" << line << "] "; +} + +} // namespace nvfuser diff --git a/tests/cpp/test_id_model.cpp b/tests/cpp/test_id_model.cpp index a59bdf4aafa..53145d843d5 100644 --- a/tests/cpp/test_id_model.cpp +++ b/tests/cpp/test_id_model.cpp @@ -110,6 +110,13 @@ ValType* getValByName(const std::vector& vals, StmtNameType name) { } } +IterDomain* getChildIdByName(IterDomain* id, StmtNameType name) { + auto named_val = getValByName(ir_utils::consumerValsOf(id), name); + NVF_ERROR(named_val != nullptr, "Cannot find a child ID named ", name); + NVF_ERROR(named_val->isA()); + return named_val->as(); +}; + // Helper class to test IdModel class IdModelTester : public IdModel { public: @@ -137,17 +144,34 @@ class IdModelTester : public IdModel { propagatePromotionsInIELGraph(iel_graph, s2_iel_promotion_map); - s3_loop_promotion_map = projectIELPromotionToLoopGraph( + const auto s3_original_loop_promotion_map = projectIELPromotionToLoopGraph( iel_graph, s2_iel_promotion_map, idGraph(IdMappingMode::LOOP), inlining_info); + + // Make a copy for validation as idGraph(IdMappingMode::LOOP) will + // be updated in the later steps + s3_loop_graph = idGraph(IdMappingMode::LOOP); + 
s3_loop_promotion_map = + updateValGroupIdMap(s3_original_loop_promotion_map, s3_loop_graph); + + // Note that s4_iel_promotion_map is an empty map at this + // point. It'll be populated with the Step-3 map + propagatePromotionsInIELGraph( + iel_graph, + s4_iel_promotion_map, + idGraph(IdMappingMode::LOOP), + s3_original_loop_promotion_map, + true); } ValGraph iel_graph; std::unordered_map s1_root_resolution_map; std::unordered_map s2_iel_promotion_map; + ValGraph s3_loop_graph; std::unordered_map s3_loop_promotion_map; + std::unordered_map s4_iel_promotion_map; }; // Test if id is resolved to an ID that is exact mapped with @@ -292,6 +316,37 @@ void checkStep3Results( } } +void checkStep4Results( + const ValGraph& iel_graph, + const std::unordered_map& iel_promotion_map, + const std::vector, IterDomain*>>& + ref_promotion_map) { + EXPECT_EQ(iel_promotion_map.size(), ref_promotion_map.size()) + << "Mismatched Step-4 result map. " + << "Expected to have " << ref_promotion_map.size() + << " mappings but found " << iel_promotion_map.size(); + + // for (const auto& [iel_group, promotion_id] : iel_promotion_map) { + for (const auto& ref_promotion_pair : ref_promotion_map) { + const auto& ref_promotion_group = ref_promotion_pair.first; + const auto& ref_promotion_id = ref_promotion_pair.second; + + auto iel_promotion_it = std::find_if( + iel_promotion_map.begin(), + iel_promotion_map.end(), + [&](const auto& iel_promotion) { + return iel_promotion.first->set() == ref_promotion_group; + }); + + auto iel_promotion_id = iel_promotion_it->second; + ASSERT_EQ(ref_promotion_id, iel_promotion_id) + << "Expected promotion: " << ref_promotion_id->toString() + << ". Actual: " << iel_promotion_id->toString(); + } + + std::cerr << "checkStep4Results done\n"; +} + // Create a fusion where we're missing a valid concrete id so the compute at map // processing will fail. We need to be able to create the concrete ID not just // look for one. 
It is not yet possible to lower this fusion as the @@ -660,9 +715,10 @@ TEST_F(IdModelTest, LoopPromotion1) { {std::unordered_set{t2->axis(1), t3->axis(1)}, t3->axis(1)}}; checkStep3Results( - tester.idGraph(IdMappingMode::LOOP), - tester.s3_loop_promotion_map, - s3_reference_map); + tester.s3_loop_graph, tester.s3_loop_promotion_map, s3_reference_map); + + ASSERT_TRUE(tester.s4_iel_promotion_map.empty()) + << "No step-4 IEL promotion expected"; } } @@ -717,9 +773,10 @@ TEST_F(IdModelTest, LoopPromotion2) { {std::unordered_set{t3->axis(0), t4->axis(0)}, t4->axis(0)}}; checkStep3Results( - tester.idGraph(IdMappingMode::LOOP), - tester.s3_loop_promotion_map, - s3_reference_map); + tester.s3_loop_graph, tester.s3_loop_promotion_map, s3_reference_map); + + ASSERT_TRUE(tester.s4_iel_promotion_map.empty()) + << "No step-4 IEL promotion expected"; } // Multiple inlined and non-inlined broadcast domains @@ -795,9 +852,10 @@ TEST_F(IdModelTest, LoopPromotion3) { tv3->axis(0)}}; checkStep3Results( - tester.idGraph(IdMappingMode::LOOP), - tester.s3_loop_promotion_map, - s3_reference_map); + tester.s3_loop_graph, tester.s3_loop_promotion_map, s3_reference_map); + + ASSERT_TRUE(tester.s4_iel_promotion_map.empty()) + << "No step-4 IEL promotion expected"; } // Test root resolution with a fusion with outer split. 
@@ -895,9 +953,26 @@ TEST_F(IdModelTest, LoopPromotion4) { {std::unordered_set{tv2->axis(1)}, tv4->axis(1)}}; checkStep3Results( - tester.idGraph(IdMappingMode::LOOP), - tester.s3_loop_promotion_map, - s3_reference_map); + tester.s3_loop_graph, tester.s3_loop_promotion_map, s3_reference_map); + + auto id10 = getParentId(tv4->axis(0), 1); + ASSERT_EQ(id10->name(), 10); + auto id32 = + getValByName(ir_utils::consumerValsOf(id10), 32)->as(); + auto id33 = + getValByName(ir_utils::consumerValsOf(id10), 33)->as(); + + std::vector, IterDomain*>> + s4_reference_map = { + // 19 -> 10 + {std::unordered_set{getParentId(tv2->axis(0), 1)}, id10}, + // 20 -> 32 + {std::unordered_set{tv2->axis(0)}, id32}, + // 21 -> 33 + {std::unordered_set{tv2->axis(1)}, id33}}; + + checkStep4Results( + tester.iel_graph, tester.s4_iel_promotion_map, s4_reference_map); } // Test root resolution with the same fusion as Indexing1 @@ -1027,9 +1102,50 @@ TEST_F(IdModelTest, LoopPromotion5) { }; checkStep3Results( - tester.idGraph(IdMappingMode::LOOP), - tester.s3_loop_promotion_map, - s3_reference_map); + tester.s3_loop_graph, tester.s3_loop_promotion_map, s3_reference_map); + + auto id19 = getParentId(tv4->axis(0), 3); + ASSERT_EQ(id19->name(), 19); + auto id20 = getParentId(tv4->axis(0), 2); + ASSERT_EQ(id20->name(), 20); + auto id40 = getChildIdByName(id20, 40); + auto id41 = getChildIdByName(id20, 41); + auto id42 = getChildIdByName(id20, 42); + auto id43 = getChildIdByName(id20, 43); + auto id46 = getChildIdByName(id40, 46); + auto id47 = getChildIdByName(id40, 47); + auto id48 = getChildIdByName(id42, 48); + auto id49 = getChildIdByName(id42, 49); + + std::vector, IterDomain*>> + s4_reference_map = { + // 32 -> 19 + {std::unordered_set{getParentId(tv2->axis(0), 3)}, id19}, + // 33 -> 20 + {std::unordered_set{getParentId(tv2->axis(0), 2)}, id20}, + // 34 -> 40 + {std::unordered_set{getParentId(tv2->axis(0), 1)}, id40}, + // 35 -> 41 + {std::unordered_set{tv2->axis(2)}, id41}, + // 36 -> 46 + 
{std::unordered_set{tv2->axis(0)}, id46}, + // 37 -> 47 + {std::unordered_set{tv2->axis(1)}, id47}, + // 26 -> 19 + {std::unordered_set{getParentId(tv3->axis(0), 3)}, id19}, + // 27 -> 20 + {std::unordered_set{getParentId(tv3->axis(0), 2)}, id20}, + // 28 -> 42 + {std::unordered_set{getParentId(tv3->axis(0), 1)}, id42}, + // 29 -> 43 + {std::unordered_set{tv3->axis(2)}, id43}, + // 30 -> 48 + {std::unordered_set{tv3->axis(0)}, id48}, + // 31 -> 49 + {std::unordered_set{tv3->axis(1)}, id49}}; + + checkStep4Results( + tester.iel_graph, tester.s4_iel_promotion_map, s4_reference_map); } // Test root resolution with the same fusion as Indexing19 @@ -1123,30 +1239,16 @@ TEST_F(IdModelTest, LoopPromotion6) { tester.idGraph(IdMappingMode::EXACT), tester.s2_iel_promotion_map); - auto id79 = - getValByName(ir_utils::consumerValsOf(tv9->getRootDomain().at(2)), 79) - ->as(); - ASSERT_NE(id79, nullptr) << "IterDomain 79 not found"; - auto id80 = - getValByName(ir_utils::consumerValsOf(tv9->getRootDomain().at(2)), 80) - ->as(); - ASSERT_NE(id80, nullptr) << "IterDomain 80 not found"; - auto id81 = getChildId(id79, 1); - ASSERT_EQ(id81->name(), 81); - auto id82 = getChildId(id79, 1, 1); - ASSERT_EQ(id82->name(), 82); - auto id83 = getChildId(id80, 1); - ASSERT_EQ(id83->name(), 83); - auto id84 = getChildId(id80, 1, 1); - ASSERT_EQ(id84->name(), 84); - auto id85 = getChildId(id81, 1); - ASSERT_EQ(id85->name(), 85); - auto id86 = getChildId(id81, 1, 1); - ASSERT_EQ(id86->name(), 86); - auto id87 = getChildId(id83, 1); - ASSERT_EQ(id87->name(), 87); - auto id88 = getChildId(id83, 1, 1); - ASSERT_EQ(id88->name(), 88); + auto id79 = getChildIdByName(tv9->getRootDomain().at(2), 79); + auto id80 = getChildIdByName(tv9->getRootDomain().at(2), 80); + auto id81 = getChildIdByName(id79, 81); + auto id82 = getChildIdByName(id79, 82); + auto id83 = getChildIdByName(id80, 83); + auto id84 = getChildIdByName(id80, 84); + auto id85 = getChildIdByName(id81, 85); + auto id86 = 
getChildIdByName(id81, 86); + auto id87 = getChildIdByName(id83, 87); + auto id88 = getChildIdByName(id83, 88); // Check Step 3 results. See the design doc for the expected results std::vector, IterDomain*>> @@ -1223,9 +1325,111 @@ TEST_F(IdModelTest, LoopPromotion6) { }; checkStep3Results( - tester.idGraph(IdMappingMode::LOOP), - tester.s3_loop_promotion_map, - s3_reference_map); + tester.s3_loop_graph, tester.s3_loop_promotion_map, s3_reference_map); + + // For tv1 + auto id94 = getChildIdByName(id80, 94); + auto id95 = getChildIdByName(id80, 95); + auto id109 = getChildIdByName(id94, 109); + auto id110 = getChildIdByName(id94, 110); + + // For tv2 + auto id98 = getChildIdByName(id80, 98); + auto id99 = getChildIdByName(id80, 99); + auto id113 = getChildIdByName(id98, 113); + auto id114 = getChildIdByName(id98, 114); + + // For tv6 + auto id102 = getChildIdByName(id80, 102); + auto id103 = getChildIdByName(id80, 103); + auto id117 = getChildIdByName(id102, 117); + auto id118 = getChildIdByName(id102, 118); + + // For tv4 + auto id111 = getChildIdByName(id80, 111); + auto id112 = getChildIdByName(id80, 112); + auto id129 = getChildIdByName(id111, 129); + auto id130 = getChildIdByName(id111, 130); + + // For tv5 + auto id127 = getChildIdByName(id80, 127); + auto id128 = getChildIdByName(id80, 128); + auto id135 = getChildIdByName(id127, 135); + auto id136 = getChildIdByName(id127, 136); + + // For tv8 + auto id107 = getChildIdByName(id80, 107); + auto id108 = getChildIdByName(id80, 108); + auto id125 = getChildIdByName(id107, 125); + auto id126 = getChildIdByName(id107, 126); + + // For tv9 + auto id121 = getChildIdByName(id80, 121); + auto id122 = getChildIdByName(id80, 122); + auto id131 = getChildIdByName(id121, 131); + auto id132 = getChildIdByName(id121, 132); + + std::vector, IterDomain*>> + s4_reference_map = { + // tv1: 71 -> 94 + {std::unordered_set{getParentId(tv1->axis(0), 1)}, id94}, + // tv1: 72 -> 95 + {std::unordered_set{tv1->axis(2)}, id95}, + // 
tv1: 73 -> 109 + {std::unordered_set{tv1->axis(0)}, id109}, + // tv1: 74 -> 110 + {std::unordered_set{tv1->axis(1)}, id110}, + // tv2: 47 -> 98 + {std::unordered_set{getParentId(tv2->axis(0), 1)}, id98}, + // tv2: 48 -> 99 + {std::unordered_set{tv2->axis(2)}, id99}, + // tv2: 49 -> 113 + {std::unordered_set{tv2->axis(0)}, id113}, + // tv2: 50 -> 114 + {std::unordered_set{tv2->axis(1)}, id114}, + // tv4: 42 -> 111 + {std::unordered_set{getParentId(tv4->axis(0), 1)}, id111}, + // tv4: 43 -> 112 + {std::unordered_set{tv4->axis(2)}, id112}, + // tv4: 44 -> 129 + {std::unordered_set{tv4->axis(0)}, id129}, + // tv4: 45 -> 130 + {std::unordered_set{tv4->axis(1)}, id130}, + // tv5: 37 -> 127 + {std::unordered_set{getParentId(tv5->axis(0), 1)}, id127}, + // tv5: 38 -> 128 + {std::unordered_set{tv5->axis(2)}, id128}, + // tv5: 39 -> 135 + {std::unordered_set{tv5->axis(0)}, id135}, + // tv5: 40 -> 136 + {std::unordered_set{tv5->axis(1)}, id136}, + // tv6: 62 -> 102 + {std::unordered_set{getParentId(tv6->axis(0), 1)}, id102}, + // tv6: 63 -> 103 + {std::unordered_set{tv6->axis(2)}, id103}, + // tv6: 64 -> 117 + {std::unordered_set{tv6->axis(0)}, id117}, + // tv6: 65 -> 118 + {std::unordered_set{tv6->axis(1)}, id118}, + // tv8: 57 -> 107 + {std::unordered_set{getParentId(tv8->axis(0), 1)}, id107}, + // tv8: 58 -> 108 + {std::unordered_set{tv8->axis(2)}, id108}, + // tv8: 59 -> 125 + {std::unordered_set{tv8->axis(0)}, id125}, + // tv8: 60 -> 126 + {std::unordered_set{tv8->axis(1)}, id126}, + // tv9: 31 -> 121 + {std::unordered_set{getParentId(tv9->axis(0), 1)}, id121}, + // tv9: 32 -> 122 + {std::unordered_set{tv9->axis(2)}, id122}, + // tv9: 33 -> 131 + {std::unordered_set{tv9->axis(0)}, id131}, + // tv9: 34 -> 132 + {std::unordered_set{tv9->axis(1)}, id132}}; + + checkStep4Results( + tester.iel_graph, tester.s4_iel_promotion_map, s4_reference_map); } // Same fusion as NvFuserTest.FusionInlineBroadcastIndexing0 @@ -1290,6 +1494,8 @@ TEST_F(IdModelTest, LoopPromotion7) { 
tester.idGraph(IdMappingMode::EXACT), tester.s2_iel_promotion_map); + auto id8 = getChildIdByName(tv4->getRootDomain().at(0), 8); + // Check Step 3 results. See the design doc for the expected results std::vector, IterDomain*>> s3_reference_map = { @@ -1301,8 +1507,8 @@ TEST_F(IdModelTest, LoopPromotion7) { getChildId(tv3->getRootDomain().at(0), 1), tv4->getRootDomain().at(0), tv4->getRootDomain().at(1), - getChildId(tv4->getRootDomain().at(0), 1)}, - getChildId(tv4->getRootDomain().at(0), 1)}, + id8}, + id8}, // 17, 15, 9 -> 9 {std::unordered_set{tv2->axis(0), tv3->axis(0), tv4->axis(0)}, tv4->axis(0)}, @@ -1310,9 +1516,36 @@ TEST_F(IdModelTest, LoopPromotion7) { {std::unordered_set{tv3->axis(1)}, tv4->axis(1)}}; checkStep3Results( - tester.idGraph(IdMappingMode::LOOP), - tester.s3_loop_promotion_map, - s3_reference_map); + tester.s3_loop_graph, tester.s3_loop_promotion_map, s3_reference_map); + + // For tv2 + auto id26 = getChildIdByName(id8, 26); + auto id27 = getChildIdByName(id8, 27); + auto id34 = getChildIdByName(id27, 34); + auto id35 = getChildIdByName(id27, 35); + + // For tv3 + auto id30 = getChildIdByName(id8, 30); + auto id31 = getChildIdByName(id8, 31); + + std::vector, IterDomain*>> + s4_reference_map = { + // tv2: 17 -> 26 + {std::unordered_set{tv2->axis(0)}, id26}, + // tv2: 18 -> 27 + {std::unordered_set{getParentId(tv2->axis(1), 1)}, id27}, + // tv2: 21 -> 34 + {std::unordered_set{tv2->axis(1)}, id34}, + // tv2: 22 -> 35 + {std::unordered_set{tv2->axis(2)}, id35}, + // tv3: 15 -> 30 + {std::unordered_set{tv3->axis(0)}, id30}, + // tv3: 16 -> 31 + {std::unordered_set{tv3->axis(1)}, id31}, + }; + + checkStep4Results( + tester.iel_graph, tester.s4_iel_promotion_map, s4_reference_map); } // Same fusion as NvFuserTest.FusionIndexing20 @@ -1406,6 +1639,11 @@ TEST_F(IdModelTest, LoopPromotion8) { tester.idGraph(IdMappingMode::EXACT), tester.s2_iel_promotion_map); + auto id29 = getParentId(tv7->axis(0), 1); + ASSERT_EQ(id29->name(), 29) << "Unexpected
ID: " << id29->toString(); + auto id42 = getParentId(tv7->axis(1), 1); + ASSERT_EQ(id42->name(), 42); + // Check Step 3 results. See the design doc for the expected results std::vector, IterDomain*>> s3_reference_map = { @@ -1442,8 +1680,8 @@ TEST_F(IdModelTest, LoopPromotion8) { getChildId( getChildId(tv7->getRootDomain().at(0), 1), 1, 1), // 31 tv7->getRootDomain().at(2), // 16 - getChildId(tv7->getRootDomain().at(2), 1)}, // 42 - getChildId(tv7->getRootDomain().at(2), 1)}, + id42}, // 42 + id42}, // 22 -> 19 {std::unordered_set{tv2->axis(1)}, tv4->axis(1)}, // 40, 43 -> 43 @@ -1453,9 +1691,117 @@ TEST_F(IdModelTest, LoopPromotion8) { }; checkStep3Results( - tester.idGraph(IdMappingMode::LOOP), - tester.s3_loop_promotion_map, - s3_reference_map); + tester.s3_loop_graph, tester.s3_loop_promotion_map, s3_reference_map); + + auto id49 = getChildIdByName(id29, 49); + auto id50 = getChildIdByName(id29, 50); + auto id51 = getChildIdByName(id29, 51); + auto id52 = getChildIdByName(id29, 52); + auto id63 = getChildIdByName(id42, 63); + auto id64 = getChildIdByName(id42, 64); + + std::vector, IterDomain*>> + s4_reference_map = { + // tv1: 35 -> 49 + {std::unordered_set{tv1->axis(0)}, id49}, + // tv1: 36 -> 50 + {std::unordered_set{tv1->axis(1)}, id50}, + // tv2: 21 -> 51 + {std::unordered_set{tv2->axis(0)}, id51}, + // tv2: 22 -> 52 + {std::unordered_set{tv2->axis(1)}, id52}, + // tv5: 40 -> 63 + {std::unordered_set{tv5->axis(1)}, id63}, + // tv5: 41 -> 64 + {std::unordered_set{tv5->axis(2)}, id64}, + }; + + checkStep4Results( + tester.iel_graph, tester.s4_iel_promotion_map, s4_reference_map); +} + +// A repro that produces an invalid loop graph due to the compliment +// mapping. This is not currently supported. 
+TEST_F(IdModelTest, ComplimentMappingCausingLoopSelfMapping) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeConcreteTensor({7}); + fusion.addInput(tv0); + auto tv1 = makeConcreteTensor({7, 8}); + fusion.addInput(tv1); + auto tv2 = makeConcreteTensor({7, 9}); + fusion.addInput(tv2); + + auto tv3 = broadcast(tv0, {false, true}); + auto tv4 = add(tv1, tv3); + auto tv5 = broadcast(tv4, {false, false, true}); + + auto tv6 = broadcast(tv0, {false, true}); + auto tv7 = add(tv2, tv6); + auto tv8 = broadcast(tv7, {false, true, false}); + + auto tv9 = add(tv5, tv8); + + auto tv10 = set(tv9); + auto tv11 = set(tv10); + fusion.addOutput(tv11); + + // Merge all domains except for tv10 and tv11 + for (auto tv : ir_utils::allTvs(&fusion)) { + if (tv == tv10 || tv == tv11) { + continue; + } + while (tv->nDims() > 1) { + tv->merge(0); + } + } + + // Fully inline all tensors up until tv10 + for (auto tv : ir_utils::allTvs(&fusion)) { + if (tv == tv9 || tv == tv10 || tv == tv11) { + continue; + } + tv->inlineAt(1); + } + + // Fully inline tv10 to tv11 without merging + tv10->inlineAt(-1); + + // Due to the compliment mapping, the leaf domains of tv10 and tv11 + // are loop mapped, which is invalid. 
+ // + // Specifically, here are the tv10 and tv11 tensors: + // + // T10_l[ iS22{7}, iS23{8}, iS24{9} ] ca_pos( 3 ) + // root domain : (iS22{7}, iS23{8}, iS24{9}) + // contiguity: t t t + // leaf domain : (iS22{7}, iS23{8}, iS24{9}) + // T11_g[ iS25{7}, iS26{8}, iS27{9} ] produce_pos( 3 ) + // root domain : (iS25{7}, iS26{8}, iS27{9}) + // contiguity: t t t + // leaf domain : (iS25{7}, iS26{8}, iS27{9}) + // + // Here's the loop graph for tv10 and tv11: + // idg{22 23 24 25 26 27} + + // Due to the invalid mapping, building IdModel should fail for now + EXPECT_THAT( + [&]() { IdModel id_model(&fusion, true, false, false); }, + ::testing::ThrowsMessage(::testing::HasSubstr( + "Detected leaf domains are mapped in the loop graph"))); + + // Enable the below validation once the above problem is resolved. + // + // const ValGraph& loop_graph = id_model.idGraph(IdMappingMode::LOOP); + // + // These assertions should fail at this moment. + // ASSERT_NE( + // loop_graph.toGroup(tv10->axis(0)), loop_graph.toGroup(tv10->axis(1))); + // ASSERT_NE( + // loop_graph.toGroup(tv10->axis(0)), loop_graph.toGroup(tv10->axis(2))); + // ASSERT_NE( + // loop_graph.toGroup(tv10->axis(1)), loop_graph.toGroup(tv10->axis(2))); } namespace { From 6d37a4a94d52a04ec7a78a8aad2cb78fff1de7ec Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Tue, 26 Mar 2024 13:40:17 -0700 Subject: [PATCH 2/7] remove unnecessary file --- csrc/id_model/utils.h | 55 ------------------------------------------- 1 file changed, 55 deletions(-) delete mode 100644 csrc/id_model/utils.h diff --git a/csrc/id_model/utils.h b/csrc/id_model/utils.h deleted file mode 100644 index 2d6327bf586..00000000000 --- a/csrc/id_model/utils.h +++ /dev/null @@ -1,55 +0,0 @@ -// clang-format off -/* - * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. - * All rights reserved. 
- * SPDX-License-Identifier: BSD-3-Clause - */ -// clang-format on -#pragma once - -#include - -#include -#include -#include - -#define VERBOSE() verbose(__LINE__) -#define WARN() warn(__LINE__) - -namespace nvfuser { - -// Temporary logging utility -class DebugStream { - public: - DebugStream() - : enabled_(getNvFuserEnv("ID_MODEL_VERBOSE")), out_(std::cerr) {} - - template - DebugStream& operator<<(const T& v) { - if (enabled_) { - out_ << v; - } - return *this; - } - - DebugStream& operator<<(std::ostream& (*endl)(std::ostream&)) { - if (enabled_) { - out_ << endl; - } - return *this; - } - - private: - bool enabled_ = false; - std::ostream& out_; -}; - -inline DebugStream verbose(int line) { - return DebugStream() << "[DEBUG@" << line << "] "; -} - -inline DebugStream warn(int line) { - return DebugStream() << "[WARN@" << line << "] "; -} - -} // namespace nvfuser From cb91256bdbc7a87550afad5f8adf3632e6e2d98e Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Tue, 26 Mar 2024 13:42:12 -0700 Subject: [PATCH 3/7] cleanup --- tests/cpp/test_id_model.cpp | 89 +------------------------------------ 1 file changed, 1 insertion(+), 88 deletions(-) diff --git a/tests/cpp/test_id_model.cpp b/tests/cpp/test_id_model.cpp index 53145d843d5..c3f8d7b8007 100644 --- a/tests/cpp/test_id_model.cpp +++ b/tests/cpp/test_id_model.cpp @@ -326,7 +326,6 @@ void checkStep4Results( << "Expected to have " << ref_promotion_map.size() << " mappings but found " << iel_promotion_map.size(); - // for (const auto& [iel_group, promotion_id] : iel_promotion_map) { for (const auto& ref_promotion_pair : ref_promotion_map) { const auto& ref_promotion_group = ref_promotion_pair.first; const auto& ref_promotion_id = ref_promotion_pair.second; @@ -339,12 +338,10 @@ void checkStep4Results( }); auto iel_promotion_id = iel_promotion_it->second; - ASSERT_EQ(ref_promotion_id, iel_promotion_id) + EXPECT_EQ(ref_promotion_id, iel_promotion_id) << "Expected promotion: " << ref_promotion_id->toString() << ". 
Actual: " << iel_promotion_id->toString(); } - - std::cerr << "checkStep4Results done\n"; } // Create a fusion where we're missing a valid concrete id so the compute at map @@ -1720,90 +1717,6 @@ TEST_F(IdModelTest, LoopPromotion8) { tester.iel_graph, tester.s4_iel_promotion_map, s4_reference_map); } -// A repro that produces an invalid loop graph due to the compliment -// mapping. This is not currently supported. -TEST_F(IdModelTest, ComplimentMappingCausingLoopSelfMapping) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeConcreteTensor({7}); - fusion.addInput(tv0); - auto tv1 = makeConcreteTensor({7, 8}); - fusion.addInput(tv1); - auto tv2 = makeConcreteTensor({7, 9}); - fusion.addInput(tv2); - - auto tv3 = broadcast(tv0, {false, true}); - auto tv4 = add(tv1, tv3); - auto tv5 = broadcast(tv4, {false, false, true}); - - auto tv6 = broadcast(tv0, {false, true}); - auto tv7 = add(tv2, tv6); - auto tv8 = broadcast(tv7, {false, true, false}); - - auto tv9 = add(tv5, tv8); - - auto tv10 = set(tv9); - auto tv11 = set(tv10); - fusion.addOutput(tv11); - - // Merge all domains except for tv10 and tv11 - for (auto tv : ir_utils::allTvs(&fusion)) { - if (tv == tv10 || tv == tv11) { - continue; - } - while (tv->nDims() > 1) { - tv->merge(0); - } - } - - // Fully inline all tensors up until tv10 - for (auto tv : ir_utils::allTvs(&fusion)) { - if (tv == tv9 || tv == tv10 || tv == tv11) { - continue; - } - tv->inlineAt(1); - } - - // Fully inline tv10 to tv11 without merging - tv10->inlineAt(-1); - - // Due to the compliment mapping, the leaf domains of tv10 and tv11 - // are loop mapped, which is invalid. 
- // - // Specifically, here are the tv10 and tv11 tensors: - // - // T10_l[ iS22{7}, iS23{8}, iS24{9} ] ca_pos( 3 ) - // root domain : (iS22{7}, iS23{8}, iS24{9}) - // contiguity: t t t - // leaf domain : (iS22{7}, iS23{8}, iS24{9}) - // T11_g[ iS25{7}, iS26{8}, iS27{9} ] produce_pos( 3 ) - // root domain : (iS25{7}, iS26{8}, iS27{9}) - // contiguity: t t t - // leaf domain : (iS25{7}, iS26{8}, iS27{9}) - // - // Here's the loop graph for tv10 and tv11: - // idg{22 23 24 25 26 27} - - // Due to the invalid mapping, building IdModel should fail for now - EXPECT_THAT( - [&]() { IdModel id_model(&fusion, true, false, false); }, - ::testing::ThrowsMessage(::testing::HasSubstr( - "Detected leaf domains are mapped in the loop graph"))); - - // Enable the below validation once the above problem is resolved. - // - // const ValGraph& loop_graph = id_model.idGraph(IdMappingMode::LOOP); - // - // These assertions should fail at this moment. - // ASSERT_NE( - // loop_graph.toGroup(tv10->axis(0)), loop_graph.toGroup(tv10->axis(1))); - // ASSERT_NE( - // loop_graph.toGroup(tv10->axis(0)), loop_graph.toGroup(tv10->axis(2))); - // ASSERT_NE( - // loop_graph.toGroup(tv10->axis(1)), loop_graph.toGroup(tv10->axis(2))); -} - namespace { bool iterDomainsAreMapped( const IdModel& id_model, From 04cf2295311f9679abd449317ce33981b5b22d4f Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Tue, 2 Apr 2024 16:59:29 -0700 Subject: [PATCH 4/7] Update csrc/id_model/id_model.h Co-authored-by: Gao, Xiang --- csrc/id_model/id_model.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/id_model/id_model.h b/csrc/id_model/id_model.h index cb0c51ad4b9..38d533f4734 100644 --- a/csrc/id_model/id_model.h +++ b/csrc/id_model/id_model.h @@ -194,7 +194,7 @@ class IdModel : public PolymorphicBase { // create a mapping from the outputs of the IEL expr to the outputs // of the equivalent expr. 
When require_loop_mapped_promotion is // true, the equivalent expr needs to be already loop mapped. If no - // such expr is found, the IEL expr is replayed iwth the promoted + // such expr is found, the IEL expr is replayed with the promoted // inputs. require_loop_mapped_promotion is true when this function // is used for step 3. // From cc2adcee84b77ce7e92e7064695d74133c527f4f Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Tue, 2 Apr 2024 17:38:08 -0700 Subject: [PATCH 5/7] fix comments --- csrc/id_model/id_model.cpp | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/csrc/id_model/id_model.cpp b/csrc/id_model/id_model.cpp index 4a27e9c87b4..f4aa1dd30a0 100644 --- a/csrc/id_model/id_model.cpp +++ b/csrc/id_model/id_model.cpp @@ -982,16 +982,25 @@ Expr* findMatchingExpr( // domains are loop mapped with the outputs. // // i.e. if we have the inlined domains from: -// T2[i0*i1] pa(1) = T0[i0*b1]ca(1) + T1[i0*i1]ca(1) -// The inlined loop group would be: +// Inputs: +// T0[i0] +// T1[i0, i1] // -// i0, i1, b1, i0*i1, b0*i1 -// Then if we replayed the iel transformations they would be: -// merge(i0, i1) -// merge(i0, b1) +// T2[i0, b2] = broadcast(T0) +// T3[i0, i1] = T2 + T1 // -// So if we replayed them with loop promotion, then i0, i1, b1 would be -// promoted to i0*i1, and the merges would be replayed. +// {T1, T2, T3}->merge(0, 1) +// inlineMost +// +// The inlined loop group would consist of: +// +// {i0, i1, b2, i0*b2, i0*i1} +// +// Note that all these domains would have promotion to i0*i1 at the +// end of Step 3. When the IEL expression of merge(i0, i1) is visited by +// propagatePromotionsInIELGraph again, the promotion to i0*i1 of both +// inputs would be propagated to its output, resulting in promotion of +// i0*i1 to (i0*i1)*(i0*i1), which is not the correct propagation. // // Therefore only promote i0*b1 to i0*i1, or i0*i1 to i0*i1 (i.e. don't // promote an input to any transformation within the loop group). 
From c19a97ae13f46961578e6c6a4aeadc013ccd5865 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Tue, 2 Apr 2024 18:33:27 -0700 Subject: [PATCH 6/7] test cleanup --- tests/cpp/test_id_model.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/cpp/test_id_model.cpp b/tests/cpp/test_id_model.cpp index c3f8d7b8007..9b25fb24c9f 100644 --- a/tests/cpp/test_id_model.cpp +++ b/tests/cpp/test_id_model.cpp @@ -954,10 +954,8 @@ TEST_F(IdModelTest, LoopPromotion4) { auto id10 = getParentId(tv4->axis(0), 1); ASSERT_EQ(id10->name(), 10); - auto id32 = - getValByName(ir_utils::consumerValsOf(id10), 32)->as(); - auto id33 = - getValByName(ir_utils::consumerValsOf(id10), 33)->as(); + auto id32 = getChildIdByName(id10, 32); + auto id33 = getChildIdByName(id10, 33); std::vector, IterDomain*>> s4_reference_map = { From 12719a2540f84f35d7ac1fc1cac464650a7d3c8a Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 3 Apr 2024 08:02:44 -0700 Subject: [PATCH 7/7] disable loop promotion --- csrc/device_lower/lower2device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index f988ce65c4f..76181bd72b4 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -391,7 +391,7 @@ void GpuLower::analysis(Fusion* fusion) { // functionality should be affected. New IterDomains may be created, // so it is expected that generated code may use diffrent variable // names - if (true || isOptionEnabled(EnableOption::IdModel)) { + if (isOptionEnabled(EnableOption::IdModel)) { IdModel id_model(fusion_); }