From 7a485ed20a665580f0e19c7c7a24b2487678eabb Mon Sep 17 00:00:00 2001 From: Andrzej Bekas Date: Wed, 23 Aug 2023 16:10:47 +0200 Subject: [PATCH 1/3] Buildsystem, fixes for gcc 9.4 --- csrc/scheduler/vectorize_helper.cpp | 2 +- test/test_resize.cpp | 38 ++++++++++++++--------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/csrc/scheduler/vectorize_helper.cpp b/csrc/scheduler/vectorize_helper.cpp index 209299eb937..688f6640eca 100644 --- a/csrc/scheduler/vectorize_helper.cpp +++ b/csrc/scheduler/vectorize_helper.cpp @@ -889,7 +889,7 @@ int64_t getVectorizationFactorTransposeGroup( for (auto tv : vec_tv) { auto inner_size_it = contig_inner_map.find(tv); auto tv_vectorize_factor_opt = inner_size_it == contig_inner_map.end() - ? 1 + ? 1L : runtime_info.expressionEvaluator().evaluate(inner_size_it->second); // TODO: Do not assert here. we can just reduce vectorization size to 1 if // we can't infer an inner size. diff --git a/test/test_resize.cpp b/test/test_resize.cpp index 24b428f1171..e3ba0dc6efb 100644 --- a/test/test_resize.cpp +++ b/test/test_resize.cpp @@ -2535,8 +2535,8 @@ TEST_F(ResizeTest, Slice1DVectorizeManual4) { auto tv1 = slice( tv0, - {{IrBuilder::create(1), - sub(tv0->axis(0)->extent(), IrBuilder::create(3))}}); + {{IrBuilder::create(1L), + sub(tv0->axis(0)->extent(), IrBuilder::create(3L))}}); fusion.addOutput(tv1); tv1->split(0, 4); @@ -2579,7 +2579,7 @@ TEST_F(ResizeTest, Slice2DVectorizeManual1) { tv0, {{IrBuilder::create(slice_offset), sub(tv0->axis(0)->extent(), IrBuilder::create(slice_offset))}, - {IrBuilder::create(0), tv0->axis(1)->extent()}}); + {IrBuilder::create(0L), tv0->axis(1)->extent()}}); fusion.addOutput(tv1); tv1->merge(0); @@ -2618,9 +2618,9 @@ TEST_F(ResizeTest, Slice3DVectorizeManual1) { auto tv1 = slice( tv0, - {{IrBuilder::create(0), tv0->axis(0)->extent()}, - {IrBuilder::create(4), IrBuilder::create(6)}, - {IrBuilder::create(0), tv0->axis(2)->extent()}}); + {{IrBuilder::create(0L), tv0->axis(0)->extent()}, 
+ {IrBuilder::create(4L), IrBuilder::create(6L)}, + {IrBuilder::create(0L), tv0->axis(2)->extent()}}); fusion.addOutput(tv1); // Vectorize tv1 by a factor of 2. The sliced domain and the @@ -2667,10 +2667,10 @@ TEST_F(ResizeTest, Slice3DVectorizeManual2) { auto tv1 = slice( tv0, - {{IrBuilder::create(0), tv0->axis(0)->extent()}, - {IrBuilder::create(0), tv0->axis(1)->extent()}, - {IrBuilder::create(0), IrBuilder::create(1024)}, - {IrBuilder::create(0), tv0->axis(3)->extent()}}); + {{IrBuilder::create(0L), tv0->axis(0)->extent()}, + {IrBuilder::create(0L), tv0->axis(1)->extent()}, + {IrBuilder::create(0L), IrBuilder::create(1024L)}, + {IrBuilder::create(0L), tv0->axis(3)->extent()}}); fusion.addOutput(tv1); // [4, 1, 1024, 3] @@ -2709,19 +2709,19 @@ TEST_F(ResizeTest, SliceAndReshapeRepro540Manual) { auto tv1 = slice( tv0, - {{IrBuilder::create(0), tv0->axis(0)->extent()}, - {IrBuilder::create(0), tv0->axis(1)->extent()}, - {IrBuilder::create(0), IrBuilder::create(1024)}}); + {{IrBuilder::create(0L), tv0->axis(0)->extent()}, + {IrBuilder::create(0L), tv0->axis(1)->extent()}, + {IrBuilder::create(0L), IrBuilder::create(1024L)}}); auto tv2 = slice( tv0, - {{IrBuilder::create(0), tv0->axis(0)->extent()}, - {IrBuilder::create(0), tv0->axis(1)->extent()}, - {IrBuilder::create(1024), IrBuilder::create(2048)}}); + {{IrBuilder::create(0L), tv0->axis(0)->extent()}, + {IrBuilder::create(0L), tv0->axis(1)->extent()}, + {IrBuilder::create(1024L), IrBuilder::create(2048L)}}); auto tv3 = slice( tv0, - {{IrBuilder::create(0), tv0->axis(0)->extent()}, - {IrBuilder::create(0), tv0->axis(1)->extent()}, - {IrBuilder::create(2048), IrBuilder::create(3072)}}); + {{IrBuilder::create(0L), tv0->axis(0)->extent()}, + {IrBuilder::create(0L), tv0->axis(1)->extent()}, + {IrBuilder::create(2048L), IrBuilder::create(3072L)}}); auto tv4 = reshape(tv1, {16, 128, 1024}, {16, 128, 16, 64}); auto tv5 = reshape(tv2, {16, 128, 1024}, {16, 128, 16, 64}); From e0d2b02c2f8521daaf4b582510a581a311f1c4ef Mon 
Sep 17 00:00:00 2001 From: Andrzej Bekas Date: Mon, 28 Aug 2023 12:06:31 +0200 Subject: [PATCH 2/3] Matmul scheduler - heuristics infra cleaning - expose batch iterdomain through tools used by heuristics in matmul scheduler, --- csrc/mma_type.h | 2 +- csrc/scheduler/matmul_utils.cpp | 12 +++++++++--- csrc/scheduler/mma_utils.cpp | 19 ++++++++++++++----- csrc/scheduler/mma_utils.h | 5 ++--- 4 files changed, 26 insertions(+), 12 deletions(-) diff --git a/csrc/mma_type.h b/csrc/mma_type.h index 71187f12da1..8c128b5ea63 100644 --- a/csrc/mma_type.h +++ b/csrc/mma_type.h @@ -15,7 +15,7 @@ namespace nvfuser { constexpr std::string_view MATMUL_LOG_PREFIX = "[MATMUL DEBUG] "; //! Named descriptors of domains in matmul -enum class MatmulDomain { M = 0, N, K }; +enum class MatmulDomain { M = 0, N, K, Batch }; //! Named descriptors of TensorView roles in fusion //! INPUT_A - a producer of MMA input A diff --git a/csrc/scheduler/matmul_utils.cpp b/csrc/scheduler/matmul_utils.cpp index 5be71086173..0fa0ac84e3f 100644 --- a/csrc/scheduler/matmul_utils.cpp +++ b/csrc/scheduler/matmul_utils.cpp @@ -39,7 +39,7 @@ namespace { using MatmulLayout = MmaOptions::MmaLayout; //! Access to the structure should be done with labels defined in //! MmaOptions::MmaDomains. -using ProblemShape = std::array; +using ProblemShape = std::array; //! A helper for deciding the type of MMA op for given fusion and problem shape. inline std::optional getMmaOp( @@ -153,11 +153,14 @@ ProblemShape getProblemShape( NVF_ERROR(false, mma_output_domains.getErrorMsg()); } - const auto [m, n, k] = mma_output_domains.getData(); + const auto [m, n, k, batch] = mma_output_domains.getData(); auto m_extend = runtime_info.expressionEvaluator().evaluate(m->extent()); auto n_extend = runtime_info.expressionEvaluator().evaluate(n->extent()); auto k_extend = runtime_info.expressionEvaluator().evaluate(k->extent()); + auto batch_extend = + (batch ? 
runtime_info.expressionEvaluator().evaluate(batch->extent()) + : 0L); if (!(m_extend && n_extend && k_extend)) { NVF_ERROR( @@ -172,7 +175,10 @@ ProblemShape getProblemShape( } return ProblemShape{ - m_extend.as(), n_extend.as(), k_extend.as()}; + m_extend.as(), + n_extend.as(), + k_extend.as(), + batch_extend.as()}; } std::string isMatmulFusionDefinitionSupported( diff --git a/csrc/scheduler/mma_utils.cpp b/csrc/scheduler/mma_utils.cpp index 9d4fcafcf17..3d992e783a8 100644 --- a/csrc/scheduler/mma_utils.cpp +++ b/csrc/scheduler/mma_utils.cpp @@ -1430,10 +1430,11 @@ ProblemIterDomainsOpt getProblemIterDomains(Fusion* fusion) { } const auto mma_output = mma_exprs.front()->out(); - // NOTE: the iter domains of MMA output should be [...,M,K,N] + // NOTE: the iter domains of MMA output should be [...,batch,M,K,N] IterDomain* m = nullptr; IterDomain* n = nullptr; IterDomain* k = nullptr; + IterDomain* batch = nullptr; const auto leaf_domains = static_cast(mma_output)->getLeafDomain(); @@ -1446,9 +1447,17 @@ ProblemIterDomainsOpt getProblemIterDomains(Fusion* fusion) { return ss.str(); } - // M,N are inner most concrete iter domains - m = concrete.rbegin()[1]; - n = concrete.rbegin()[0]; + using Pos_t = decltype(leaf_domains)::size_type; + + // N,M,B are the inner most concrete iter domains in tv, so positions are + // counted from the end of the container + constexpr Pos_t POS_N = 0, POS_M = 1, POS_BATCH = 2; + + m = concrete.rbegin()[POS_M]; + n = concrete.rbegin()[POS_N]; + if (concrete.size() > POS_BATCH) { + batch = concrete.rbegin()[POS_BATCH]; + } // K is a reduction domain, search for the inner most reduction domain for (auto iter_domain = leaf_domains.rbegin(); @@ -1461,7 +1470,7 @@ ProblemIterDomainsOpt getProblemIterDomains(Fusion* fusion) { } NVF_ERROR(k != nullptr, "Failed to find K domain in MMA output"); - return ProblemIterDomains{m, n, k}; + return ProblemIterDomains{m, n, k, batch}; } MatmulProblemLayoutOpt getMatmulLayout(Fusion* fusion) { diff --git 
a/csrc/scheduler/mma_utils.h b/csrc/scheduler/mma_utils.h index 755c74ea36d..b0e09218d5a 100644 --- a/csrc/scheduler/mma_utils.h +++ b/csrc/scheduler/mma_utils.h @@ -220,7 +220,7 @@ constexpr size_t MIN_MATMUL_INPUTS_NUMBER = 2; //! An alias for data structure for passing IterDomains representing problem //! shape dimensions //! TODO: extend definition for handling batch matmuls -using ProblemIterDomains = std::array; +using ProblemIterDomains = std::array; //! An alias for mapping between TensorView instance and its role in //! matmul fusion definition, some roles can be assigned to more than @@ -283,10 +283,9 @@ TORCH_CUDA_CU_API MatmulProblemLayoutOpt getMatmulLayout(Fusion* fusion); //! Returns wrapped collection of IterDomains that can be used to get //! problem shape with runtime info. //! Data is stored in the order in which lables are defined in MatmulDomain -//! enum class, that is in the following order: m, n, k. +//! enum class, that is in the following order: m, n, k, batch. //! An error message is stored in retruned object if valid data cannot //! be gathered. -//! TODO: 4th domain must be added for batch gemm support. TORCH_CUDA_CU_API ProblemIterDomainsOpt getProblemIterDomains(Fusion* fusion); //! Returns wrapped collection of TensorView roles in fusion. 
From 03b1cd7e70ab14df3897f23c582f91811fb2208c Mon Sep 17 00:00:00 2001 From: Andrzej Bekas Date: Mon, 28 Aug 2023 14:23:19 +0200 Subject: [PATCH 3/3] Matmul scheduler - epilogue input vectorization - remove WAR for unaligned memory access when smem epilogue is enabled and vectorization parallelization mode is propagated, - introduce infrastructure for vectorization analysis in matmul scheduler, --- csrc/scheduler/matmul.cpp | 106 ++++++++++++++++++++++++----------- csrc/scheduler/mma_utils.cpp | 23 +++++++- csrc/scheduler/mma_utils.h | 2 +- csrc/scheduler/registry.cpp | 4 +- 4 files changed, 95 insertions(+), 40 deletions(-) diff --git a/csrc/scheduler/matmul.cpp b/csrc/scheduler/matmul.cpp index 064ebcbbda3..d8cdf113ec7 100644 --- a/csrc/scheduler/matmul.cpp +++ b/csrc/scheduler/matmul.cpp @@ -14,6 +14,9 @@ // NOTE: included to avoid compilation error caused by missing destructor in // 'SchedulerRuntimeInfo' #include +#include +#include +#include #include "mma_type.h" namespace nvfuser { @@ -614,54 +617,89 @@ void scheduleOutputTensor( {c}, {ParallelType::BIDx, ParallelType::BIDy, ParallelType::BIDz}); } + +//! Assign parallelization modes to epilogue inputs. The returned pair of +//! collections contains: +//! - 1st: TVs that can have all parallelization modes propagated, +//! - 2nd: TVs that can have all but vectorization parallel modes +//! 
propagated, +std::pair, std::vector> +partitionEpilogueInputsByParallelizationModes( + TensorView* reference_tv, + const std::vector& epilogue_inputs) { + const auto tvs_with_vec = scheduler_utils::getInputsOutputsWithInnerDim( + reference_tv, true /*inner_only*/, true /*vectorize_pass*/); + + std::vector epilogue_input_tvs_with_vec = {}; + epilogue_input_tvs_with_vec.reserve(epilogue_inputs.size()); + std::vector epilogue_input_tvs_without_vec = {}; + epilogue_input_tvs_without_vec.reserve(epilogue_inputs.size()); + + for (auto* tv : epilogue_inputs) { + if (std::find(tvs_with_vec.cbegin(), tvs_with_vec.cend(), tv) == + tvs_with_vec.cend()) { + epilogue_input_tvs_without_vec.push_back(tv); + } else { + epilogue_input_tvs_with_vec.push_back(tv); + } + } + + return {epilogue_input_tvs_with_vec, epilogue_input_tvs_without_vec}; +} + //! Propagates transformations from fusion output to fusion tv inputs that are //! producers in the epilogue. Transformations' propagation aims at input tvs //! which are not assigned to core roles, that is, are not MMA inputs. -void scheduleFusionInputsForEpilogue( - const mma_utils::RolesMap& roles_map, - const bool with_smem_epilogue) { - std::vector cached_tvs; - +void scheduleFusionInputsForEpilogue(const mma_utils::RolesMap& roles_map) { // Handling transformations in fusion input tvs with assigned INPUT_C role by - // propagating fusion output transformations through cached views of INPUT_C - // fusion input tvs and by setting vectorization of the inner most iterdomain - // of these cached views + // propagating fusion output transformations and parallelization through + // cached views of fusion inputs with INPUT_C role. + // NOTE: Vectorization parallelization mode of inner most domain requires + // special care. 
if (roles_map.count(MatmulRole::INPUT_C)) { - auto& c_tvs = roles_map.at(MatmulRole::INPUT_C); - // The system supports only scenario where there is only one fusion output - // with assigned OUTPUT_D role, this condition is already verified so there - // is no need for an additional checks here + // with assigned OUTPUT_D role, this condition is already verified so no + // additional checks are needed auto output_d = roles_map.at(MatmulRole::OUTPUT_D).front(); - for (auto* c : c_tvs) { - cached_tvs.push_back(c->cacheAfter()); + const auto& c_tvs = roles_map.at(MatmulRole::INPUT_C); + + const auto [tvs_with_vectorization, tvs_without_vectorization] = + partitionEpilogueInputsByParallelizationModes(output_d, c_tvs); + + const auto transform_propagator = + [&output_d]( + const std::vector& tvs, + const std::unordered_set& parallel_types) { + std::vector cached_tvs; + cached_tvs.reserve(tvs.size()); + for (auto* tv : tvs) { + cached_tvs.push_back(tv->cacheAfter()); + } + + scheduler_utils::BoundedDirectionalTransformPropagator::backward( + output_d, -1, tvs); + + scheduler_utils::parallelizeAllLike( + output_d, -1, cached_tvs, parallel_types); + }; + + // Propagate all parallelization modes + if (!tvs_with_vectorization.empty()) { + transform_propagator(tvs_with_vectorization, {}); } - - scheduler_utils::BoundedDirectionalTransformPropagator::backward( - output_d, -1, c_tvs); - - std::unordered_set parallel_types = {}; - if (with_smem_epilogue) { - //! In cases where smem epilogue feature is enabled, the vectorization of - //! domains will be propagated to fusion inputs that are epilogue inputs, - //! this may result in unaligned memory reads. Vectorization is - //! explicitly excluded form parallelization types to avoid this issue. - //! This should be changed when vectorization analysis is available and - //! enabled for matmul scheduler. 
- parallel_types = allParallelTypesExcept({ParallelType::Vectorize}); + // Propagate all parallelization modes but vectorization + if (!tvs_without_vectorization.empty()) { + transform_propagator( + tvs_without_vectorization, + allParallelTypesExcept({ParallelType::Vectorize})); } - scheduler_utils::parallelizeAllLike( - output_d, -1, cached_tvs, parallel_types); - - // The cached INPUT_C tvs are not needed anymore - cached_tvs.clear(); } } } // namespace void scheduleMatmul(Fusion* fusion, const MatmulParams& params) { - const auto& roles_map_opt = mma_utils::getTensorsRoles(fusion); + const auto roles_map_opt = mma_utils::getTensorsRoles(fusion); // NOTE: the contents of roles_map have been already validated during // compute-time checks @@ -1009,7 +1047,7 @@ void scheduleMatmul(Fusion* fusion, const MatmulParams& params) { // operations, input tvs with non-core roles // core roles: essential for matmul, for example mma inputs' producers if (has_non_mma_input_tvs) { - scheduleFusionInputsForEpilogue(roles_map, params.use_smem_epilogue); + scheduleFusionInputsForEpilogue(roles_map); } // auto inline for all tensors except register tensors and output tensor diff --git a/csrc/scheduler/mma_utils.cpp b/csrc/scheduler/mma_utils.cpp index 3d992e783a8..d04b0844cd3 100644 --- a/csrc/scheduler/mma_utils.cpp +++ b/csrc/scheduler/mma_utils.cpp @@ -1569,9 +1569,11 @@ RolesMapOpt getTensorsRoles(Fusion* fusion) { return mma_output_domains.getErrorMsg(); } - const auto findRolesByDomains = [](const DependenciesMap& deps_map, - RolesMap& roles_map, - const bool processing_output) { + TensorView* invalid_tv = nullptr; + const auto findRolesByDomains = [&invalid_tv]( + const DependenciesMap& deps_map, + RolesMap& roles_map, + const bool processing_output) { for (const auto& entry : deps_map) { const auto& domains = entry.second; const auto begin = domains.begin(); @@ -1607,6 +1609,9 @@ RolesMapOpt getTensorsRoles(Fusion* fusion) { 
roles_map[MatmulRole::OUTPUT_D].push_back(entry.first); continue; } + + invalid_tv = entry.first; + break; } }; @@ -1623,6 +1628,12 @@ RolesMapOpt getTensorsRoles(Fusion* fusion) { resolveTvToMatmulDomainsMapping( deps_map, mma_input_candidates, m, n, k, ca_map); findRolesByDomains(deps_map, roles_map, handling_output); + if (invalid_tv) { + std::stringstream ss; + ss << "One of fusion inputs cannot have role assigned! TV details: " + << invalid_tv->toString() << "\n"; + return {ss.str()}; + } deps_map.clear(); @@ -1631,6 +1642,12 @@ RolesMapOpt getTensorsRoles(Fusion* fusion) { resolveTvToMatmulDomainsMapping( deps_map, mma_output_candidates, m, n, k, ca_map); findRolesByDomains(deps_map, roles_map, handling_output); + if (invalid_tv) { + std::stringstream ss; + ss << "One of fusion outputs cannot have role assigned! TV details: " + << invalid_tv->toString() << "\n"; + return {ss.str()}; + } return roles_map; } diff --git a/csrc/scheduler/mma_utils.h b/csrc/scheduler/mma_utils.h index b0e09218d5a..e7dc543c7a7 100644 --- a/csrc/scheduler/mma_utils.h +++ b/csrc/scheduler/mma_utils.h @@ -250,7 +250,7 @@ class DataWrapperOpt { } std::string getErrorMsg() const { if (data.valueless_by_exception() || - std::holds_alternative(data)) { + !std::holds_alternative(data)) { return "Uninitialized data in data holder object"; } else { return std::get(data); diff --git a/csrc/scheduler/registry.cpp b/csrc/scheduler/registry.cpp index f2614d61d7e..233a17f8184 100644 --- a/csrc/scheduler/registry.cpp +++ b/csrc/scheduler/registry.cpp @@ -2289,7 +2289,7 @@ class MatmulScheduler : public SchedulerEntry { SchedulerRuntimeInfo& runtime_info, HeuristicSummary* data_cache = nullptr) : SchedulerEntry(ScheduleHeuristic::Matmul) { - computeHeuristics(fusion, runtime_info); + computeHeuristics(fusion, runtime_info, data_cache); } void schedule(Fusion* fusion) override { @@ -2327,7 +2327,7 @@ class MatmulScheduler : public SchedulerEntry { void computeHeuristics( Fusion* fusion, 
SchedulerRuntimeInfo& runtime_info, - HeuristicSummary* data_cache = nullptr) { + HeuristicSummary* data_cache) { params_ = getMatmulHeuristics(fusion, runtime_info, data_cache); NVF_ERROR(params_ != nullptr); }