Closed
2 changes: 1 addition & 1 deletion csrc/mma_type.h
@@ -15,7 +15,7 @@ namespace nvfuser {
constexpr std::string_view MATMUL_LOG_PREFIX = "[MATMUL DEBUG] ";

//! Named descriptors of domains in matmul
enum class MatmulDomain { M = 0, N, K };
enum class MatmulDomain { M = 0, N, K, Batch };

//! Named descriptors of TensorView roles in fusion
//! INPUT_A - a producer of MMA input A
106 changes: 72 additions & 34 deletions csrc/scheduler/matmul.cpp
@@ -14,6 +14,9 @@
// NOTE: included to avoid compilation error caused by missing destructor in
// 'SchedulerRuntimeInfo'
#include <executor_utils.h>
#include <algorithm>
#include <iterator>
#include <vector>
#include "mma_type.h"

namespace nvfuser {
@@ -614,54 +617,89 @@ void scheduleOutputTensor(
{c},
{ParallelType::BIDx, ParallelType::BIDy, ParallelType::BIDz});
}

//! Assign parallelization modes to epilogue inputs. The returned pair of
//! collections contains:
//! - 1st: TVs that can have all parallelization modes propagated,
//! - 2nd: TVs that can have all but the vectorization parallel mode
//! propagated.
std::pair<std::vector<TensorView*>, std::vector<TensorView*>>
partitionEpilogueInputsByParallelizationModes(
TensorView* reference_tv,
const std::vector<TensorView*>& epilogue_inputs) {
const auto tvs_with_vec = scheduler_utils::getInputsOutputsWithInnerDim(
reference_tv, true /*inner_only*/, true /*vectorize_pass*/);

std::vector<TensorView*> epilogue_input_tvs_with_vec = {};
Collaborator:

epilogue_input_tvs_with_vec.reserve(tvs_with_vec.size())

Contributor Author:

For this use case I prefer to rely on the initial size of the vector set during construction, and only if there is not enough space let the STL vector implementation handle the resize (it usually doubles the capacity when the limit is reached).

Details:

The reason is that getInputsOutputsWithInnerDim(..) looks for vectorizable TVs among the fusion's inputs and outputs, while for us the scope is limited to fusion inputs with the INPUT_C role, that is, additional inputs used in the epilogue. I explicitly avoid MMA inputs, as they are processed separately.

As a first example, for a vanilla matmul (D = A x B) this function can return up to 2 TVs (A and B; D is our reference here); neither can be considered, so we end up with an empty epilogue_input_tvs_with_vec.

As a second, more advanced example, with the fusion D = A x B + beta * C, the getInputsOutputsWithInnerDim(..) call may return 3 TVs (A, B, and C), of which only C will be further considered.

With reserve we would most likely incur unneeded memory operations (allocation, copy, and deallocation of the previous buffer). If we were dealing with TensorView objects directly it might be worth considering, but here we operate on pointers.

Collaborator:

I know it is not a big deal in this case, since the number of added elements is very small. I am just a little confused by "With reserve we will have most likely unneeded memory operations" (I think reserve is used to avoid these) and "I prefer to rely on the initial size of vector set during construction" (I think the initial size is 0).

Here is my logic:
Initially, without using reserve, the vector is created with both size and capacity set to 0. As elements are added, its capacity is automatically increased, typically doubling each time: from 1 to 2, then 4, 8, and so on. Each increase in capacity may involve additional memory operations, as the existing elements need to be copied to a new location and the old memory deallocated. Is there any hole in this?

Contributor Author:

Thanks @liqiangxl for the reply and for sharing the details. I double-checked what the standard says about the initial capacity of std::vector: it is unspecified, but most current implementations do start with 0, so your logic has no holes. What I wrote was based on experiments I did in the past, but this appears to have changed. I will update the implementation to follow your advice.

Thanks!

epilogue_input_tvs_with_vec.reserve(epilogue_inputs.size());
std::vector<TensorView*> epilogue_input_tvs_without_vec = {};
epilogue_input_tvs_without_vec.reserve(epilogue_inputs.size());

for (auto* tv : epilogue_inputs) {
if (std::find(tvs_with_vec.cbegin(), tvs_with_vec.cend(), tv) ==
tvs_with_vec.cend()) {
epilogue_input_tvs_without_vec.push_back(tv);
} else {
epilogue_input_tvs_with_vec.push_back(tv);
}
}

return {epilogue_input_tvs_with_vec, epilogue_input_tvs_without_vec};
}

//! Propagates transformations from the fusion output to fusion input tvs that
//! are producers in the epilogue. Propagation targets input tvs that are not
//! assigned core roles, that is, are not MMA inputs.
void scheduleFusionInputsForEpilogue(
const mma_utils::RolesMap& roles_map,
const bool with_smem_epilogue) {
std::vector<TensorView*> cached_tvs;

void scheduleFusionInputsForEpilogue(const mma_utils::RolesMap& roles_map) {
// Handling transformations in fusion input tvs with assigned INPUT_C role by
// propagating fusion output transformations through cached views of INPUT_C
// fusion input tvs and by setting vectorization of the inner most iterdomain
// of these cached views
// propagating fusion output transformations and parallelization through
// cached views of fusion inputs with INPUT_C role.
// NOTE: Vectorization parallelization mode of inner most domain requires
// special care.
if (roles_map.count(MatmulRole::INPUT_C)) {
auto& c_tvs = roles_map.at(MatmulRole::INPUT_C);

// The system supports only scenario where there is only one fusion output
// with assigned OUTPUT_D role, this condition is already verified so there
// is no need for an additional checks here
// with assigned OUTPUT_D role, this condition is already verified so no
// additional checks are needed
auto output_d = roles_map.at(MatmulRole::OUTPUT_D).front();
for (auto* c : c_tvs) {
cached_tvs.push_back(c->cacheAfter());
const auto& c_tvs = roles_map.at(MatmulRole::INPUT_C);

const auto [tvs_with_vectorization, tvs_without_vectorization] =
partitionEpilogueInputsByParallelizationModes(output_d, c_tvs);

const auto transform_propagator =
[&output_d](
const std::vector<TensorView*>& tvs,
const std::unordered_set<ParallelType>& parallel_types) {
std::vector<TensorView*> cached_tvs;
cached_tvs.reserve(tvs.size());
for (auto* tv : tvs) {
cached_tvs.push_back(tv->cacheAfter());
}

scheduler_utils::BoundedDirectionalTransformPropagator::backward(
output_d, -1, tvs);

scheduler_utils::parallelizeAllLike(
output_d, -1, cached_tvs, parallel_types);
};

// Propagate all parallelization modes
if (!tvs_with_vectorization.empty()) {
transform_propagator(tvs_with_vectorization, {});
}

scheduler_utils::BoundedDirectionalTransformPropagator::backward(
output_d, -1, c_tvs);

std::unordered_set<ParallelType> parallel_types = {};
if (with_smem_epilogue) {
//! In cases where smem epilogue feature is enabled, the vectorization of
//! domains will be propagated to fusion inputs that are epilogue inputs,
//! this may result in unaligned memory reads. Vectorization is
//! explicitly excluded form parallelization types to avoid this issue.
//! This should be changed when vectorization analysis is available and
//! enabled for matmul scheduler.
parallel_types = allParallelTypesExcept({ParallelType::Vectorize});
// Propagate all parallelization modes but vectorization
if (!tvs_without_vectorization.empty()) {
transform_propagator(
tvs_without_vectorization,
allParallelTypesExcept({ParallelType::Vectorize}));
}
scheduler_utils::parallelizeAllLike(
output_d, -1, cached_tvs, parallel_types);

// The cached INPUT_C tvs are not needed anymore
cached_tvs.clear();
}
}

} // namespace

void scheduleMatmul(Fusion* fusion, const MatmulParams& params) {
const auto& roles_map_opt = mma_utils::getTensorsRoles(fusion);
const auto roles_map_opt = mma_utils::getTensorsRoles(fusion);

// NOTE: the contents of roles_map have been already validated during
// compute-time checks
Expand Down Expand Up @@ -1009,7 +1047,7 @@ void scheduleMatmul(Fusion* fusion, const MatmulParams& params) {
// operations, input tvs with non-core roles
// core roles: essential for matmul, for example mma inputs' producers
if (has_non_mma_input_tvs) {
scheduleFusionInputsForEpilogue(roles_map, params.use_smem_epilogue);
scheduleFusionInputsForEpilogue(roles_map);
}

// auto inline for all tensors except register tensors and output tensor
12 changes: 9 additions & 3 deletions csrc/scheduler/matmul_utils.cpp
@@ -39,7 +39,7 @@ namespace {
using MatmulLayout = MmaOptions::MmaLayout;
//! Access to the structure should be done with labels defined in
//! MmaOptions::MmaDomains.
using ProblemShape = std::array<int64_t, 3>;
using ProblemShape = std::array<int64_t, 4>;

//! A helper for deciding the type of MMA op for given fusion and problem shape.
inline std::optional<MmaOptions::MacroType> getMmaOp(
@@ -153,11 +153,14 @@ ProblemShape getProblemShape(
NVF_ERROR(false, mma_output_domains.getErrorMsg());
}

const auto [m, n, k] = mma_output_domains.getData();
const auto [m, n, k, batch] = mma_output_domains.getData();

auto m_extend = runtime_info.expressionEvaluator().evaluate(m->extent());
auto n_extend = runtime_info.expressionEvaluator().evaluate(n->extent());
auto k_extend = runtime_info.expressionEvaluator().evaluate(k->extent());
auto batch_extend =
(batch ? runtime_info.expressionEvaluator().evaluate(batch->extent())
: 0L);

if (!(m_extend && n_extend && k_extend)) {
NVF_ERROR(
@@ -172,7 +175,10 @@
}

return ProblemShape{
m_extend.as<int64_t>(), n_extend.as<int64_t>(), k_extend.as<int64_t>()};
m_extend.as<int64_t>(),
n_extend.as<int64_t>(),
k_extend.as<int64_t>(),
batch_extend.as<int64_t>()};
}

std::string isMatmulFusionDefinitionSupported(
42 changes: 34 additions & 8 deletions csrc/scheduler/mma_utils.cpp
@@ -1430,10 +1430,11 @@ ProblemIterDomainsOpt getProblemIterDomains(Fusion* fusion) {
}
const auto mma_output = mma_exprs.front()->out();

// NOTE: the iter domains of MMA output should be [...,M,K,N]
// NOTE: the iter domains of MMA output should be [...,batch,M,K,N]
IterDomain* m = nullptr;
IterDomain* n = nullptr;
IterDomain* k = nullptr;
IterDomain* batch = nullptr;

const auto leaf_domains =
static_cast<const TensorView*>(mma_output)->getLeafDomain();
@@ -1446,9 +1447,17 @@
return ss.str();
}

// M,N are inner most concrete iter domains
m = concrete.rbegin()[1];
n = concrete.rbegin()[0];
using Pos_t = decltype(leaf_domains)::size_type;

// N,M,B are the inner most concrete iter domains in tv, so positions are
// counted from the end of the container
constexpr Pos_t POS_N = 0, POS_M = 1, POS_BATCH = 2;

m = concrete.rbegin()[POS_M];
n = concrete.rbegin()[POS_N];
if (concrete.size() > POS_BATCH) {
batch = concrete.rbegin()[POS_BATCH];
}

// K is a reduction domain, search for the inner most reduction domain
for (auto iter_domain = leaf_domains.rbegin();
@@ -1461,7 +1470,7 @@
}
NVF_ERROR(k != nullptr, "Failed to find K domain in MMA output");

return ProblemIterDomains{m, n, k};
return ProblemIterDomains{m, n, k, batch};
}

MatmulProblemLayoutOpt getMatmulLayout(Fusion* fusion) {
@@ -1560,9 +1569,11 @@ RolesMapOpt getTensorsRoles(Fusion* fusion) {
return mma_output_domains.getErrorMsg();
}

const auto findRolesByDomains = [](const DependenciesMap& deps_map,
RolesMap& roles_map,
const bool processing_output) {
TensorView* invalid_tv = nullptr;
const auto findRolesByDomains = [&invalid_tv](
const DependenciesMap& deps_map,
RolesMap& roles_map,
const bool processing_output) {
for (const auto& entry : deps_map) {
const auto& domains = entry.second;
const auto begin = domains.begin();
@@ -1598,6 +1609,9 @@
roles_map[MatmulRole::OUTPUT_D].push_back(entry.first);
continue;
}

invalid_tv = entry.first;
break;
}
};

@@ -1614,6 +1628,12 @@
resolveTvToMatmulDomainsMapping(
deps_map, mma_input_candidates, m, n, k, ca_map);
findRolesByDomains(deps_map, roles_map, handling_output);
if (invalid_tv) {
std::stringstream ss;
ss << "One of fusion inputs cannot have role assigned! TV details: "
<< invalid_tv->toString() << "\n";
return {ss.str()};
}

deps_map.clear();

@@ -1622,6 +1642,12 @@
resolveTvToMatmulDomainsMapping(
deps_map, mma_output_candidates, m, n, k, ca_map);
findRolesByDomains(deps_map, roles_map, handling_output);
if (invalid_tv) {
std::stringstream ss;
ss << "One of fusion outputs cannot have role assigned! TV details: "
<< invalid_tv->toString() << "\n";
return {ss.str()};
}

return roles_map;
}
7 changes: 3 additions & 4 deletions csrc/scheduler/mma_utils.h
@@ -220,7 +220,7 @@ constexpr size_t MIN_MATMUL_INPUTS_NUMBER = 2;
//! An alias for data structure for passing IterDomains representing problem
//! shape dimensions
//! TODO: extend definition for handling batch matmuls
using ProblemIterDomains = std::array<IterDomain*, 3>;
using ProblemIterDomains = std::array<IterDomain*, 4>;

//! An alias for mapping between TensorView instance and its role in
//! matmul fusion definition, some roles can be assigned to more than
@@ -250,7 +250,7 @@ class DataWrapperOpt {
}
std::string getErrorMsg() const {
if (data.valueless_by_exception() ||
std::holds_alternative<std::string>(data)) {
!std::holds_alternative<std::string>(data)) {
return "Uninitialized data in data holder object";
} else {
return std::get<std::string>(data);
@@ -283,10 +283,9 @@ TORCH_CUDA_CU_API MatmulProblemLayoutOpt getMatmulLayout(Fusion* fusion);
//! Returns wrapped collection of IterDomains that can be used to get
//! problem shape with runtime info.
//! Data is stored in the order in which lables are defined in MatmulDomain
//! enum class, that is in the following order: m, n, k.
//! enum class, that is in the following order: m, n, k, batch.
//! An error message is stored in the returned object if valid data cannot
//! be gathered.
//! TODO: 4th domain must be added for batch gemm support.
TORCH_CUDA_CU_API ProblemIterDomainsOpt getProblemIterDomains(Fusion* fusion);

//! Returns wrapped collection of TensorView roles in fusion.
4 changes: 2 additions & 2 deletions csrc/scheduler/registry.cpp
@@ -2289,7 +2289,7 @@ class MatmulScheduler : public SchedulerEntry {
SchedulerRuntimeInfo& runtime_info,
HeuristicSummary* data_cache = nullptr)
: SchedulerEntry(ScheduleHeuristic::Matmul) {
computeHeuristics(fusion, runtime_info);
computeHeuristics(fusion, runtime_info, data_cache);
}

void schedule(Fusion* fusion) override {
@@ -2327,7 +2327,7 @@
void computeHeuristics(
Fusion* fusion,
SchedulerRuntimeInfo& runtime_info,
HeuristicSummary* data_cache = nullptr) {
HeuristicSummary* data_cache) {
params_ = getMatmulHeuristics(fusion, runtime_info, data_cache);
NVF_ERROR(params_ != nullptr);
}
2 changes: 1 addition & 1 deletion csrc/scheduler/vectorize_helper.cpp
@@ -889,7 +889,7 @@ int64_t getVectorizationFactorTransposeGroup(
for (auto tv : vec_tv) {
auto inner_size_it = contig_inner_map.find(tv);
auto tv_vectorize_factor_opt = inner_size_it == contig_inner_map.end()
? 1
? 1L
: runtime_info.expressionEvaluator().evaluate(inner_size_it->second);
// TODO: Do not assert here. we can just reduce vectorization size to 1 if
// we can't infer an inner size.