From 7a485ed20a665580f0e19c7c7a24b2487678eabb Mon Sep 17 00:00:00 2001
From: Andrzej Bekas <abekas@nvidia.com>
Date: Wed, 23 Aug 2023 16:10:47 +0200
Subject: [PATCH 1/3] Buildsystem, fixes for gcc 9.4

---
 csrc/scheduler/vectorize_helper.cpp |  2 +-
 test/test_resize.cpp                | 38 ++++++++++++++---------------
 2 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/csrc/scheduler/vectorize_helper.cpp b/csrc/scheduler/vectorize_helper.cpp
index 209299eb937..688f6640eca 100644
--- a/csrc/scheduler/vectorize_helper.cpp
+++ b/csrc/scheduler/vectorize_helper.cpp
@@ -889,7 +889,7 @@ int64_t getVectorizationFactorTransposeGroup(
   for (auto tv : vec_tv) {
     auto inner_size_it = contig_inner_map.find(tv);
     auto tv_vectorize_factor_opt = inner_size_it == contig_inner_map.end()
-        ? 1
+        ? 1L
         : runtime_info.expressionEvaluator().evaluate(inner_size_it->second);
     // TODO: Do not assert here. we can just reduce vectorization size to 1 if
     // we can't infer an inner size.
diff --git a/test/test_resize.cpp b/test/test_resize.cpp
index 24b428f1171..e3ba0dc6efb 100644
--- a/test/test_resize.cpp
+++ b/test/test_resize.cpp
@@ -2535,8 +2535,8 @@ TEST_F(ResizeTest, Slice1DVectorizeManual4) {
 
   auto tv1 = slice(
       tv0,
-      {{IrBuilder::create<Val>(1),
-        sub(tv0->axis(0)->extent(), IrBuilder::create<Val>(3))}});
+      {{IrBuilder::create<Val>(1L),
+        sub(tv0->axis(0)->extent(), IrBuilder::create<Val>(3L))}});
   fusion.addOutput(tv1);
 
   tv1->split(0, 4);
@@ -2579,7 +2579,7 @@ TEST_F(ResizeTest, Slice2DVectorizeManual1) {
       tv0,
       {{IrBuilder::create<Val>(slice_offset),
         sub(tv0->axis(0)->extent(), IrBuilder::create<Val>(slice_offset))},
-       {IrBuilder::create<Val>(0), tv0->axis(1)->extent()}});
+       {IrBuilder::create<Val>(0L), tv0->axis(1)->extent()}});
   fusion.addOutput(tv1);
 
   tv1->merge(0);
@@ -2618,9 +2618,9 @@ TEST_F(ResizeTest, Slice3DVectorizeManual1) {
 
   auto tv1 = slice(
       tv0,
-      {{IrBuilder::create<Val>(0), tv0->axis(0)->extent()},
-       {IrBuilder::create<Val>(4), IrBuilder::create<Val>(6)},
-       {IrBuilder::create<Val>(0), tv0->axis(2)->extent()}});
+      {{IrBuilder::create<Val>(0L), tv0->axis(0)->extent()},
+       {IrBuilder::create<Val>(4L), IrBuilder::create<Val>(6L)},
+       {IrBuilder::create<Val>(0L), tv0->axis(2)->extent()}});
   fusion.addOutput(tv1);
 
   // Vectorize tv1 by a factor of 2. The sliced domain and the
@@ -2667,10 +2667,10 @@ TEST_F(ResizeTest, Slice3DVectorizeManual2) {
 
   auto tv1 = slice(
       tv0,
-      {{IrBuilder::create<Val>(0), tv0->axis(0)->extent()},
-       {IrBuilder::create<Val>(0), tv0->axis(1)->extent()},
-       {IrBuilder::create<Val>(0), IrBuilder::create<Val>(1024)},
-       {IrBuilder::create<Val>(0), tv0->axis(3)->extent()}});
+      {{IrBuilder::create<Val>(0L), tv0->axis(0)->extent()},
+       {IrBuilder::create<Val>(0L), tv0->axis(1)->extent()},
+       {IrBuilder::create<Val>(0L), IrBuilder::create<Val>(1024L)},
+       {IrBuilder::create<Val>(0L), tv0->axis(3)->extent()}});
   fusion.addOutput(tv1);
 
   // [4, 1, 1024, 3]
@@ -2709,19 +2709,19 @@ TEST_F(ResizeTest, SliceAndReshapeRepro540Manual) {
 
   auto tv1 = slice(
       tv0,
-      {{IrBuilder::create<Val>(0), tv0->axis(0)->extent()},
-       {IrBuilder::create<Val>(0), tv0->axis(1)->extent()},
-       {IrBuilder::create<Val>(0), IrBuilder::create<Val>(1024)}});
+      {{IrBuilder::create<Val>(0L), tv0->axis(0)->extent()},
+       {IrBuilder::create<Val>(0L), tv0->axis(1)->extent()},
+       {IrBuilder::create<Val>(0L), IrBuilder::create<Val>(1024L)}});
   auto tv2 = slice(
       tv0,
-      {{IrBuilder::create<Val>(0), tv0->axis(0)->extent()},
-       {IrBuilder::create<Val>(0), tv0->axis(1)->extent()},
-       {IrBuilder::create<Val>(1024), IrBuilder::create<Val>(2048)}});
+      {{IrBuilder::create<Val>(0L), tv0->axis(0)->extent()},
+       {IrBuilder::create<Val>(0L), tv0->axis(1)->extent()},
+       {IrBuilder::create<Val>(1024L), IrBuilder::create<Val>(2048L)}});
   auto tv3 = slice(
       tv0,
-      {{IrBuilder::create<Val>(0), tv0->axis(0)->extent()},
-       {IrBuilder::create<Val>(0), tv0->axis(1)->extent()},
-       {IrBuilder::create<Val>(2048), IrBuilder::create<Val>(3072)}});
+      {{IrBuilder::create<Val>(0L), tv0->axis(0)->extent()},
+       {IrBuilder::create<Val>(0L), tv0->axis(1)->extent()},
+       {IrBuilder::create<Val>(2048L), IrBuilder::create<Val>(3072L)}});
 
   auto tv4 = reshape(tv1, {16, 128, 1024}, {16, 128, 16, 64});
   auto tv5 = reshape(tv2, {16, 128, 1024}, {16, 128, 16, 64});

From e0d2b02c2f8521daaf4b582510a581a311f1c4ef Mon Sep 17 00:00:00 2001
From: Andrzej Bekas <abekas@nvidia.com>
Date: Mon, 28 Aug 2023 12:06:31 +0200
Subject: [PATCH 2/3] Matmul scheduler - heuristics infra cleaning

- expose batch iterdomain through tools used by heuristics in matmul
  scheduler,
---
 csrc/mma_type.h                 |  2 +-
 csrc/scheduler/matmul_utils.cpp | 12 +++++++++---
 csrc/scheduler/mma_utils.cpp    | 19 ++++++++++++++-----
 csrc/scheduler/mma_utils.h      |  5 ++---
 4 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/csrc/mma_type.h b/csrc/mma_type.h
index 71187f12da1..8c128b5ea63 100644
--- a/csrc/mma_type.h
+++ b/csrc/mma_type.h
@@ -15,7 +15,7 @@ namespace nvfuser {
 constexpr std::string_view MATMUL_LOG_PREFIX = "[MATMUL DEBUG] ";
 
 //! Named descriptors of domains in matmul
-enum class MatmulDomain { M = 0, N, K };
+enum class MatmulDomain { M = 0, N, K, Batch };
 
 //! Named descriptors of TensorView roles in fusion
 //!  INPUT_A - a producer of MMA input A
diff --git a/csrc/scheduler/matmul_utils.cpp b/csrc/scheduler/matmul_utils.cpp
index 5be71086173..0fa0ac84e3f 100644
--- a/csrc/scheduler/matmul_utils.cpp
+++ b/csrc/scheduler/matmul_utils.cpp
@@ -39,7 +39,7 @@ namespace {
 using MatmulLayout = MmaOptions::MmaLayout;
 //! Access to the structure should be done with labels defined in
 //!  MmaOptions::MmaDomains.
-using ProblemShape = std::array<int64_t, 3>;
+using ProblemShape = std::array<int64_t, 4>;
 
 //! A helper for deciding the type of MMA op for given fusion and problem shape.
 inline std::optional<MmaOptions::MacroType> getMmaOp(
@@ -153,11 +153,14 @@ ProblemShape getProblemShape(
     NVF_ERROR(false, mma_output_domains.getErrorMsg());
   }
 
-  const auto [m, n, k] = mma_output_domains.getData();
+  const auto [m, n, k, batch] = mma_output_domains.getData();
 
   auto m_extend = runtime_info.expressionEvaluator().evaluate(m->extent());
   auto n_extend = runtime_info.expressionEvaluator().evaluate(n->extent());
   auto k_extend = runtime_info.expressionEvaluator().evaluate(k->extent());
+  auto batch_extend =
+      (batch ? runtime_info.expressionEvaluator().evaluate(batch->extent())
+             : 0L);
 
   if (!(m_extend && n_extend && k_extend)) {
     NVF_ERROR(
@@ -172,7 +175,10 @@ ProblemShape getProblemShape(
   }
 
   return ProblemShape{
-      m_extend.as<int64_t>(), n_extend.as<int64_t>(), k_extend.as<int64_t>()};
+      m_extend.as<int64_t>(),
+      n_extend.as<int64_t>(),
+      k_extend.as<int64_t>(),
+      batch_extend.as<int64_t>()};
 }
 
 std::string isMatmulFusionDefinitionSupported(
diff --git a/csrc/scheduler/mma_utils.cpp b/csrc/scheduler/mma_utils.cpp
index 9d4fcafcf17..3d992e783a8 100644
--- a/csrc/scheduler/mma_utils.cpp
+++ b/csrc/scheduler/mma_utils.cpp
@@ -1430,10 +1430,11 @@ ProblemIterDomainsOpt getProblemIterDomains(Fusion* fusion) {
   }
   const auto mma_output = mma_exprs.front()->out();
 
-  // NOTE: the iter domains of MMA output should be [...,M,K,N]
+  // NOTE: the iter domains of MMA output should be [...,batch,M,K,N]
   IterDomain* m = nullptr;
   IterDomain* n = nullptr;
   IterDomain* k = nullptr;
+  IterDomain* batch = nullptr;
 
   const auto leaf_domains =
       static_cast<const TensorView*>(mma_output)->getLeafDomain();
@@ -1446,9 +1447,17 @@ ProblemIterDomainsOpt getProblemIterDomains(Fusion* fusion) {
     return ss.str();
   }
 
-  // M,N are inner most concrete iter domains
-  m = concrete.rbegin()[1];
-  n = concrete.rbegin()[0];
+  using Pos_t = decltype(leaf_domains)::size_type;
+
+  // N,M,B are the inner most concrete iter domains in tv, so positions are
+  //  counted from the end of the container
+  constexpr Pos_t POS_N = 0, POS_M = 1, POS_BATCH = 2;
+
+  m = concrete.rbegin()[POS_M];
+  n = concrete.rbegin()[POS_N];
+  if (concrete.size() > POS_BATCH) {
+    batch = concrete.rbegin()[POS_BATCH];
+  }
 
   // K is a reduction domain, search for the inner most reduction domain
   for (auto iter_domain = leaf_domains.rbegin();
@@ -1461,7 +1470,7 @@ ProblemIterDomainsOpt getProblemIterDomains(Fusion* fusion) {
   }
   NVF_ERROR(k != nullptr, "Failed to find K domain in MMA output");
 
-  return ProblemIterDomains{m, n, k};
+  return ProblemIterDomains{m, n, k, batch};
 }
 
 MatmulProblemLayoutOpt getMatmulLayout(Fusion* fusion) {
diff --git a/csrc/scheduler/mma_utils.h b/csrc/scheduler/mma_utils.h
index 755c74ea36d..b0e09218d5a 100644
--- a/csrc/scheduler/mma_utils.h
+++ b/csrc/scheduler/mma_utils.h
@@ -220,7 +220,7 @@ constexpr size_t MIN_MATMUL_INPUTS_NUMBER = 2;
 //! An alias for data structure for passing IterDomains representing problem
 //! shape dimensions
 //!  TODO: extend definition for handling batch matmuls
-using ProblemIterDomains = std::array<IterDomain*, 3>;
+using ProblemIterDomains = std::array<IterDomain*, 4>;
 
 //! An alias for mapping between TensorView instance and its role in
 //!  matmul fusion definition, some roles can be assigned to more than
@@ -283,10 +283,9 @@ TORCH_CUDA_CU_API MatmulProblemLayoutOpt getMatmulLayout(Fusion* fusion);
 //! Returns wrapped collection of IterDomains that can be used to get
 //!  problem shape with runtime info.
 //!  Data is stored in the order in which lables are defined in MatmulDomain
-//!  enum class, that is in the following order: m, n, k.
+//!  enum class, that is in the following order: m, n, k, batch.
 //!  An error message is stored in retruned object if valid data cannot
 //!  be gathered.
-//!  TODO: 4th domain must be added for batch gemm support.
 TORCH_CUDA_CU_API ProblemIterDomainsOpt getProblemIterDomains(Fusion* fusion);
 
 //! Returns wrapped collection of TensorView roles in fusion.

From 03b1cd7e70ab14df3897f23c582f91811fb2208c Mon Sep 17 00:00:00 2001
From: Andrzej Bekas <abekas@nvidia.com>
Date: Mon, 28 Aug 2023 14:23:19 +0200
Subject: [PATCH 3/3] Matmul scheduler - epilogue input vectorization

- remove WAR for unaligned memory access when smem epilogue is enabled
  and vectorization parallelization mode is propagated,
- introduce infrastructure for vectorization analysis in matmul
  scheduler,
---
 csrc/scheduler/matmul.cpp    | 106 ++++++++++++++++++++++++-----------
 csrc/scheduler/mma_utils.cpp |  23 +++++++-
 csrc/scheduler/mma_utils.h   |   2 +-
 csrc/scheduler/registry.cpp  |   4 +-
 4 files changed, 95 insertions(+), 40 deletions(-)

diff --git a/csrc/scheduler/matmul.cpp b/csrc/scheduler/matmul.cpp
index 064ebcbbda3..d8cdf113ec7 100644
--- a/csrc/scheduler/matmul.cpp
+++ b/csrc/scheduler/matmul.cpp
@@ -14,6 +14,9 @@
 // NOTE: included to avoid compilation error caused by missing destructor in
 // 'SchedulerRuntimeInfo'
 #include <executor_utils.h>
+#include <algorithm>
+#include <iterator>
+#include <vector>
 #include "mma_type.h"
 
 namespace nvfuser {
@@ -614,54 +617,89 @@ void scheduleOutputTensor(
       {c},
       {ParallelType::BIDx, ParallelType::BIDy, ParallelType::BIDz});
 }
+
+//! Assign parallelization modes to epilogue inputs. The returned pair of
+//!  collections contains:
+//!   - 1st: TVs that can have all parallelization modes propagated,
+//!   - 2nd: TVs that can have all but vectorization parallel modes
+//!     propagated,
+std::pair<std::vector<TensorView*>, std::vector<TensorView*>>
+partitionEpilogueInputsByParallelizationModes(
+    TensorView* reference_tv,
+    const std::vector<TensorView*>& epilogue_inputs) {
+  const auto tvs_with_vec = scheduler_utils::getInputsOutputsWithInnerDim(
+      reference_tv, true /*inner_only*/, true /*vectorize_pass*/);
+
+  std::vector<TensorView*> epilogue_input_tvs_with_vec = {};
+  epilogue_input_tvs_with_vec.reserve(epilogue_inputs.size());
+  std::vector<TensorView*> epilogue_input_tvs_without_vec = {};
+  epilogue_input_tvs_without_vec.reserve(epilogue_inputs.size());
+
+  for (auto* tv : epilogue_inputs) {
+    if (std::find(tvs_with_vec.cbegin(), tvs_with_vec.cend(), tv) ==
+        tvs_with_vec.cend()) {
+      epilogue_input_tvs_without_vec.push_back(tv);
+    } else {
+      epilogue_input_tvs_with_vec.push_back(tv);
+    }
+  }
+
+  return {epilogue_input_tvs_with_vec, epilogue_input_tvs_without_vec};
+}
+
 //! Propagates transformations from fusion output to fusion tv inputs that are
 //!  producers in the epilogue. Transformations' propagation aims at input tvs
 //!  which are not assigned to core roles, that is, are not MMA inputs.
-void scheduleFusionInputsForEpilogue(
-    const mma_utils::RolesMap& roles_map,
-    const bool with_smem_epilogue) {
-  std::vector<TensorView*> cached_tvs;
-
+void scheduleFusionInputsForEpilogue(const mma_utils::RolesMap& roles_map) {
   // Handling transformations in fusion input tvs with assigned INPUT_C role by
-  //  propagating fusion output transformations through cached views of INPUT_C
-  //  fusion input tvs and by setting vectorization of the inner most iterdomain
-  //  of these cached views
+  //  propagating fusion output transformations and parallelization through
+  //  cached views of fusion inputs with INPUT_C role.
+  // NOTE: Vectorization parallelization mode of inner most domain requires
+  //  special care.
   if (roles_map.count(MatmulRole::INPUT_C)) {
-    auto& c_tvs = roles_map.at(MatmulRole::INPUT_C);
-
     // The system supports only scenario where there is only one fusion output
-    //  with assigned OUTPUT_D role, this condition is already verified so there
-    //  is no need for an additional checks here
+    //  with assigned OUTPUT_D role, this condition is already verified so no
+    //  additional checks are needed
     auto output_d = roles_map.at(MatmulRole::OUTPUT_D).front();
-    for (auto* c : c_tvs) {
-      cached_tvs.push_back(c->cacheAfter());
+    const auto& c_tvs = roles_map.at(MatmulRole::INPUT_C);
+
+    const auto [tvs_with_vectorization, tvs_without_vectorization] =
+        partitionEpilogueInputsByParallelizationModes(output_d, c_tvs);
+
+    const auto transform_propagator =
+        [&output_d](
+            const std::vector<TensorView*>& tvs,
+            const std::unordered_set<ParallelType>& parallel_types) {
+          std::vector<TensorView*> cached_tvs;
+          cached_tvs.reserve(tvs.size());
+          for (auto* tv : tvs) {
+            cached_tvs.push_back(tv->cacheAfter());
+          }
+
+          scheduler_utils::BoundedDirectionalTransformPropagator::backward(
+              output_d, -1, tvs);
+
+          scheduler_utils::parallelizeAllLike(
+              output_d, -1, cached_tvs, parallel_types);
+        };
+
+    // Propagate all parallelization modes
+    if (!tvs_with_vectorization.empty()) {
+      transform_propagator(tvs_with_vectorization, {});
     }
-
-    scheduler_utils::BoundedDirectionalTransformPropagator::backward(
-        output_d, -1, c_tvs);
-
-    std::unordered_set<ParallelType> parallel_types = {};
-    if (with_smem_epilogue) {
-      //! In cases where smem epilogue feature is enabled, the vectorization of
-      //!  domains will be propagated to fusion inputs that are epilogue inputs,
-      //!  this may result in unaligned memory reads. Vectorization is
-      //!  explicitly excluded form parallelization types to avoid this issue.
-      //! This should be changed when vectorization analysis is available and
-      //!  enabled for matmul scheduler.
-      parallel_types = allParallelTypesExcept({ParallelType::Vectorize});
+    // Propagate all parallelization modes but vectorization
+    if (!tvs_without_vectorization.empty()) {
+      transform_propagator(
+          tvs_without_vectorization,
+          allParallelTypesExcept({ParallelType::Vectorize}));
     }
-    scheduler_utils::parallelizeAllLike(
-        output_d, -1, cached_tvs, parallel_types);
-
-    // The cached INPUT_C tvs are not needed anymore
-    cached_tvs.clear();
   }
 }
 
 } // namespace
 
 void scheduleMatmul(Fusion* fusion, const MatmulParams& params) {
-  const auto& roles_map_opt = mma_utils::getTensorsRoles(fusion);
+  const auto roles_map_opt = mma_utils::getTensorsRoles(fusion);
 
   // NOTE: the contents of roles_map have been already validated during
   //  compute-time checks
@@ -1009,7 +1047,7 @@ void scheduleMatmul(Fusion* fusion, const MatmulParams& params) {
   //  operations, input tvs with non-core roles
   //  core roles: essential for matmul, for example mma inputs' producers
   if (has_non_mma_input_tvs) {
-    scheduleFusionInputsForEpilogue(roles_map, params.use_smem_epilogue);
+    scheduleFusionInputsForEpilogue(roles_map);
   }
 
   // auto inline for all tensors except register tensors and output tensor
diff --git a/csrc/scheduler/mma_utils.cpp b/csrc/scheduler/mma_utils.cpp
index 3d992e783a8..d04b0844cd3 100644
--- a/csrc/scheduler/mma_utils.cpp
+++ b/csrc/scheduler/mma_utils.cpp
@@ -1569,9 +1569,11 @@ RolesMapOpt getTensorsRoles(Fusion* fusion) {
     return mma_output_domains.getErrorMsg();
   }
 
-  const auto findRolesByDomains = [](const DependenciesMap& deps_map,
-                                     RolesMap& roles_map,
-                                     const bool processing_output) {
+  TensorView* invalid_tv = nullptr;
+  const auto findRolesByDomains = [&invalid_tv](
+                                      const DependenciesMap& deps_map,
+                                      RolesMap& roles_map,
+                                      const bool processing_output) {
     for (const auto& entry : deps_map) {
       const auto& domains = entry.second;
       const auto begin = domains.begin();
@@ -1607,6 +1609,9 @@ RolesMapOpt getTensorsRoles(Fusion* fusion) {
         roles_map[MatmulRole::OUTPUT_D].push_back(entry.first);
         continue;
       }
+
+      invalid_tv = entry.first;
+      break;
     }
   };
 
@@ -1623,6 +1628,12 @@ RolesMapOpt getTensorsRoles(Fusion* fusion) {
   resolveTvToMatmulDomainsMapping(
       deps_map, mma_input_candidates, m, n, k, ca_map);
   findRolesByDomains(deps_map, roles_map, handling_output);
+  if (invalid_tv) {
+    std::stringstream ss;
+    ss << "One of fusion inputs cannot have role assigned! TV details: "
+       << invalid_tv->toString() << "\n";
+    return {ss.str()};
+  }
 
   deps_map.clear();
 
@@ -1631,6 +1642,12 @@ RolesMapOpt getTensorsRoles(Fusion* fusion) {
   resolveTvToMatmulDomainsMapping(
       deps_map, mma_output_candidates, m, n, k, ca_map);
   findRolesByDomains(deps_map, roles_map, handling_output);
+  if (invalid_tv) {
+    std::stringstream ss;
+    ss << "One of fusion outputs cannot have role assigned! TV details: "
+       << invalid_tv->toString() << "\n";
+    return {ss.str()};
+  }
 
   return roles_map;
 }
diff --git a/csrc/scheduler/mma_utils.h b/csrc/scheduler/mma_utils.h
index b0e09218d5a..e7dc543c7a7 100644
--- a/csrc/scheduler/mma_utils.h
+++ b/csrc/scheduler/mma_utils.h
@@ -250,7 +250,7 @@ class DataWrapperOpt {
   }
   std::string getErrorMsg() const {
     if (data.valueless_by_exception() ||
-        std::holds_alternative<std::string>(data)) {
+        !std::holds_alternative<std::string>(data)) {
       return "Uninitialized data in data holder object";
     } else {
       return std::get<std::string>(data);
diff --git a/csrc/scheduler/registry.cpp b/csrc/scheduler/registry.cpp
index f2614d61d7e..233a17f8184 100644
--- a/csrc/scheduler/registry.cpp
+++ b/csrc/scheduler/registry.cpp
@@ -2289,7 +2289,7 @@ class MatmulScheduler : public SchedulerEntry {
       SchedulerRuntimeInfo& runtime_info,
       HeuristicSummary* data_cache = nullptr)
       : SchedulerEntry(ScheduleHeuristic::Matmul) {
-    computeHeuristics(fusion, runtime_info);
+    computeHeuristics(fusion, runtime_info, data_cache);
   }
 
   void schedule(Fusion* fusion) override {
@@ -2327,7 +2327,7 @@ class MatmulScheduler : public SchedulerEntry {
   void computeHeuristics(
       Fusion* fusion,
       SchedulerRuntimeInfo& runtime_info,
-      HeuristicSummary* data_cache = nullptr) {
+      HeuristicSummary* data_cache) {
     params_ = getMatmulHeuristics(fusion, runtime_info, data_cache);
     NVF_ERROR(params_ != nullptr);
   }