From 1e459509cce2b98f14b0449019794ae32fd18733 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Thu, 30 May 2024 16:50:39 -0700 Subject: [PATCH 01/22] WIP: IdModel-based indexing --- CMakeLists.txt | 2 + csrc/device_lower/lower2device.cpp | 2 +- csrc/device_lower/utils.cpp | 22 + csrc/device_lower/utils.h | 2 + csrc/id_model/id_model.h | 9 + csrc/id_model/indexing.cpp | 753 +++++++++++++++++++++++++++++ csrc/id_model/indexing.h | 80 +++ csrc/id_model/utils.h | 55 +++ csrc/ir/utils.h | 16 + tests/cpp/test_indexing.cpp | 123 +++++ 10 files changed, 1063 insertions(+), 1 deletion(-) create mode 100644 csrc/id_model/indexing.cpp create mode 100644 csrc/id_model/indexing.h create mode 100644 csrc/id_model/utils.h create mode 100644 tests/cpp/test_indexing.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ed87fd7bbbe..6be651ca58c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,6 +135,7 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/host_ir/executor.cpp ${NVFUSER_SRCS_DIR}/host_ir/host_ir.cpp ${NVFUSER_SRCS_DIR}/id_model/id_model.cpp + ${NVFUSER_SRCS_DIR}/id_model/indexing.cpp ${NVFUSER_SRCS_DIR}/id_model/loop_promotion.cpp ${NVFUSER_SRCS_DIR}/id_model/schedule.cpp ${NVFUSER_SRCS_DIR}/id_model/to_string.cpp @@ -506,6 +507,7 @@ list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/tests/cpp/test_gpu_transpose.cpp ${NVFUSER_ROOT}/tests/cpp/test_gpu_utils.cpp ${NVFUSER_ROOT}/tests/cpp/test_id_model.cpp + ${NVFUSER_ROOT}/tests/cpp/test_indexing.cpp ${NVFUSER_ROOT}/tests/cpp/test_iter_visitor.cpp ${NVFUSER_ROOT}/tests/cpp/test_linked_hash_map.cpp ${NVFUSER_ROOT}/tests/cpp/test_loop_rotation.cpp diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index 0206ab7f885..6871e6a592d 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -390,7 +390,7 @@ void GpuLower::analysis(Fusion* fusion) { // functionality should be affected. 
New IterDomains may be created, // so it is expected that generated code may use diffrent variable // names - if (isOptionEnabled(EnableOption::IdModel)) { + if (true || isOptionEnabled(EnableOption::IdModel)) { IdModel id_model(fusion_); } diff --git a/csrc/device_lower/utils.cpp b/csrc/device_lower/utils.cpp index 9e1883b3d91..657effa5442 100644 --- a/csrc/device_lower/utils.cpp +++ b/csrc/device_lower/utils.cpp @@ -920,6 +920,28 @@ std::array getMmaLayout(const MmaOp* expr) { return layout; } +// Returns true if expr is an expression that initializes a reduction +// buffer. +bool isReductionInitExpr(const Expr* expr) { + // False if its output isn't a TensorView + if (!ir_utils::isTvOp(expr)) { + return false; + } + // False if it doesn't have any reduction axis + const auto out_tv = ir_utils::getTvOutput(expr); + if (!out_tv->domain()->hasReduction()) { + return false; + } + // False if it has have TensorView inputs as initialization should + // never use TensorViews + const auto tv_filter_inp_view = + ir_utils::filterByType(expr->inputs()); + if (tv_filter_inp_view.begin() != tv_filter_inp_view.end()) { + return false; + } + return true; +} + } // namespace lower_utils } // namespace nvfuser diff --git a/csrc/device_lower/utils.h b/csrc/device_lower/utils.h index 77acc095750..0d1e0b5f96a 100644 --- a/csrc/device_lower/utils.h +++ b/csrc/device_lower/utils.h @@ -322,6 +322,8 @@ Val* getNumThreadsInTensorView(TensorView* tv); //! Get the unit dimensions of A and B for the given MmaOp. 
std::array getMmaLayout(const MmaOp* expr); +bool isReductionInitExpr(const Expr* expr); + } // namespace lower_utils } // namespace nvfuser diff --git a/csrc/id_model/id_model.h b/csrc/id_model/id_model.h index 5b60ff474c6..b912978ca48 100644 --- a/csrc/id_model/id_model.h +++ b/csrc/id_model/id_model.h @@ -153,6 +153,15 @@ class IdModel : public PolymorphicBase { std::string toString() const; + bool empty() const { + return tvs_.empty(); + } + + Fusion* fusion() const { + NVF_ERROR(!tvs_.empty()); + return tvs_.at(0)->fusion(); + } + // Build all graphs, i.e., Exact, AlmostExact, Permissive and // LOOP. This is by default called from the constructor void buildAllGraphs(); diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp new file mode 100644 index 00000000000..9a912b4b4b1 --- /dev/null +++ b/csrc/id_model/indexing.cpp @@ -0,0 +1,753 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace nvfuser { + +namespace { + +class IndexingTraversal : public ValGraphBFS { + public: + IndexingTraversal( + const ValGraph& graph, + std::vector from_groups, + std::vector to_groups) + : ValGraphBFS(graph, from_groups, to_groups) {} + + virtual ~IndexingTraversal() = default; + + static ExprPath getExprsBetween( + const std::vector& from_domains, + const std::vector& to_domains, + const ValGraph& graph) { + const ValGroups from_groups = graph.toGroups(from_domains); + const ValGroups to_groups = graph.toGroups(to_domains); + + IndexingTraversal traversal( + graph, + {from_groups.vector().begin(), from_groups.vector().end()}, + {to_groups.vector().begin(), to_groups.vector().end()}); + traversal.traverse(); + return traversal.getShortestExprPath(); + } + + using ValGraphBFS::isVisited; + + bool isDependencySatisfied(const GroupType& group) const override { + if (const ValGroup* vg = std::get_if(&group); + vg != nullptr && (*vg)->front()->as()->isBroadcast()) { + VERBOSE() << "Dependency satisfied as it's broadcast" << std::endl; + return true; + } + return ValGraphBFS::isDependencySatisfied(group); + } +}; + +// Get the promotion domain of a given loop domain. +IterDomain* getLoopPromotion(IterDomain* loop_id, const IdModel& id_model) { + const auto& loop_graph = id_model.idGraph(IdMappingMode::LOOP); + const auto& loop_promotion_map = id_model.loopPromotionMap(); + const auto& loop_group = loop_graph.toGroup(loop_id); + + auto loop_promotion_map_it = loop_promotion_map.find(loop_group); + NVF_ERROR( + loop_promotion_map_it != loop_promotion_map.end(), + "No loop promotion found: ", + loop_id->toString(), + ". 
Loop group: ", + nvfuser::toString(loop_group)); + + return loop_promotion_map_it->second; +} + +// Get the loop domains of a given expr, which are (potentially +// promoted) loop domains of the consumer tensor. +std::vector getLoopDomains( + const Expr* expr, + const IdModel& id_model) { + // Assume consumer-based indexing. Needs to revisit for ops like + // scatter + auto loop_domains = ir_utils::getTvOutput(expr)->getLeafDomain(); + + // If this is an expr initializing a buffer for a reduction, there + // should be no loops for reduction domains + if (lower_utils::isReductionInitExpr(expr)) { + loop_domains.erase( + std::remove_if( + loop_domains.begin(), + loop_domains.end(), + [](IterDomain* id) -> bool { return id->isReduction(); }), + loop_domains.end()); + } + + for (auto& loop_id : loop_domains) { + loop_id = getLoopPromotion(loop_id, id_model); + } + + return loop_domains; +} + +bool isAllocated(IterDomain* id, const TensorView* tv) { + // If the extent is 1, it's effectively the same as broadcast. + return ir_utils::isShared(tv->getMemoryType(), id->getParallelType()) && + !id->isBroadcast() && !id->isReduction() && !id->extent()->isOneInt(); +} + +Val* getAllocationStride(TensorView* tv, int64_t alloc_dim) { + const auto& alloc_dom = tv->getMaybeAllocationDomain(); + int64_t stride_dim = -1; + for (const auto i : c10::irange(alloc_dim + 1)) { + if (alloc_dom.at(i)->isReduction()) { + continue; + } + ++stride_dim; + } + if (stride_dim == -1) { + return nullptr; + } + + return IrBuilder::getItemExpr( + IrBuilder::getAttrExpr(IrBuilder::metadataExpr(tv), "alloc_stride"), + stride_dim); +} + +// Currently it's only Shared or Local but Global can be the case +// too. +bool isAllocationBasedOnLeaf(TensorView* tv) { + return tv->getMemoryType() == MemoryType::Shared || + tv->getMemoryType() == MemoryType::Local; +} + +// Get the allocation domains of a given tensor. 
Also returns its +// strides as well as +std::tuple, std::vector> getAllocationDomains( + TensorView* tv, + const IdModel& id_model) { + std::vector allocation_domains; + + auto inlining_pos = tv->getComputeAtPosition(); + + bool use_set_allocatin_domain = false; + + if (tv->hasAllocation()) { + if (tv->getMemoryType() == MemoryType::Shared || + tv->getMemoryType() == MemoryType::Local) { + if (std::is_permutation( + tv->getLeafDomain().begin(), + tv->getLeafDomain().end(), + tv->getAllocationDomain().begin())) { + use_set_allocatin_domain = true; + } + } else { + use_set_allocatin_domain = true; + } + } + + // Ignore allocation of non-global tensors for now + if (use_set_allocatin_domain) { + allocation_domains = tv->getAllocationDomain(); + NVF_ERROR(!tv->isDoubleBuffered()); + } else { + // If allocation domain is not set, assume that: + // Local/Shared: leaf domains to the right of the CA position + // Global: rfactor domains + if (tv->getMemoryType() == MemoryType::Global) { + VERBOSE() << "Tv does not have allocation of " << tv->toString() << ", " + << toDelimitedString(tv->getMaybeAllocationDomain()) + << std::endl; + allocation_domains = tv->getRFactorDomain(); + NVF_ERROR(!tv->isDoubleBuffered()); + } else if (tv->getMemoryType() == MemoryType::Shared) { + for (const auto i : c10::irange(tv->nDims())) { + auto leaf_id = tv->axis(i); + VERBOSE() << "Smem leaf domain: " << leaf_id->toString() << " of " + << tv->toString() << std::endl; + auto pt = leaf_id->getParallelType(); + if (isParallelTypeDeviceDim(pt) || isParallelTypeBlockDim(pt)) { + continue; + } + if (i < inlining_pos && !isParallelTypeThreadDim(pt)) { + continue; + } + allocation_domains.push_back(leaf_id); + } + } else { + allocation_domains = { + tv->getLeafDomain().begin() + inlining_pos, + tv->getLeafDomain().end()}; + } + } + + // TODO: Fix alloation domains with vectorization + // This is an ugly workaround, but the allocation domain of a tensor + // with vectorized domains may not be the 
same as the leaf fomain + // since the vectorized domain must be at the innermost position in + // the allocation domain, but it's allowed to be located anywhwere + // in the leaf domain. + // This shouldn't be necessary for global memory tensors as their + // allocation domains are rfactor domains + { + if (tv->getMemoryType() != MemoryType::Global) { + IterDomain* id_to_move_back = nullptr; + // Vectorized load + if (tv->definition() != nullptr && tv->definition()->isA() && + tv->definition()->as()->opType() == + LoadStoreOpType::Set) { + auto vec_it = std::find_if( + allocation_domains.begin(), + allocation_domains.end(), + [](auto index_domain) -> bool { + return isParallelTypeVectorize(index_domain->getParallelType()); + }); + if (vec_it != allocation_domains.end() && + *vec_it != allocation_domains.back()) { + id_to_move_back = *vec_it; + } + } else { + for (const auto ls_use : + ir_utils::filterByType(tv->uses())) { + if (ls_use->opType() != LoadStoreOpType::Set) { + continue; + } + auto consumer_tv = ls_use->out()->as(); + auto vec_it = std::find_if( + consumer_tv->getLeafDomain().begin(), + consumer_tv->getLeafDomain().end(), + [](auto consumer_leaf_id) -> bool { + return isParallelTypeVectorize( + consumer_leaf_id->getParallelType()); + }); + if (vec_it == consumer_tv->getLeafDomain().end()) { + continue; + } + const auto& vec_group = + id_model.idGraph(IdMappingMode::EXACT).toGroup(*vec_it); + auto index_it = std::find_if( + allocation_domains.begin(), + allocation_domains.end(), + [&](auto index_id) -> bool { return vec_group->has(index_id); }); + if (index_it == allocation_domains.end() || + *index_it == allocation_domains.back()) { + continue; + } + + id_to_move_back = *index_it; + } + } + + if (id_to_move_back != nullptr) { + // reorder the vec domain to the end of the index domains + std::vector reordered_index_domains; + reordered_index_domains.reserve(allocation_domains.size()); + for (const auto id : allocation_domains) { + if (id != 
id_to_move_back) { + reordered_index_domains.push_back(id); + } + } + reordered_index_domains.push_back(id_to_move_back); + allocation_domains = reordered_index_domains; + } + } + } + + auto tv_for_promotion = tv; + + std::vector strides(allocation_domains.size(), nullptr); + for (const auto i : c10::irange(allocation_domains.size())) { + auto dim = allocation_domains.size() - i - 1; + auto index_domain = allocation_domains.at(dim); + + if (index_domain->isReduction() || index_domain->isBroadcast()) { + continue; + } + + strides[dim] = getAllocationStride(tv, (int64_t)dim); + } + + std::vector actual_index_domains; + std::vector actual_strides; + std::vector actual_contiguity; + for (const auto i : c10::irange(allocation_domains.size())) { + auto index_domain = allocation_domains.at(i); + if (!isAllocated(index_domain, tv)) { + continue; + } + + // If it's a leaf domain, the promoted domain is the true domain + // for allocation and indexing. + bool is_leaf = std::find( + tv_for_promotion->getLeafDomain().begin(), + tv_for_promotion->getLeafDomain().end(), + index_domain) != tv->getLeafDomain().end(); + auto actual_id = + is_leaf ? 
getLoopPromotion(index_domain, id_model) : index_domain; + VERBOSE() << "Index domain: " << index_domain->toString() + << ", actual domain (promotion domain): " << actual_id->toString() + << std::endl; + + actual_index_domains.push_back(actual_id); + actual_strides.push_back(strides.at(i)); + NVF_ERROR( + actual_strides.back() != nullptr, + "Stride unknown for ", + index_domain->toString(), + " (promoted to ", + actual_id->toString(), + ")"); + } + + NVF_ERROR(actual_index_domains.size() == actual_strides.size()); + + return {actual_index_domains, actual_strides}; +} + +ExprPath getIndexingTraversalPath( + const Expr* expr, + const std::vector& from_domains, + const std::vector& to_domains, + const ValGraph& traversal_graph) { + VERBOSE() << "getIndexingTraversalPath: " << toDelimitedString(from_domains) + << " -> " << toDelimitedString(to_domains) << std::endl; + + auto indexing_path = IndexingTraversal::getExprsBetween( + from_domains, to_domains, traversal_graph); + + VERBOSE() << "Indexing path:\n"; + for (const auto& [expr_group, direction] : indexing_path) { + Expr* expr = expr_group->front(); + VERBOSE() << direction << " " << expr->toString(); + } + VERBOSE() << "--- path done ---\n"; + + return indexing_path; +} + +class IdGraphIndexCompute : public OptOutDispatch { + public: + IdGraphIndexCompute( + const ValGraph& exact_graph, + const std::unordered_map& initial_index_map) + : traversal_graph_(exact_graph), index_map_(initial_index_map) {} + + using OptOutDispatch::handle; + + void handle(Split* split) override; + + void handle(Merge* merge) override; + + bool isForward(Expr* expr) const; + + bool hasIndex(IterDomain* id) const; + + Val* getIndex(IterDomain* id) const; + + void setIndex(IterDomain* id, Val* idx); + + std::unordered_map indexMap() const { + return index_map_; + } + + void propagate(const ExprGroup& expr_group, Direction direction) { + NVF_ERROR(!expr_group->empty()); + + dispatch(expr_group->front()); + } + + private: + const ValGraph& 
traversal_graph_; + std::unordered_map index_map_; +}; + +bool IdGraphIndexCompute::hasIndex(IterDomain* id) const { + // If it's a broadcast, its index is always zero. + if (id->isBroadcast()) { + return true; + } + const ValGroup& id_group = traversal_graph_.toGroup(id); + return index_map_.find(id_group) != index_map_.end(); +} + +Val* IdGraphIndexCompute::getIndex(IterDomain* id) const { + // If it's a broadcast, its index is always zero. + if (id->isBroadcast()) { + return id->fusion()->zeroVal(); + } + const ValGroup& id_group = traversal_graph_.toGroup(id); + auto it = index_map_.find(id_group); + NVF_ERROR(it != index_map_.end(), "Index not found: ", id->toString()); + return it->second; +} + +void IdGraphIndexCompute::setIndex(IterDomain* id, Val* idx) { + VERBOSE() << "setIndex: " << id->name() << " -> " << idx->toInlineString() + << std::endl; + const ValGroup& id_group = traversal_graph_.toGroup(id); + index_map_.emplace(id_group, idx); +} + +bool IdGraphIndexCompute::isForward(Expr* expr) const { + bool ready = true; + for (const auto inp : ir_utils::filterByType(expr->inputs())) { + if (!hasIndex(inp)) { + VERBOSE() << "No index for input: " << inp->toString() << std::endl; + ready = false; + break; + } + } + if (ready) { + return true; + } + + // Can just return false here. Just make sure the outputs are + // already processed + for (const auto out : ir_utils::filterByType(expr->outputs())) { + NVF_ERROR(hasIndex(out), "Output index not found: ", out->toString()); + } + + return false; +} + +void IdGraphIndexCompute::handle(Split* split) { + const bool is_forward = isForward(split); + + VERBOSE() << "IdGraphIndexCompute handle (" << (is_forward ? 
"fwd" : "bwd") + << "): " << split->toString(); + + if (is_forward) { + auto in_idx = getIndex(split->in()); + auto inner_extent = split->inner()->extent(); + auto outer_idx = SimplifyingIrBuilder::divExpr(in_idx, inner_extent); + Val* inner_idx = nullptr; + inner_idx = SimplifyingIrBuilder::modExpr(in_idx, inner_extent); + setIndex(split->outer(), outer_idx); + setIndex(split->inner(), inner_idx); + } else { + auto outer_idx = getIndex(split->outer()); + auto inner_idx = getIndex(split->inner()); + auto inner_extent = split->inner()->extent(); + auto in_idx = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr(outer_idx, inner_extent), inner_idx); + setIndex(split->in(), in_idx); + } +} + +void IdGraphIndexCompute::handle(Merge* merge) { + const bool is_forward = isForward(merge); + + VERBOSE() << "IdGraphIndexCompute handle (" << (is_forward ? "fwd" : "bwd") + << "): " << merge->toString(); + + // TODO: use getMaybeExpandedExtent? + auto inner_ext = merge->inner()->extent(); + + if (is_forward) { + auto outer_idx = getIndex(merge->outer()); + auto inner_idx = getIndex(merge->inner()); + auto out_idx = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr(outer_idx, inner_ext), inner_idx); + setIndex(merge->out(), out_idx); + } else { + auto out_idx = getIndex(merge->out()); + auto outer_idx = SimplifyingIrBuilder::divExpr(out_idx, inner_ext); + setIndex(merge->outer(), outer_idx); + Val* inner_idx = SimplifyingIrBuilder::modExpr(out_idx, inner_ext); + setIndex(merge->inner(), inner_idx); + } +} + +ParallelType getParallelType(const ValGroup& loop_group) { + ParallelType common_pt = ParallelType::Serial; + for (const auto val : *loop_group) { + auto pt = val->as()->getParallelType(); + if (common_pt == pt || pt == ParallelType::Serial) { + continue; + } else if (common_pt == ParallelType::Serial) { + common_pt = pt; + } else { + // Inconsistent parallelization + NVF_ERROR( + false, + "Inconsistent parallelization detected. 
", + "Known type: ", + common_pt, + "New type: ", + pt); + } + } + + return common_pt; +} + +kir::ForLoop* getForLoop( + IterDomain* loop_id, + const std::vector& for_loops, + const ValGraph& loop_graph) { + auto it = std::find_if( + for_loops.begin(), for_loops.end(), [&](kir::ForLoop* for_loop) -> bool { + IterDomain* for_loop_id = for_loop->iter_domain(); + return loop_graph.disjointValSets().strictAreMapped( + loop_id, for_loop_id); + }); + if (it != for_loops.end()) { + return *it; + } else { + return nullptr; + } +} + +} // namespace + +TensorIndexer::TensorIndexer(const IdModel& id_model) : id_model_(id_model) { + buildLoopIndexMap(); +} + +void TensorIndexer::buildLoopIndexMap() { + if (id_model_.empty()) { + return; + } + + Fusion* fusion = id_model_.fusion(); + FusionGuard fg(fusion); + + auto shouldUseZeroIndex = [&](const ValGroup& loop_group) -> bool { + ParallelType ptype = getParallelType(loop_group); + if (isParallelTypeThread(ptype)) { + return false; + } + + // The device paralle type is not included in "isThread". We don't + // allocate any index variable for device-parallel domains. + if (isParallelTypeDeviceDim(ptype)) { + return true; + } + + // All loops in this set are non-parallel, non-concretized broadcast + // iterdomains, their "index variable" should be zero. + if (std::all_of(loop_group->begin(), loop_group->end(), [](Val* val) { + return val->as()->isBroadcast(); + })) { + VERBOSE() << "All domains are broadcast: " + << nvfuser::toString(loop_group) << std::endl; + return true; + } + + // Trivial loop + // TODO: consider expanded extent? 
+ auto leaf_id = + getLoopPromotion(loop_group->front()->as(), id_model_); + if (!leaf_id->maybePartial() && + simplifyExpr(leaf_id->extent())->isOneInt()) { + return true; + } + + return false; + }; + + for (auto expr : fusion->exprs()) { + if (!ir_utils::isTvOp(expr)) { + continue; + } + auto tv_output = ir_utils::getTvOutput(expr); + for (auto leaf_id : tv_output->getLeafDomain()) { + const ValGroup& loop_group = + id_model_.idGraph(IdMappingMode::LOOP).toGroup(leaf_id); + + if (loop_index_map_.find(loop_group) != loop_index_map_.end()) { + // Index already assigned + continue; + } + + // TODO: halo loop not considered + // TODO: double buffering not considered + + Val* loop_index = nullptr; + + // First allocate thread and grid parallel indices: + // The validation pass will check that the parallel bindings within the + // loop nodes are consistent so all the loops within this disjoint set + // will be realized implicitly using parallel index variables. + ParallelType ptype = getParallelType(loop_group); + if (isParallelTypeThread(ptype)) { + loop_index = NamedScalar::getParallelIndex(ptype); + } else if (shouldUseZeroIndex(loop_group)) { + VERBOSE() << "Use zero for " << nvfuser::toString(loop_group) + << std::endl; + loop_index = fusion->zeroVal(); + } else { + // Everything now should be serial concrete loops. For the mean + // time, just use the same index integer val generated for + // ComputeAtMap if available. 
+ if (GpuLower::hasCurrent()) { + const auto& ca_map = GpuLower::current()->caMap(); + for (const auto& id : + ir_utils::filterByType(loop_group->vector())) { + if (!ca_map->getIdSets(IdMappingMode::LOOP).mappingExists(id)) { + continue; + } + VERBOSE() << "Trying to find index val for " << id->toString() + << std::endl; + loop_index = ca_map->getIndexVariable(id); + break; + } + if (loop_index == nullptr) { + VERBOSE() << "No existing index found for " + << nvfuser::toString(loop_group) << std::endl; + } + } else { + loop_index = IrBuilder::create(DataType::Index); + } + } + loop_index_map_[loop_group] = loop_index; + VERBOSE() << "Loop index map: " << nvfuser::toString(loop_group) << " -> " + << loop_index->toInlineString() << std::endl; + } + } +} + +Val* TensorIndexer::getLoopIndex(IterDomain* loop_id) const { + // loop_id must be a loop domain. + const auto& loop_group = + id_model_.idGraph(IdMappingMode::LOOP).toGroup(loop_id); + auto loop_index_map_it = loop_index_map_.find(loop_group); + NVF_ERROR( + loop_index_map_it != loop_index_map_.end(), + "No loop index found for ", + loop_id->toString()); + + Val* loop_index = loop_index_map_it->second; + return loop_index; +} + +std::unordered_map TensorIndexer::getInitialIndexMap( + const Expr* expr, + const std::vector& loop_domains) const { + // loop_index_map_ is a map on the loop graph. For index + // propagation, need a map for the exact graph + + std::unordered_map initial_index_map; + + for (IterDomain* loop_id : loop_domains) { + Val* loop_index = getLoopIndex(loop_id); + const auto& exact_group = traversalGraph().toGroup(loop_id); + VERBOSE() << "Setting initial index. " << loop_id->toString() << ", " + << nvfuser::toString(exact_group) << ", " + << loop_index->toInlineString() << std::endl; + + if (initial_index_map.find(exact_group) != initial_index_map.end()) { + // Initial index already set. This can happen as exact_group is + // actually an almost-exact group. It should be just size-1 + // domain. 
+ NVF_ERROR( + loop_index->isZeroInt(), + "Unexpected initial index: ", + loop_index->toInlineString()); + auto existing_index = initial_index_map.at(exact_group); + NVF_ERROR( + existing_index->isZeroInt(), + "Unexpected initial index: ", + existing_index->toInlineString()); + continue; + } + + NVF_ERROR( + initial_index_map.emplace(exact_group, loop_index).second, + "Initial index already set for ", + nvfuser::toString(exact_group), + ". Existing: ", + initial_index_map.at(exact_group)->toInlineString(), + ". New: ", + loop_index->toInlineString()); + } + + return initial_index_map; +} + +Val* TensorIndexer::getTensorIndex( + TensorView* tv, + const Expr* expr, + const std::optional>& for_loops) { + VERBOSE() << "getIndex of " << tv->toString() << " in " << expr->toString(); + + const auto [allocation_domains, strides] = + getAllocationDomains(tv, id_model_); + + VERBOSE() << "Allocation domains: " << toDelimitedString(allocation_domains) + << std::endl; + + const auto& index_info = getIndex(expr, allocation_domains); + const auto& index_map = index_info.index_map; + + // Linearize the indices with strides. 
+ // TODO: Contiguous indexing + + Val* index = tv->fusion()->zeroVal(); + for (const auto i : c10::irange(allocation_domains.size())) { + // Traverse from innermost to outermost + IterDomain* allocation_domain = + allocation_domains.at(allocation_domains.size() - 1 - i); + + Val* stride = strides.at(allocation_domains.size() - 1 - i); + + auto idx_it = index_map.find(traversalGraph().toGroup(allocation_domain)); + NVF_ERROR( + idx_it != index_map.end(), + "Index not found for ", + allocation_domain->toString()); + Val* idx = idx_it->second; + VERBOSE() << "Index of " << allocation_domain->toString() << ": " + << idx->toInlineString() << std::endl; + + index = SimplifyingIrBuilder::addExpr( + index, SimplifyingIrBuilder::mulExpr(idx, stride)); + } + + VERBOSE() << "Final index: " << index->toInlineString() << std::endl; + + return index; +} + +IndexingInfo TensorIndexer::getIndex( + const Expr* expr, + const std::vector& index_domains) const { + const auto loop_domains = getLoopDomains(expr, id_model_); + VERBOSE() << "Loop domains: " << toDelimitedString(loop_domains) << std::endl; + + VERBOSE() << "Index domains: " << toDelimitedString(index_domains) + << std::endl; + + const ExprPath traversal_path = getIndexingTraversalPath( + expr, loop_domains, index_domains, traversalGraph()); + + const std::unordered_map initial_index_map = + getInitialIndexMap(expr, loop_domains); + + IdGraphIndexCompute index_compute(traversalGraph(), initial_index_map); + + for (const auto& [expr_group, direction] : traversal_path) { + index_compute.propagate(expr_group, direction); + } + + IndexingInfo info{traversal_path, index_compute.indexMap()}; + return info; +} + +} // namespace nvfuser diff --git a/csrc/id_model/indexing.h b/csrc/id_model/indexing.h new file mode 100644 index 00000000000..dfd7c847964 --- /dev/null +++ b/csrc/id_model/indexing.h @@ -0,0 +1,80 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. 
+ * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once + +#include +#include +#include +#include +#include +#include + +// Just for RootPredicateInfo. Should be moved to its own header file +#include + +#include + +namespace nvfuser { + +struct IndexingInfo { + ExprPath traversal_path; + std::unordered_map index_map; +}; + +// The basic algorithm of indexing is: +// +// 1. Find the loop domains +// 2. Find the allocation domains +// 3. Find the path from the loop domains to the allocation domains +// 4. Set the initial index vals +// 5. Propagate the initial indices of the loop domains to the allocation +// domains +class TensorIndexer { + public: + TensorIndexer(const IdModel& id_model); + + // The actual ForLoop's are required to support double buffering as + // that affects indexing. If the loops parameter is empty, it's + // simply ignored. That may be useful if (preliminary) indeices are + // needed before the double buffering pass + Val* getTensorIndex( + TensorView* tv, + const Expr* expr, + const std::optional>& loops); + + private: + const ValGraph& traversalGraph() const { + return id_model_.idGraph(IdMappingMode::ALMOSTEXACT); + } + + // Build the map of loop groups to their index Vals. + void buildLoopIndexMap(); + + // Get the index of a loop domain. + Val* getLoopIndex(IterDomain* loop_id) const; + + // + std::unordered_map getInitialIndexMap( + const Expr* expr, + const std::vector& loop_domains) const; + + IndexingInfo getIndex( + const Expr* expr, + const std::vector& index_domains) const; + + private: + const IdModel& id_model_; + + // Mappings from loop groups to their indices. Serial loops will + // be mapped a unique loop index Val. Parallel loops will be mapped + // to NamedScalar such as "threadIdx.x". This map needs to be built + // once and can be reused for different tensors. 
+ std::unordered_map loop_index_map_; +}; + +} // namespace nvfuser diff --git a/csrc/id_model/utils.h b/csrc/id_model/utils.h new file mode 100644 index 00000000000..2d6327bf586 --- /dev/null +++ b/csrc/id_model/utils.h @@ -0,0 +1,55 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once + +#include + +#include +#include +#include + +#define VERBOSE() verbose(__LINE__) +#define WARN() warn(__LINE__) + +namespace nvfuser { + +// Temporary logging utility +class DebugStream { + public: + DebugStream() + : enabled_(getNvFuserEnv("ID_MODEL_VERBOSE")), out_(std::cerr) {} + + template + DebugStream& operator<<(const T& v) { + if (enabled_) { + out_ << v; + } + return *this; + } + + DebugStream& operator<<(std::ostream& (*endl)(std::ostream&)) { + if (enabled_) { + out_ << endl; + } + return *this; + } + + private: + bool enabled_ = false; + std::ostream& out_; +}; + +inline DebugStream verbose(int line) { + return DebugStream() << "[DEBUG@" << line << "] "; +} + +inline DebugStream warn(int line) { + return DebugStream() << "[WARN@" << line << "] "; +} + +} // namespace nvfuser diff --git a/csrc/ir/utils.h b/csrc/ir/utils.h index a4b5454fa2c..4323cc50e33 100644 --- a/csrc/ir/utils.h +++ b/csrc/ir/utils.h @@ -654,4 +654,20 @@ std::optional> computePermutation( bool hasTrivialAllocationDomain(const TensorView* tv); +// Returns true if memory_type is shared in parallel_type +inline bool isShared(MemoryType memory_type, ParallelType parallel_type) { + switch (memory_type) { + case MemoryType::Local: + return !isParallelTypeThread(parallel_type) && + !isParallelTypeDeviceDim(parallel_type); + case MemoryType::Shared: + return !isParallelTypeBlockDim(parallel_type) && + !isParallelTypeDeviceDim(parallel_type); + case MemoryType::Global: + return !isParallelTypeDeviceDim(parallel_type); + default: + 
NVF_ERROR(false, "Unknown MemoryType: ", memory_type); + } +} + } // namespace nvfuser::ir_utils diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp new file mode 100644 index 00000000000..be268dcfb85 --- /dev/null +++ b/tests/cpp/test_indexing.cpp @@ -0,0 +1,123 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nvfuser { + +using IndexingTest = NVFuserTest; + +TEST_F(IndexingTest, Simple1) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + + auto tv1 = add(tv0, IrBuilder::create(1.0)); + fusion.addOutput(tv1); + + tv1->merge(0); + tv1->split(0, 4); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::cerr << indexer.getTensorIndex(tv1, tv1->definition(), std::nullopt) + ->toInlineString() + << std::endl; + std::cerr << indexer.getTensorIndex(tv0, tv1->definition(), std::nullopt) + ->toInlineString() + << std::endl; +} + +TEST_F(IndexingTest, Simple2) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(1); + fusion.addInput(tv0); + + auto tv1 = add(tv0, IrBuilder::create(1.0)); + auto tv2 = add(tv0, IrBuilder::create(1.0)); + fusion.addOutput(tv1); + fusion.addOutput(tv2); + + tv1->split(0, 4); + tv2->split(0, 8); + + tv1->axis(1)->parallelize(ParallelType::TIDx); + tv2->axis(1)->parallelize(ParallelType::TIDx); + + fusion.printKernel(); + + std::vector input_shape{17}; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(input_shape, options); + std::vector aten_inputs({t0}); + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); + auto cg_outputs = fe.runFusion(aten_inputs); + + 
testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); +} + +TEST_F(IndexingTest, Reshape) { + Fusion fusion; + FusionGuard fg(&fusion); + + const std::vector shape1({100}); + const std::vector shape2({4, 25}); + const std::vector shape3({5, 2, 10}); + + // [i0] + auto tv0 = makeContigConcreteTensor(shape1); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + + // [i2, i3] + auto tv2 = reshape(tv1, shape1, shape2); + + // [i2, i3] + auto tv3 = add(tv2, fusion.oneVal()); + + // [i4, i5, i6] + auto tv4 = reshape(tv3, shape2, shape3); + + // [i4, i5, i6] + auto tv5 = add(tv4, fusion.oneVal()); + + fusion.addOutput(tv5); + + TransformPropagator propagator(tv5); + MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator); + + inlineMost(); + + fusion.print(); + + IdModel id_model(&fusion, true, false, false); +} + +} // namespace nvfuser From f333d42e92fe88333648923c09958067348bb9e4 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Tue, 4 Jun 2024 14:16:47 -0700 Subject: [PATCH 02/22] cleanup --- csrc/id_model/indexing.cpp | 378 ++++++++++-------------------------- csrc/id_model/indexing.h | 40 ++-- tests/cpp/test_indexing.cpp | 4 +- 3 files changed, 136 insertions(+), 286 deletions(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index 9a912b4b4b1..0e06feb9e34 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -25,43 +25,6 @@ namespace nvfuser { namespace { -class IndexingTraversal : public ValGraphBFS { - public: - IndexingTraversal( - const ValGraph& graph, - std::vector from_groups, - std::vector to_groups) - : ValGraphBFS(graph, from_groups, to_groups) {} - - virtual ~IndexingTraversal() = default; - - static ExprPath getExprsBetween( - const std::vector& from_domains, - const std::vector& to_domains, - const ValGraph& graph) { - const ValGroups from_groups = graph.toGroups(from_domains); - const ValGroups to_groups = graph.toGroups(to_domains); - - IndexingTraversal traversal( - graph, - 
{from_groups.vector().begin(), from_groups.vector().end()}, - {to_groups.vector().begin(), to_groups.vector().end()}); - traversal.traverse(); - return traversal.getShortestExprPath(); - } - - using ValGraphBFS::isVisited; - - bool isDependencySatisfied(const GroupType& group) const override { - if (const ValGroup* vg = std::get_if(&group); - vg != nullptr && (*vg)->front()->as()->isBroadcast()) { - VERBOSE() << "Dependency satisfied as it's broadcast" << std::endl; - return true; - } - return ValGraphBFS::isDependencySatisfied(group); - } -}; - // Get the promotion domain of a given loop domain. IterDomain* getLoopPromotion(IterDomain* loop_id, const IdModel& id_model) { const auto& loop_graph = id_model.idGraph(IdMappingMode::LOOP); @@ -138,7 +101,7 @@ bool isAllocationBasedOnLeaf(TensorView* tv) { } // Get the allocation domains of a given tensor. Also returns its -// strides as well as +// strides. std::tuple, std::vector> getAllocationDomains( TensorView* tv, const IdModel& id_model) { @@ -197,78 +160,6 @@ std::tuple, std::vector> getAllocationDomains( } } - // TODO: Fix alloation domains with vectorization - // This is an ugly workaround, but the allocation domain of a tensor - // with vectorized domains may not be the same as the leaf fomain - // since the vectorized domain must be at the innermost position in - // the allocation domain, but it's allowed to be located anywhwere - // in the leaf domain. 
- // This shouldn't be necessary for global memory tensors as their - // allocation domains are rfactor domains - { - if (tv->getMemoryType() != MemoryType::Global) { - IterDomain* id_to_move_back = nullptr; - // Vectorized load - if (tv->definition() != nullptr && tv->definition()->isA() && - tv->definition()->as()->opType() == - LoadStoreOpType::Set) { - auto vec_it = std::find_if( - allocation_domains.begin(), - allocation_domains.end(), - [](auto index_domain) -> bool { - return isParallelTypeVectorize(index_domain->getParallelType()); - }); - if (vec_it != allocation_domains.end() && - *vec_it != allocation_domains.back()) { - id_to_move_back = *vec_it; - } - } else { - for (const auto ls_use : - ir_utils::filterByType(tv->uses())) { - if (ls_use->opType() != LoadStoreOpType::Set) { - continue; - } - auto consumer_tv = ls_use->out()->as(); - auto vec_it = std::find_if( - consumer_tv->getLeafDomain().begin(), - consumer_tv->getLeafDomain().end(), - [](auto consumer_leaf_id) -> bool { - return isParallelTypeVectorize( - consumer_leaf_id->getParallelType()); - }); - if (vec_it == consumer_tv->getLeafDomain().end()) { - continue; - } - const auto& vec_group = - id_model.idGraph(IdMappingMode::EXACT).toGroup(*vec_it); - auto index_it = std::find_if( - allocation_domains.begin(), - allocation_domains.end(), - [&](auto index_id) -> bool { return vec_group->has(index_id); }); - if (index_it == allocation_domains.end() || - *index_it == allocation_domains.back()) { - continue; - } - - id_to_move_back = *index_it; - } - } - - if (id_to_move_back != nullptr) { - // reorder the vec domain to the end of the index domains - std::vector reordered_index_domains; - reordered_index_domains.reserve(allocation_domains.size()); - for (const auto id : allocation_domains) { - if (id != id_to_move_back) { - reordered_index_domains.push_back(id); - } - } - reordered_index_domains.push_back(id_to_move_back); - allocation_domains = reordered_index_domains; - } - } - } - auto 
tv_for_promotion = tv; std::vector strides(allocation_domains.size(), nullptr); @@ -320,34 +211,26 @@ std::tuple, std::vector> getAllocationDomains( return {actual_index_domains, actual_strides}; } -ExprPath getIndexingTraversalPath( - const Expr* expr, - const std::vector& from_domains, - const std::vector& to_domains, - const ValGraph& traversal_graph) { - VERBOSE() << "getIndexingTraversalPath: " << toDelimitedString(from_domains) - << " -> " << toDelimitedString(to_domains) << std::endl; - - auto indexing_path = IndexingTraversal::getExprsBetween( - from_domains, to_domains, traversal_graph); - - VERBOSE() << "Indexing path:\n"; - for (const auto& [expr_group, direction] : indexing_path) { - Expr* expr = expr_group->front(); - VERBOSE() << direction << " " << expr->toString(); - } - VERBOSE() << "--- path done ---\n"; - - return indexing_path; -} - +// Similar to IndexCompute but adapted for the graph-based indexing class IdGraphIndexCompute : public OptOutDispatch { public: IdGraphIndexCompute( - const ValGraph& exact_graph, + const ValGraph& traversal_graph, const std::unordered_map& initial_index_map) - : traversal_graph_(exact_graph), index_map_(initial_index_map) {} + : traversal_graph_(traversal_graph), index_map_(initial_index_map) {} + + // Propagate the index map through a given expr of a specified + // direction. + void propagate(const ExprGroup& expr_group, Direction direction) { + NVF_ERROR(!expr_group->empty()); + dispatch(expr_group->front()); + } + + const std::unordered_map indexMap() const { + return index_map_; + } + private: using OptOutDispatch::handle; void handle(Split* split) override; @@ -356,20 +239,30 @@ class IdGraphIndexCompute : public OptOutDispatch { bool isForward(Expr* expr) const; - bool hasIndex(IterDomain* id) const; - - Val* getIndex(IterDomain* id) const; - - void setIndex(IterDomain* id, Val* idx); + bool hasIndex(IterDomain* id) const { + // If it's a broadcast, its index is always zero. 
+ if (id->isBroadcast()) { + return true; + } + return indexMap().find(toGroup(id)) != indexMap().end(); + } - std::unordered_map indexMap() const { - return index_map_; + Val* getIndex(IterDomain* id) const { + // If it's a broadcast, its index is always zero. + if (id->isBroadcast()) { + return id->fusion()->zeroVal(); + } + auto it = index_map_.find(toGroup(id)); + NVF_ERROR(it != index_map_.end(), "Index not found: ", id->toString()); + return it->second; } - void propagate(const ExprGroup& expr_group, Direction direction) { - NVF_ERROR(!expr_group->empty()); + void setIndex(IterDomain* id, Val* idx) { + index_map_.emplace(toGroup(id), idx); + } - dispatch(expr_group->front()); + const ValGroup& toGroup(IterDomain* id) const { + return traversal_graph_.toGroup(id); } private: @@ -377,33 +270,7 @@ class IdGraphIndexCompute : public OptOutDispatch { std::unordered_map index_map_; }; -bool IdGraphIndexCompute::hasIndex(IterDomain* id) const { - // If it's a broadcast, its index is always zero. - if (id->isBroadcast()) { - return true; - } - const ValGroup& id_group = traversal_graph_.toGroup(id); - return index_map_.find(id_group) != index_map_.end(); -} - -Val* IdGraphIndexCompute::getIndex(IterDomain* id) const { - // If it's a broadcast, its index is always zero. 
- if (id->isBroadcast()) { - return id->fusion()->zeroVal(); - } - const ValGroup& id_group = traversal_graph_.toGroup(id); - auto it = index_map_.find(id_group); - NVF_ERROR(it != index_map_.end(), "Index not found: ", id->toString()); - return it->second; -} - -void IdGraphIndexCompute::setIndex(IterDomain* id, Val* idx) { - VERBOSE() << "setIndex: " << id->name() << " -> " << idx->toInlineString() - << std::endl; - const ValGroup& id_group = traversal_graph_.toGroup(id); - index_map_.emplace(id_group, idx); -} - +// TODO: Should use the explicit direction bool IdGraphIndexCompute::isForward(Expr* expr) const { bool ready = true; for (const auto inp : ir_utils::filterByType(expr->inputs())) { @@ -456,7 +323,6 @@ void IdGraphIndexCompute::handle(Merge* merge) { VERBOSE() << "IdGraphIndexCompute handle (" << (is_forward ? "fwd" : "bwd") << "): " << merge->toString(); - // TODO: use getMaybeExpandedExtent? auto inner_ext = merge->inner()->extent(); if (is_forward) { @@ -474,6 +340,13 @@ void IdGraphIndexCompute::handle(Merge* merge) { } } +} // namespace + +TensorIndexer::TensorIndexer(const IdModel& id_model) : id_model_(id_model) { + buildLoopIndexMap(); +} + +namespace { ParallelType getParallelType(const ValGroup& loop_group) { ParallelType common_pt = ParallelType::Serial; for (const auto val : *loop_group) { @@ -496,71 +369,14 @@ ParallelType getParallelType(const ValGroup& loop_group) { return common_pt; } - -kir::ForLoop* getForLoop( - IterDomain* loop_id, - const std::vector& for_loops, - const ValGraph& loop_graph) { - auto it = std::find_if( - for_loops.begin(), for_loops.end(), [&](kir::ForLoop* for_loop) -> bool { - IterDomain* for_loop_id = for_loop->iter_domain(); - return loop_graph.disjointValSets().strictAreMapped( - loop_id, for_loop_id); - }); - if (it != for_loops.end()) { - return *it; - } else { - return nullptr; - } -} - } // namespace -TensorIndexer::TensorIndexer(const IdModel& id_model) : id_model_(id_model) { - buildLoopIndexMap(); -} - 
void TensorIndexer::buildLoopIndexMap() { if (id_model_.empty()) { return; } Fusion* fusion = id_model_.fusion(); - FusionGuard fg(fusion); - - auto shouldUseZeroIndex = [&](const ValGroup& loop_group) -> bool { - ParallelType ptype = getParallelType(loop_group); - if (isParallelTypeThread(ptype)) { - return false; - } - - // The device paralle type is not included in "isThread". We don't - // allocate any index variable for device-parallel domains. - if (isParallelTypeDeviceDim(ptype)) { - return true; - } - - // All loops in this set are non-parallel, non-concretized broadcast - // iterdomains, their "index variable" should be zero. - if (std::all_of(loop_group->begin(), loop_group->end(), [](Val* val) { - return val->as()->isBroadcast(); - })) { - VERBOSE() << "All domains are broadcast: " - << nvfuser::toString(loop_group) << std::endl; - return true; - } - - // Trivial loop - // TODO: consider expanded extent? - auto leaf_id = - getLoopPromotion(loop_group->front()->as(), id_model_); - if (!leaf_id->maybePartial() && - simplifyExpr(leaf_id->extent())->isOneInt()) { - return true; - } - - return false; - }; for (auto expr : fusion->exprs()) { if (!ir_utils::isTvOp(expr)) { @@ -576,9 +392,6 @@ void TensorIndexer::buildLoopIndexMap() { continue; } - // TODO: halo loop not considered - // TODO: double buffering not considered - Val* loop_index = nullptr; // First allocate thread and grid parallel indices: @@ -589,8 +402,6 @@ void TensorIndexer::buildLoopIndexMap() { if (isParallelTypeThread(ptype)) { loop_index = NamedScalar::getParallelIndex(ptype); } else if (shouldUseZeroIndex(loop_group)) { - VERBOSE() << "Use zero for " << nvfuser::toString(loop_group) - << std::endl; loop_index = fusion->zeroVal(); } else { // Everything now should be serial concrete loops. 
For the mean @@ -603,26 +414,63 @@ void TensorIndexer::buildLoopIndexMap() { if (!ca_map->getIdSets(IdMappingMode::LOOP).mappingExists(id)) { continue; } - VERBOSE() << "Trying to find index val for " << id->toString() - << std::endl; loop_index = ca_map->getIndexVariable(id); break; } - if (loop_index == nullptr) { - VERBOSE() << "No existing index found for " - << nvfuser::toString(loop_group) << std::endl; - } + NVF_ERROR(loop_index != nullptr, + "No existing index found for ", + nvfuser::toString(loop_group)); } else { + // Not reusing the ComputeATMap index assignments loop_index = IrBuilder::create(DataType::Index); } } + + NVF_ERROR(loop_index != nullptr); loop_index_map_[loop_group] = loop_index; - VERBOSE() << "Loop index map: " << nvfuser::toString(loop_group) << " -> " - << loop_index->toInlineString() << std::endl; } } } +bool TensorIndexer::shouldUseZeroIndex(const ValGroup& loop_group) const { + // For parallelized domains that have index NamedScalar's such as + // threadIdx.x, just use the NamedScalar. It doesn't automatically + // mean such parallel indices are actually used in the final index + // expr. For example, TID-parallelized Local tensors won't have + // TID-parallelized iter domains as allocation domains, so threadIdx + // won't appear in the final index expr. + ParallelType ptype = getParallelType(loop_group); + if (isParallelTypeThread(ptype)) { + return false; + } + + // Note that the device paralle type is not included in + // "isThread". This is necessary because we don't have a NamedScalar + // for DID. Since it's always partitioned in any memory space + // currently supported, it's guaranteed to be zero. + if (isParallelTypeDeviceDim(ptype)) { + return true; + } + + // All loops in this set are non-parallel, non-concretized broadcast + // iterdomains, their "index variable" should be zero. 
+ if (std::all_of(loop_group->begin(), loop_group->end(), [](Val* val) { + return val->as()->isBroadcast(); + })) { + return true; + } + + // Trivial loop + auto leaf_id = + getLoopPromotion(loop_group->front()->as(), id_model_); + if (!leaf_id->maybePartial() && + simplifyExpr(leaf_id->extent())->isOneInt()) { + return true; + } + + return false; +} + Val* TensorIndexer::getLoopIndex(IterDomain* loop_id) const { // loop_id must be a loop domain. const auto& loop_group = @@ -638,29 +486,23 @@ Val* TensorIndexer::getLoopIndex(IterDomain* loop_id) const { } std::unordered_map TensorIndexer::getInitialIndexMap( - const Expr* expr, const std::vector& loop_domains) const { - // loop_index_map_ is a map on the loop graph. For index - // propagation, need a map for the exact graph - std::unordered_map initial_index_map; + // For a given list of the loop domains, assign its corresponding + // index Val. for (IterDomain* loop_id : loop_domains) { Val* loop_index = getLoopIndex(loop_id); - const auto& exact_group = traversalGraph().toGroup(loop_id); - VERBOSE() << "Setting initial index. " << loop_id->toString() << ", " - << nvfuser::toString(exact_group) << ", " - << loop_index->toInlineString() << std::endl; - - if (initial_index_map.find(exact_group) != initial_index_map.end()) { - // Initial index already set. This can happen as exact_group is - // actually an almost-exact group. It should be just size-1 - // domain. + const auto& almost_exact_group = traversalGraph().toGroup(loop_id); + + if (initial_index_map.find(almost_exact_group) != initial_index_map.end()) { + // Initial index already set. This can happen as this is an + // almost exact group. It should be just size-1 domain. 
NVF_ERROR( loop_index->isZeroInt(), "Unexpected initial index: ", loop_index->toInlineString()); - auto existing_index = initial_index_map.at(exact_group); + auto existing_index = initial_index_map.at(almost_exact_group); NVF_ERROR( existing_index->isZeroInt(), "Unexpected initial index: ", @@ -668,23 +510,15 @@ std::unordered_map TensorIndexer::getInitialIndexMap( continue; } - NVF_ERROR( - initial_index_map.emplace(exact_group, loop_index).second, - "Initial index already set for ", - nvfuser::toString(exact_group), - ". Existing: ", - initial_index_map.at(exact_group)->toInlineString(), - ". New: ", - loop_index->toInlineString()); + initial_index_map.emplace(almost_exact_group, loop_index); } return initial_index_map; } -Val* TensorIndexer::getTensorIndex( +Val* TensorIndexer::getLinearIndex( TensorView* tv, - const Expr* expr, - const std::optional>& for_loops) { + const Expr* expr) { VERBOSE() << "getIndex of " << tv->toString() << " in " << expr->toString(); const auto [allocation_domains, strides] = @@ -693,7 +527,7 @@ Val* TensorIndexer::getTensorIndex( VERBOSE() << "Allocation domains: " << toDelimitedString(allocation_domains) << std::endl; - const auto& index_info = getIndex(expr, allocation_domains); + const auto& index_info = computeIndex(expr, allocation_domains); const auto& index_map = index_info.index_map; // Linearize the indices with strides. 
@@ -725,7 +559,7 @@ Val* TensorIndexer::getTensorIndex( return index; } -IndexingInfo TensorIndexer::getIndex( +IndexingInfo TensorIndexer::computeIndex( const Expr* expr, const std::vector& index_domains) const { const auto loop_domains = getLoopDomains(expr, id_model_); @@ -734,11 +568,13 @@ IndexingInfo TensorIndexer::getIndex( VERBOSE() << "Index domains: " << toDelimitedString(index_domains) << std::endl; - const ExprPath traversal_path = getIndexingTraversalPath( - expr, loop_domains, index_domains, traversalGraph()); + const ValGroups loop_groups = traversalGraph().toGroups(loop_domains); + const ValGroups index_groups = traversalGraph().toGroups(index_domains); + const ExprPath traversal_path = ValGraphBFS::getExprsBetween( + traversalGraph(), loop_groups, index_groups); const std::unordered_map initial_index_map = - getInitialIndexMap(expr, loop_domains); + getInitialIndexMap(loop_domains); IdGraphIndexCompute index_compute(traversalGraph(), initial_index_map); diff --git a/csrc/id_model/indexing.h b/csrc/id_model/indexing.h index dfd7c847964..180a289fa78 100644 --- a/csrc/id_model/indexing.h +++ b/csrc/id_model/indexing.h @@ -22,7 +22,9 @@ namespace nvfuser { struct IndexingInfo { + // Indexing traversal path from loop domains ExprPath traversal_path; + // Index mappings of ID groups along the traversal path std::unordered_map index_map; }; @@ -31,42 +33,54 @@ struct IndexingInfo { // 1. Find the loop domains // 2. Find the allocation domains // 3. Find the path from the loop domains to the allocation domains -// 4. Set the initial index vals +// 4. Set the initial index vals for the loop domains // 5. Propagate the initial indices of the loop domains to the allocation // domains +// +// The indexing traversal is done on the AlmostExact graph augmented +// with the loop promotion map since both the loop and allocations +// domains may be promoted. 
class TensorIndexer { public: TensorIndexer(const IdModel& id_model); - // The actual ForLoop's are required to support double buffering as - // that affects indexing. If the loops parameter is empty, it's - // simply ignored. That may be useful if (preliminary) indeices are - // needed before the double buffering pass - Val* getTensorIndex( - TensorView* tv, - const Expr* expr, - const std::optional>& loops); + // Get a linear index of a given tensor appearing in a given expr, either + // as a consumer or a producer. The predicate indexing will have a + // separate interface. + Val* getLinearIndex(TensorView* tv, const Expr* expr); private: + // The AlmostExact graph is used since size-1 splits and merges + // should not affect actual index exprs. const ValGraph& traversalGraph() const { return id_model_.idGraph(IdMappingMode::ALMOSTEXACT); } - // Build the map of loop groups to their index Vals. + // Build a map of loop groups to their index Vals. See the comment + // on loop_index_map_. void buildLoopIndexMap(); // Get the index of a loop domain. Val* getLoopIndex(IterDomain* loop_id) const; - // + // Propagate the loop indices of a given list of loop domains to the + // traversal graph (i.e., the AlmostExact graph). Uses the loop + // index map, which is built for the Loop graph. std::unordered_map getInitialIndexMap( - const Expr* expr, const std::vector& loop_domains) const; - IndexingInfo getIndex( + // Returns the index map as well as its traversal path of given + // index domains appearing in a given expr. Used by + // getLinearIndex. + IndexingInfo computeIndex( const Expr* expr, const std::vector& index_domains) const; + // Check if the loop index of a a loop group should be always + // just zero. For example, a loop group with an extent of one, i.e., + // a broadcast-only loop group, should just use zero. 
+ bool shouldUseZeroIndex(const ValGroup& loop_group) const; + private: const IdModel& id_model_; diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index be268dcfb85..c0f02a04d41 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -41,10 +41,10 @@ TEST_F(IndexingTest, Simple1) { IdModel id_model(&fusion); TensorIndexer indexer(id_model); - std::cerr << indexer.getTensorIndex(tv1, tv1->definition(), std::nullopt) + std::cerr << indexer.getLinearIndex(tv1, tv1->definition()) ->toInlineString() << std::endl; - std::cerr << indexer.getTensorIndex(tv0, tv1->definition(), std::nullopt) + std::cerr << indexer.getLinearIndex(tv0, tv1->definition()) ->toInlineString() << std::endl; } From 7cc7acf42d9d2b2f12ba76a4c573d2b70b594db8 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Tue, 4 Jun 2024 14:17:54 -0700 Subject: [PATCH 03/22] cleanup --- csrc/id_model/indexing.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index 0e06feb9e34..a3524b75dbf 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -240,18 +240,10 @@ class IdGraphIndexCompute : public OptOutDispatch { bool isForward(Expr* expr) const; bool hasIndex(IterDomain* id) const { - // If it's a broadcast, its index is always zero. - if (id->isBroadcast()) { - return true; - } return indexMap().find(toGroup(id)) != indexMap().end(); } Val* getIndex(IterDomain* id) const { - // If it's a broadcast, its index is always zero. - if (id->isBroadcast()) { - return id->fusion()->zeroVal(); - } auto it = index_map_.find(toGroup(id)); NVF_ERROR(it != index_map_.end(), "Index not found: ", id->toString()); return it->second; From 39ad06a0916071a15e13d082f9f5501793f7df6e Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Thu, 30 May 2024 16:50:39 -0700 Subject: [PATCH 04/22] Initial PR of IdModel-based indexing Only includes minimum logic for basic indexing. 
Notably, no support for broadcast in this PR. --- CMakeLists.txt | 2 + csrc/device_lower/lower2device.cpp | 2 +- csrc/device_lower/utils.cpp | 22 ++ csrc/device_lower/utils.h | 2 + csrc/id_model/id_model.h | 9 + csrc/id_model/indexing.cpp | 443 +++++++++++++++++++++++++++++ csrc/id_model/indexing.h | 100 +++++++ csrc/id_model/utils.h | 55 ++++ csrc/ir/utils.h | 36 +++ tests/cpp/test_indexing.cpp | 384 +++++++++++++++++++++++++ 10 files changed, 1054 insertions(+), 1 deletion(-) create mode 100644 csrc/id_model/indexing.cpp create mode 100644 csrc/id_model/indexing.h create mode 100644 csrc/id_model/utils.h create mode 100644 tests/cpp/test_indexing.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ed87fd7bbbe..6be651ca58c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,6 +135,7 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/host_ir/executor.cpp ${NVFUSER_SRCS_DIR}/host_ir/host_ir.cpp ${NVFUSER_SRCS_DIR}/id_model/id_model.cpp + ${NVFUSER_SRCS_DIR}/id_model/indexing.cpp ${NVFUSER_SRCS_DIR}/id_model/loop_promotion.cpp ${NVFUSER_SRCS_DIR}/id_model/schedule.cpp ${NVFUSER_SRCS_DIR}/id_model/to_string.cpp @@ -506,6 +507,7 @@ list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/tests/cpp/test_gpu_transpose.cpp ${NVFUSER_ROOT}/tests/cpp/test_gpu_utils.cpp ${NVFUSER_ROOT}/tests/cpp/test_id_model.cpp + ${NVFUSER_ROOT}/tests/cpp/test_indexing.cpp ${NVFUSER_ROOT}/tests/cpp/test_iter_visitor.cpp ${NVFUSER_ROOT}/tests/cpp/test_linked_hash_map.cpp ${NVFUSER_ROOT}/tests/cpp/test_loop_rotation.cpp diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index c50f160eb51..1a464417631 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -391,7 +391,7 @@ void GpuLower::analysis(Fusion* fusion) { // functionality should be affected. 
New IterDomains may be created, // so it is expected that generated code may use diffrent variable // names - if (isOptionEnabled(EnableOption::IdModel)) { + if (true || isOptionEnabled(EnableOption::IdModel)) { IdModel id_model(fusion_); } diff --git a/csrc/device_lower/utils.cpp b/csrc/device_lower/utils.cpp index 955c1513dd3..bbf0526e461 100644 --- a/csrc/device_lower/utils.cpp +++ b/csrc/device_lower/utils.cpp @@ -920,6 +920,28 @@ std::array getMmaLayout(const MmaOp* expr) { return layout; } +// Returns true if expr is an expression that initializes a reduction +// buffer. +bool isReductionInitExpr(const Expr* expr) { + // False if its output isn't a TensorView + if (!ir_utils::isTvOp(expr)) { + return false; + } + // False if it doesn't have any reduction axis + const auto out_tv = ir_utils::getTvOutput(expr); + if (!out_tv->domain()->hasReduction()) { + return false; + } + // False if it has have TensorView inputs as initialization should + // never use TensorViews + const auto tv_filter_inp_view = + ir_utils::filterByType(expr->inputs()); + if (tv_filter_inp_view.begin() != tv_filter_inp_view.end()) { + return false; + } + return true; +} + } // namespace lower_utils } // namespace nvfuser diff --git a/csrc/device_lower/utils.h b/csrc/device_lower/utils.h index 60f3459814a..cad0ae5b996 100644 --- a/csrc/device_lower/utils.h +++ b/csrc/device_lower/utils.h @@ -322,6 +322,8 @@ Val* getNumThreadsInTensorView(TensorView* tv); //! Get the unit dimensions of A and B for the given MmaOp. 
std::array getMmaLayout(const MmaOp* expr); +bool isReductionInitExpr(const Expr* expr); + } // namespace lower_utils } // namespace nvfuser diff --git a/csrc/id_model/id_model.h b/csrc/id_model/id_model.h index 5b60ff474c6..b912978ca48 100644 --- a/csrc/id_model/id_model.h +++ b/csrc/id_model/id_model.h @@ -153,6 +153,15 @@ class IdModel : public PolymorphicBase { std::string toString() const; + bool empty() const { + return tvs_.empty(); + } + + Fusion* fusion() const { + NVF_ERROR(!tvs_.empty()); + return tvs_.at(0)->fusion(); + } + // Build all graphs, i.e., Exact, AlmostExact, Permissive and // LOOP. This is by default called from the constructor void buildAllGraphs(); diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp new file mode 100644 index 00000000000..c80b6c3548d --- /dev/null +++ b/csrc/id_model/indexing.cpp @@ -0,0 +1,443 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace nvfuser { + +namespace { + +// True if a given domain is a loop doamin of a given tensor and its +// loop is partitioned with respect to the memory type of the tensor +bool isPartitionedLoop(TensorView* tv, IterDomain* id) { + // False if id is not a loop ID + if (std::find(tv->getLeafDomain().begin(), tv->getLeafDomain().end(), id) == + tv->getLeafDomain().end()) { + return false; + } + + // If the memory of this domain is partitioned with respect to the + // parallel type of the domain, there's no allocation for the domain + return ir_utils::isPartitionedMemory( + tv->getMemoryType(), id->getParallelType()); +} + +bool isSizeOneDomain(IterDomain* id) { + return id->isBroadcast() || id->isReduction() || id->extent()->isOneInt(); +} + +// True if a given domain of a tensor *may* require allocation +bool mayRequireAllocation(TensorView* tv, IterDomain* id) { + return !isPartitionedLoop(tv, id) && !isSizeOneDomain(id); +} + +// Get the allocation stride of a given allocation domain +Val* getStrideOfGlobalMemoryTensor(TensorView* tv, int64_t alloc_dim) { + NVF_ERROR(tv->getMemoryType() == MemoryType::Global); + + // Allocation domains can include reduction domains, but + // alloc_stride arraies do not. + const auto& alloc_dom = tv->getMaybeAllocationDomain(); + int64_t stride_dim = -1; + for (const auto i : c10::irange(alloc_dim + 1)) { + if (alloc_dom.at(i)->isReduction()) { + continue; + } + ++stride_dim; + } + + NVF_ERROR(stride_dim != -1); + + return IrBuilder::getItemExpr( + IrBuilder::getAttrExpr(IrBuilder::metadataExpr(tv), "alloc_stride"), + stride_dim); +} + +// Get the allocation domains of a given tensor. Also returns its +// strides. 
+// +// TODO: Ideally, all tensors should have their correct allocation +// domains, but that isn't always the case at this moment. The logic +// here is duplicated in multiple locations and should be cleaned up. +std::tuple, std::vector> getAllocationDomains( + TensorView* tv, + const IdModel& id_model) { + std::vector allocation_domains; + std::vector> contiguity; + + // Use the allocation domain if set for the tensor + if (tv->hasAllocation()) { + allocation_domains = tv->getAllocationDomain(); + contiguity = tv->domain()->contiguity(); + } else { + // If allocation domain is not set, assume that: + // - Global: logical domains + // - Local/Shared: loop domains to the right of the CA position + const auto inlining_pos = tv->getComputeAtPosition(); + if (tv->getMemoryType() == MemoryType::Global) { + allocation_domains = tv->getRFactorDomain(); + contiguity = tv->domain()->contiguity(); + } else { + for (const auto i : c10::irange(tv->nDims())) { + auto loop_id = tv->getLeafDomain().at(i); + auto pt = loop_id->getParallelType(); + if (!mayRequireAllocation(tv, loop_id)) { + continue; + } + + // If the position is left of the inlinig position, no need to + // alloate the domain unless it's shared. For example, if this + // is a Shared tensor and the domain is parallelized with TID, + // even if it's outside of the CA position, since the domain + // is shared, it must be allocated. 
+ if (i < inlining_pos && + !ir_utils::isSharedMemory(tv->getMemoryType(), pt)) { + continue; + } + + allocation_domains.push_back(loop_id); + } + // Assume Local and Shared are always fully contiguous + contiguity = + std::vector>(allocation_domains.size(), true); + } + } + + // Compute the strides from innermost to outermost domains + std::vector strides(allocation_domains.size(), nullptr); + Val* cur_contig_stride = tv->fusion()->oneVal(); + for (const auto i : c10::irange(allocation_domains.size())) { + auto dim = allocation_domains.size() - i - 1; + auto allocation_domain = allocation_domains.at(dim); + + if (!mayRequireAllocation(tv, allocation_domain)) { + continue; + } + + const std::optional contig_flag = contiguity.at(dim); + // Broadcast doesn't have contig flag but it must have been + // already filtered out + NVF_ERROR(contig_flag.has_value()); + + if (contig_flag.value()) { + strides[dim] = cur_contig_stride; + cur_contig_stride = SimplifyingIrBuilder::mulExpr( + allocation_domains.at(dim)->extent(), cur_contig_stride); + } else { + // Assume that the tensor should always be a Global memory + // tensor if it has non-contig allocation domains + NVF_ERROR(tv->getMemoryType() == MemoryType::Global); + strides[dim] = getStrideOfGlobalMemoryTensor(tv, (int64_t)dim); + cur_contig_stride = strides[dim]; + } + } + + // Filter out non-allocated domains. This is already done for Local + // and Shared tensors with no set allocation domains, but not for + // the other cases. 
For example, a reduction output tensor that is + // also a fusion output may still have reduction domains in their + // allocation domains, which aren't relevant for indexing + std::vector actual_allocation_domains; + std::vector actual_strides; + for (const auto i : c10::irange(allocation_domains.size())) { + auto allocation_domain = allocation_domains.at(i); + if (!mayRequireAllocation(tv, allocation_domain)) { + continue; + } + auto stride = strides.at(i); + NVF_ERROR(stride != nullptr); + actual_allocation_domains.push_back(allocation_domain); + actual_strides.push_back(stride); + } + + return {actual_allocation_domains, actual_strides}; +} + +// Similar to IndexCompute but adapted for the graph-based indexing +class IdGraphIndexCompute : public OptOutDispatch { + public: + IdGraphIndexCompute( + const ValGraph& traversal_graph, + const std::unordered_map& initial_index_map) + : traversal_graph_(traversal_graph), index_map_(initial_index_map) {} + + // Propagate the index map through a given expr of a specified + // direction. 
+ void propagate(const ExprGroup& expr_group, Direction direction) { + NVF_ERROR(!expr_group->empty()); + dispatch(expr_group->front()); + } + + const std::unordered_map indexMap() const { + return index_map_; + } + + private: + using OptOutDispatch::handle; + + void handle(Split* split) override; + + void handle(Merge* merge) override; + + bool isForward(Expr* expr) const; + + bool hasIndex(IterDomain* id) const { + return indexMap().find(toGroup(id)) != indexMap().end(); + } + + Val* getIndex(IterDomain* id) const { + auto it = index_map_.find(toGroup(id)); + NVF_ERROR(it != index_map_.end(), "Index not found: ", id->toString()); + return it->second; + } + + void setIndex(IterDomain* id, Val* idx) { + index_map_.emplace(toGroup(id), idx); + } + + const ValGroup& toGroup(IterDomain* id) const { + return traversal_graph_.toGroup(id); + } + + private: + const ValGraph& traversal_graph_; + std::unordered_map index_map_; +}; + +// TODO: Should use the explicit direction +bool IdGraphIndexCompute::isForward(Expr* expr) const { + bool ready = true; + for (const auto inp : ir_utils::filterByType(expr->inputs())) { + if (!hasIndex(inp)) { + ready = false; + break; + } + } + if (ready) { + return true; + } + + // Can just return false here. 
Just make sure the outputs are + // already processed + for (const auto out : ir_utils::filterByType(expr->outputs())) { + NVF_ERROR(hasIndex(out), "Output index not found: ", out->toString()); + } + + return false; +} + +void IdGraphIndexCompute::handle(Split* split) { + const bool is_forward = isForward(split); + + if (is_forward) { + auto in_idx = getIndex(split->in()); + auto inner_extent = split->inner()->extent(); + auto outer_idx = SimplifyingIrBuilder::divExpr(in_idx, inner_extent); + Val* inner_idx = nullptr; + inner_idx = SimplifyingIrBuilder::modExpr(in_idx, inner_extent); + setIndex(split->outer(), outer_idx); + setIndex(split->inner(), inner_idx); + } else { + auto outer_idx = getIndex(split->outer()); + auto inner_idx = getIndex(split->inner()); + auto inner_extent = split->inner()->extent(); + auto in_idx = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr(outer_idx, inner_extent), inner_idx); + setIndex(split->in(), in_idx); + } +} + +void IdGraphIndexCompute::handle(Merge* merge) { + const bool is_forward = isForward(merge); + + auto inner_ext = merge->inner()->extent(); + + if (is_forward) { + auto outer_idx = getIndex(merge->outer()); + auto inner_idx = getIndex(merge->inner()); + auto out_idx = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr(outer_idx, inner_ext), inner_idx); + setIndex(merge->out(), out_idx); + } else { + auto out_idx = getIndex(merge->out()); + auto outer_idx = SimplifyingIrBuilder::divExpr(out_idx, inner_ext); + setIndex(merge->outer(), outer_idx); + Val* inner_idx = SimplifyingIrBuilder::modExpr(out_idx, inner_ext); + setIndex(merge->inner(), inner_idx); + } +} + +} // namespace + +TensorIndexer::TensorIndexer(const IdModel& id_model) : id_model_(id_model) { + buildLoopIndexMap(); +} + +namespace { +ParallelType getParallelType(const ValGroup& loop_group) { + ParallelType common_pt = ParallelType::Serial; + for (const auto val : *loop_group) { + auto pt = val->as()->getParallelType(); + if 
(common_pt == pt || pt == ParallelType::Serial) { + continue; + } else if (common_pt == ParallelType::Serial) { + common_pt = pt; + } else { + // Inconsistent parallelization + NVF_ERROR( + false, + "Inconsistent parallelization detected. ", + "Known type: ", + common_pt, + "New type: ", + pt); + } + } + + return common_pt; +} +} // namespace + +void TensorIndexer::buildLoopIndexMap() { + if (id_model_.empty()) { + return; + } + + Fusion* fusion = id_model_.fusion(); + + for (auto expr : fusion->exprs()) { + if (!ir_utils::isTvOp(expr)) { + continue; + } + auto tv_output = ir_utils::getTvOutput(expr); + for (auto leaf_id : tv_output->getLeafDomain()) { + const ValGroup& loop_group = + id_model_.idGraph(IdMappingMode::LOOP).toGroup(leaf_id); + + if (loop_index_map_.find(loop_group) != loop_index_map_.end()) { + // Index already assigned + continue; + } + + Val* loop_index = nullptr; + + ParallelType ptype = getParallelType(loop_group); + if (isParallelTypeThread(ptype)) { + loop_index = NamedScalar::getParallelIndex(ptype); + } else { + loop_index = IrBuilder::create(DataType::Index); + } + + loop_index_map_[loop_group] = loop_index; + } + } +} + +Val* TensorIndexer::getLoopIndex(IterDomain* loop_id) const { + // loop_id must be a loop domain. + const auto& loop_group = + id_model_.idGraph(IdMappingMode::LOOP).toGroup(loop_id); + auto loop_index_map_it = loop_index_map_.find(loop_group); + NVF_ERROR( + loop_index_map_it != loop_index_map_.end(), + "No loop index found for ", + loop_id->toString()); + + Val* loop_index = loop_index_map_it->second; + return loop_index; +} + +std::unordered_map TensorIndexer::getInitialIndexMap( + const std::vector& loop_domains) const { + std::unordered_map initial_index_map; + + // For a given list of the loop domains, assign its corresponding + // index Val. 
+ for (IterDomain* loop_id : loop_domains) { + Val* loop_index = getLoopIndex(loop_id); + const auto& almost_exact_group = traversalGraph().toGroup(loop_id); + + initial_index_map.emplace(almost_exact_group, loop_index); + } + + return initial_index_map; +} + +Val* TensorIndexer::getLinearIndex(TensorView* tv, const Expr* expr) { + const auto [allocation_domains, strides] = + getAllocationDomains(tv, id_model_); + + const auto& index_info = computeIndex(expr, allocation_domains); + const auto& index_map = index_info.index_map; + + // Linearize the indices with strides. + // TODO: Contiguous indexing + Val* index = tv->fusion()->zeroVal(); + for (const auto i : c10::irange(allocation_domains.size())) { + // Traverse from innermost to outermost + IterDomain* allocation_domain = + allocation_domains.at(allocation_domains.size() - 1 - i); + + Val* stride = strides.at(allocation_domains.size() - 1 - i); + + auto idx_it = index_map.find(traversalGraph().toGroup(allocation_domain)); + NVF_ERROR( + idx_it != index_map.end(), + "Index not found for ", + allocation_domain->toString()); + Val* idx = idx_it->second; + index = SimplifyingIrBuilder::addExpr( + index, SimplifyingIrBuilder::mulExpr(idx, stride)); + } + + return index; +} + +std::vector TensorIndexer::getLoopDomains(const Expr* expr) const { + // Assume consumer-based indexing. 
Needs to be revisited for ops like
Propagate the initial indices of the loop domains to the allocation +// domains +// +// The indexing traversal is done on the AlmostExact graph augmented +// with the loop promotion map since both the loop and allocations +// domains may be promoted. +class TensorIndexer { + public: + TensorIndexer(const IdModel& id_model); + + // Get a linear index of a given tensor appearing in a given expr, either + // as a consumer or a producer. The predicate indexing will have a + // separate interface. + Val* getLinearIndex(TensorView* tv, const Expr* expr); + + private: + // The AlmostExact graph is used since size-1 splits and merges + // should not affect actual index exprs. + const ValGraph& traversalGraph() const { + return id_model_.idGraph(IdMappingMode::ALMOSTEXACT); + } + + // Build a map of loop groups to their index Vals. See the comment + // on loop_index_map_. + void buildLoopIndexMap(); + + // Get the index of a loop domain. + Val* getLoopIndex(IterDomain* loop_id) const; + + // Propagate the loop indices of a given list of loop domains to the + // traversal graph (i.e., the AlmostExact graph). Uses the loop + // index map, which is built for the Loop graph. + std::unordered_map getInitialIndexMap( + const std::vector& loop_domains) const; + + // Get the loop domains of a given expr. Currently, they're always + // the loop domains of a consumer tensor, but in the future this + // function may return the loop domains of a producer for + // producer-based indexing. + std::vector getLoopDomains(const Expr* expr) const; + + // Returns the index map as well as its traversal path of given + // index domains appearing in a given expr. Used by + // getLinearIndex. + IndexingInfo computeIndex( + const Expr* expr, + const std::vector& index_domains) const; + + // Check if the loop index of a a loop group should be always + // just zero. For example, a loop group with an extent of one, i.e., + // a broadcast-only loop group, should just use zero. 
+ bool shouldUseZeroIndex(const ValGroup& loop_group) const; + + private: + const IdModel& id_model_; + + // Mappings from loop groups to their indices. Serial loops will + // be mapped a unique loop index Val. Parallel loops will be mapped + // to NamedScalar such as "threadIdx.x". This map needs to be built + // once and can be reused for different tensors. + std::unordered_map loop_index_map_; +}; + +} // namespace nvfuser diff --git a/csrc/id_model/utils.h b/csrc/id_model/utils.h new file mode 100644 index 00000000000..2d6327bf586 --- /dev/null +++ b/csrc/id_model/utils.h @@ -0,0 +1,55 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once + +#include + +#include +#include +#include + +#define VERBOSE() verbose(__LINE__) +#define WARN() warn(__LINE__) + +namespace nvfuser { + +// Temporary logging utility +class DebugStream { + public: + DebugStream() + : enabled_(getNvFuserEnv("ID_MODEL_VERBOSE")), out_(std::cerr) {} + + template + DebugStream& operator<<(const T& v) { + if (enabled_) { + out_ << v; + } + return *this; + } + + DebugStream& operator<<(std::ostream& (*endl)(std::ostream&)) { + if (enabled_) { + out_ << endl; + } + return *this; + } + + private: + bool enabled_ = false; + std::ostream& out_; +}; + +inline DebugStream verbose(int line) { + return DebugStream() << "[DEBUG@" << line << "] "; +} + +inline DebugStream warn(int line) { + return DebugStream() << "[WARN@" << line << "] "; +} + +} // namespace nvfuser diff --git a/csrc/ir/utils.h b/csrc/ir/utils.h index 14357bd4207..0fa65960edc 100644 --- a/csrc/ir/utils.h +++ b/csrc/ir/utils.h @@ -654,4 +654,40 @@ std::optional> computePermutation( bool hasTrivialAllocationDomain(const TensorView* tv); +// Returns true if memory_type is partitioned in parallel_type +inline bool isPartitionedMemory( + MemoryType memory_type, + ParallelType 
parallel_type) { + switch (memory_type) { + case MemoryType::Local: + return isParallelTypeThread(parallel_type) || + isParallelTypeDeviceDim(parallel_type); + case MemoryType::Shared: + return isParallelTypeBlockDim(parallel_type) || + isParallelTypeDeviceDim(parallel_type); + case MemoryType::Global: + return isParallelTypeDeviceDim(parallel_type); + default: + NVF_ERROR(false, "Unknown MemoryType: ", memory_type); + } +} + +// Returns true if memory_type is shared in parallel_type +inline bool isSharedMemory(MemoryType memory_type, ParallelType parallel_type) { + switch (memory_type) { + case MemoryType::Local: + // Nothing is shared if it's Local + return false; + case MemoryType::Shared: + // Only TID parallelized domains are shared if it's Shared + return isParallelTypeThreadDim(parallel_type); + case MemoryType::Global: + // Only TID and BID parallelized domains are shared if it's Global + return isParallelTypeThreadDim(parallel_type) || + isParallelTypeBlockDim(parallel_type); + default: + NVF_ERROR(false, "Unknown MemoryType: ", memory_type); + } +} + } // namespace nvfuser::ir_utils diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp new file mode 100644 index 00000000000..3750338d511 --- /dev/null +++ b/tests/cpp/test_indexing.cpp @@ -0,0 +1,384 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nvfuser { + +using IndexingTest = NVFuserTest; + +namespace { + +std::vector getLoopIndices(TensorView* tv, const TensorIndexer& indexer) { + std::vector loop_indices; + for (const auto& loop_id : tv->getLeafDomain()) { + loop_indices.push_back(indexer.getLoopIndex(loop_id)); + } + return loop_indices; +} + +} // namespace + +// Simple pointwise test with no parallelization +TEST_F(IndexingTest, SimplePointwise1) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + + auto tv1 = add(tv0, IrBuilder::create(1.0)); + auto tv2 = add(tv1, IrBuilder::create(1.0)); + fusion.addOutput(tv2); + + tv2->flatten(); + tv2->split(0, 4); + + TransformPropagator propagator(tv2); + MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator); + + tv1->inlineAt(1); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + std::vector tv2_loop_indices = getLoopIndices(tv2, indexer); + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + auto tv1_producer_index = indexer.getLinearIndex(tv1, tv2->definition()); + auto tv2_consumer_index = indexer.getLinearIndex(tv2, tv2->definition()); + + auto tv0_producer_index_ref = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + SimplifyingIrBuilder::modExpr( + SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + tv1_loop_indices.at(0), tv1->axis(1)->extent()), + tv1_loop_indices.at(1)), + tv1->getRFactorDomain().at(1)->extent()), + IrBuilder::getItemExpr( + IrBuilder::getAttrExpr( + IrBuilder::metadataExpr(tv0), "alloc_stride"), + (int64_t)1)), + SimplifyingIrBuilder::mulExpr( + 
SimplifyingIrBuilder::divExpr( + SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + tv1_loop_indices.at(0), tv1->axis(1)->extent()), + tv1_loop_indices.at(1)), + tv1->getRFactorDomain().at(1)->extent()), + IrBuilder::getItemExpr( + IrBuilder::getAttrExpr( + IrBuilder::metadataExpr(tv0), "alloc_stride"), + (int64_t)0))); + + auto tv1_consumer_index_ref = tv1_loop_indices.at(1); + auto tv1_producer_index_ref = tv2_loop_indices.at(1); + + auto tv2_consumer_index_ref = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::modExpr( + SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + tv2_loop_indices.at(0), tv2->axis(1)->extent()), + tv2_loop_indices.at(1)), + tv2->getRFactorDomain().at(1)->extent()), + SimplifyingIrBuilder::mulExpr( + SimplifyingIrBuilder::divExpr( + SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + tv2_loop_indices.at(0), tv2->axis(1)->extent()), + tv2_loop_indices.at(1)), + tv2->getRFactorDomain().at(1)->extent()), + tv2->getRFactorDomain().at(1)->extent())); + + EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) + << "Ref: " << tv0_producer_index_ref->toInlineString() + << ". Actual: " << tv0_producer_index->toInlineString(); + + EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)) + << "Ref: " << tv1_consumer_index_ref->toInlineString() + << ". Actual: " << tv1_consumer_index->toInlineString(); + + EXPECT_TRUE(tv1_producer_index->sameAs(tv1_producer_index_ref)) + << "Ref: " << tv1_producer_index_ref->toInlineString() + << ". Actual: " << tv1_producer_index->toInlineString(); + + EXPECT_TRUE(tv2_consumer_index->sameAs(tv2_consumer_index_ref)) + << "Ref: " << tv2_consumer_index_ref->toInlineString() + << ". 
Actual: " << tv2_consumer_index->toInlineString(); +} + +// Almost same fusion as SimplePointwiseSerial but TID and BID +// parallelizaiton with no inlining +TEST_F(IndexingTest, SimplePointwise2) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(2); + fusion.addInput(tv0); + + auto tv1 = add(tv0, IrBuilder::create(1.0)); + auto tv2 = add(tv1, IrBuilder::create(1.0)); + auto tv3 = add(tv2, IrBuilder::create(1.0)); + fusion.addOutput(tv3); + + tv3->flatten(); + tv3->split(0, 4); + + TransformPropagator propagator(tv3); + MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); + + tv3->axis(0)->parallelize(ParallelType::BIDx); + tv3->axis(1)->parallelize(ParallelType::TIDx); + + scheduler_utils::parallelizeAllLike(tv3, ir_utils::allTvs(&fusion)); + + // Test shared memory indexing + tv2->setMemoryType(MemoryType::Shared); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + // tv0 and tv3 are global tensors and should have the same index: + // "(blockIdx.x * 4 + threadIdx.x) % tv0->axis(1)->extent() + + // (blockIdx.x * 4 + threadIdx.x) / tv0->axis(1)->extent() * + // tv0->axis(1)->extent() + // + // tv1 is a Local tensor. Since it's fully parallelized, its index + // should be always zero + // + // tv2 is a Shared tensor. 
Only the TIDx parallelized domain should + // contribute to the index + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + auto tv1_producer_index = indexer.getLinearIndex(tv1, tv2->definition()); + auto tv2_consumer_index = indexer.getLinearIndex(tv2, tv2->definition()); + auto tv2_producer_index = indexer.getLinearIndex(tv2, tv3->definition()); + auto tv3_consumer_index = indexer.getLinearIndex(tv3, tv3->definition()); + + auto contig_idx = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + NamedScalar::getParallelIndex(ParallelType::BIDx), + tv2->axis(1)->extent()), + NamedScalar::getParallelIndex(ParallelType::TIDx)); + + auto global_ref = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::modExpr( + contig_idx, tv0->getRFactorDomain().at(1)->extent()), + SimplifyingIrBuilder::mulExpr( + SimplifyingIrBuilder::divExpr( + contig_idx, tv0->getRFactorDomain().at(1)->extent()), + tv0->getRFactorDomain().at(1)->extent())); + + auto shared_ref = NamedScalar::getParallelIndex(ParallelType::TIDx); + + EXPECT_TRUE(tv0_producer_index->sameAs(global_ref)) + << "Ref: " << global_ref->toInlineString() + << ". Actual: " << tv0_producer_index->toInlineString(); + + EXPECT_TRUE(tv1_consumer_index->isZeroInt()) + << "Actual: " << tv1_consumer_index->toInlineString(); + + EXPECT_TRUE(tv1_producer_index->isZeroInt()) + << "Actual: " << tv1_producer_index->toInlineString(); + + EXPECT_TRUE(tv2_producer_index->sameAs(shared_ref)) + << "Ref: " << shared_ref->toInlineString() + << ". Actual: " << tv2_producer_index->toInlineString(); + + EXPECT_TRUE(tv2_consumer_index->sameAs(shared_ref)) + << "Ref: " << shared_ref->toInlineString() + << ". Actual: " << tv2_consumer_index->toInlineString(); + + EXPECT_TRUE(tv3_consumer_index->sameAs(global_ref)) + << "Ref: " << global_ref->toInlineString() + << ". 
Actual: " << tv3_consumer_index->toInlineString(); +} + +// Simple reduction with no parallelization +TEST_F(IndexingTest, SimpleReduction) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(2); + fusion.addInput(tv0); + + auto tv1 = sum(tv0, {1}); + auto tv2 = add(tv1, IrBuilder::create(1.0)); + fusion.addOutput(tv2); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + std::vector tv2_loop_indices = getLoopIndices(tv2, indexer); + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + auto tv1_producer_index = indexer.getLinearIndex(tv1, tv2->definition()); + auto tv2_consumer_index = indexer.getLinearIndex(tv2, tv2->definition()); + + auto tv0_producer_index_ref = SimplifyingIrBuilder::addExpr( + tv1_loop_indices.at(1), + SimplifyingIrBuilder::mulExpr( + tv1_loop_indices.at(0), tv0->getRFactorDomain().at(1)->extent())); + + auto tv1_consumer_index_ref = tv1_loop_indices.at(0); + auto tv1_producer_index_ref = tv2_loop_indices.at(0); + auto tv2_consumer_index_ref = tv2_loop_indices.at(0); + + EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)); + EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)); + EXPECT_TRUE(tv1_producer_index->sameAs(tv1_producer_index_ref)); + EXPECT_TRUE(tv2_consumer_index->sameAs(tv2_consumer_index_ref)); +} + +// Fusion copied from AllocationDomainTest.TransposedIntermediate +TEST_F(IndexingTest, AllocationDomain) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigConcreteTensor({32, 32}); + fusion.addInput(tv0); + auto tv1 = set(tv0); + auto tv2 = set(tv1); + fusion.addOutput(tv2); + tv1->setMemoryType(MemoryType::Shared); + + tv1->axis(0)->parallelize(ParallelType::TIDx); + tv2->axis(0)->parallelize(ParallelType::TIDx); + + std::vector tv1_transposed = {tv1->axis(1), tv1->axis(0)}; + 
tv1->setAllocationDomain(tv1_transposed, true); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + auto tv1_producer_index = indexer.getLinearIndex(tv1, tv2->definition()); + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + std::vector tv2_loop_indices = getLoopIndices(tv2, indexer); + + // Note that the allocation domain is permuted + auto tv1_consumer_index_ref = SimplifyingIrBuilder::addExpr( + tv1_loop_indices.at(0), + SimplifyingIrBuilder::mulExpr( + tv1_loop_indices.at(1), tv1->getRFactorDomain().at(0)->extent())); + + auto tv1_producer_index_ref = SimplifyingIrBuilder::addExpr( + tv2_loop_indices.at(0), + SimplifyingIrBuilder::mulExpr( + tv2_loop_indices.at(1), tv1->getRFactorDomain().at(0)->extent())); + + EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)) + << "Ref: " << tv1_consumer_index_ref->toInlineString() + << ". Actual: " << tv1_consumer_index->toInlineString(); + + EXPECT_TRUE(tv1_producer_index->sameAs(tv1_producer_index_ref)) + << "Ref: " << tv1_producer_index_ref->toInlineString() + << ". 
Actual: " << tv1_producer_index->toInlineString(); +} + +TEST_F(IndexingTest, Reshape) { + Fusion fusion; + FusionGuard fg(&fusion); + + const std::vector shape1({100}); + const std::vector shape2({4, 25}); + const std::vector shape3({5, 2, 10}); + + // [i0] + auto tv0 = makeContigConcreteTensor(shape1); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + + // [i2, i3] + auto tv2 = reshape(tv1, shape1, shape2); + + // [i2, i3] + auto tv3 = add(tv2, fusion.oneVal()); + + // [i4, i5, i6] + auto tv4 = reshape(tv3, shape2, shape3); + + // [i4, i5, i6] + auto tv5 = add(tv4, fusion.oneVal()); + + fusion.addOutput(tv5); + + TransformPropagator propagator(tv5); + MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator); + + inlineMost(); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + // Validate tv0 indexing + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + + // It isn't straightforward to do structual checking as the other + // tests since there's no particular rule about which domain is used + // to provide the extent of the group. However, since everything + // should be deterministic, string match should also work. + std::string tv0_producer_index_ref = + "( ( ( ( ( i78 * ( ceilDiv(( 4 * 25 ), 5) ) ) + ( ( i79 * ( ceilDiv(( ceilDiv(( 4 * 25 ), 5) ), 2) ) ) + i80 ) ) / 25 ) * ( ceilDiv(100, 4) ) ) + ( ( ( i78 * ( ceilDiv(( 4 * 25 ), 5) ) ) + ( ( i79 * ( ceilDiv(( ceilDiv(( 4 * 25 ), 5) ), 2) ) ) + i80 ) ) % 25 ) )"; + + EXPECT_EQ(tv0_producer_index->toInlineString(), tv0_producer_index_ref); + + // All intermediate tensors should be fully inlined, so their + // indices should be just zero. 
+ EXPECT_TRUE(indexer.getLinearIndex(tv1, tv1->definition())->isZeroInt()); + EXPECT_TRUE(indexer.getLinearIndex(tv1, tv2->definition())->isZeroInt()); + EXPECT_TRUE(indexer.getLinearIndex(tv2, tv2->definition())->isZeroInt()); + EXPECT_TRUE(indexer.getLinearIndex(tv2, tv3->definition())->isZeroInt()); + EXPECT_TRUE(indexer.getLinearIndex(tv3, tv3->definition())->isZeroInt()); + EXPECT_TRUE(indexer.getLinearIndex(tv3, tv4->definition())->isZeroInt()); + EXPECT_TRUE(indexer.getLinearIndex(tv4, tv4->definition())->isZeroInt()); + EXPECT_TRUE(indexer.getLinearIndex(tv4, tv5->definition())->isZeroInt()); + + // tv5 has no transformation and is fully contiguous + std::vector tv5_loop_indices = getLoopIndices(tv5, indexer); + auto tv5_consumer_index = indexer.getLinearIndex(tv5, tv5->definition()); + + auto tv5_consumer_index_ref = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::addExpr( + tv5_loop_indices.at(2), + SimplifyingIrBuilder::mulExpr( + tv5_loop_indices.at(1), tv5->getRFactorDomain().at(2)->extent())), + SimplifyingIrBuilder::mulExpr( + tv5_loop_indices.at(0), + SimplifyingIrBuilder::mulExpr( + tv5->getRFactorDomain().at(1)->extent(), + tv5->getRFactorDomain().at(2)->extent()))); + + EXPECT_TRUE(tv5_consumer_index->sameAs(tv5_consumer_index_ref)) + << "Ref: " << tv5_consumer_index_ref->toInlineString() + << ". Actual: " << tv5_consumer_index->toInlineString(); +} + +} // namespace nvfuser From 096f63878003ca459dae61e00bbc8ac356e8ba54 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 00:21:05 -0700 Subject: [PATCH 05/22] Disable idmodel --- csrc/device_lower/lower2device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index 1a464417631..c50f160eb51 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -391,7 +391,7 @@ void GpuLower::analysis(Fusion* fusion) { // functionality should be affected. 
New IterDomains may be created, // so it is expected that generated code may use diffrent variable // names - if (true || isOptionEnabled(EnableOption::IdModel)) { + if (isOptionEnabled(EnableOption::IdModel)) { IdModel id_model(fusion_); } From 82d747f84ed0b88407c41a7611ef15bd1c6f0299 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 00:32:33 -0700 Subject: [PATCH 06/22] fix --- csrc/device_lower/utils.cpp | 22 ---------------------- csrc/device_lower/utils.h | 2 -- csrc/id_model/indexing.cpp | 2 +- csrc/id_model/indexing.h | 11 +++-------- tests/cpp/test_indexing.cpp | 28 ++++++++++++++-------------- 5 files changed, 18 insertions(+), 47 deletions(-) diff --git a/csrc/device_lower/utils.cpp b/csrc/device_lower/utils.cpp index bbf0526e461..955c1513dd3 100644 --- a/csrc/device_lower/utils.cpp +++ b/csrc/device_lower/utils.cpp @@ -920,28 +920,6 @@ std::array getMmaLayout(const MmaOp* expr) { return layout; } -// Returns true if expr is an expression that initializes a reduction -// buffer. -bool isReductionInitExpr(const Expr* expr) { - // False if its output isn't a TensorView - if (!ir_utils::isTvOp(expr)) { - return false; - } - // False if it doesn't have any reduction axis - const auto out_tv = ir_utils::getTvOutput(expr); - if (!out_tv->domain()->hasReduction()) { - return false; - } - // False if it has have TensorView inputs as initialization should - // never use TensorViews - const auto tv_filter_inp_view = - ir_utils::filterByType(expr->inputs()); - if (tv_filter_inp_view.begin() != tv_filter_inp_view.end()) { - return false; - } - return true; -} - } // namespace lower_utils } // namespace nvfuser diff --git a/csrc/device_lower/utils.h b/csrc/device_lower/utils.h index cad0ae5b996..60f3459814a 100644 --- a/csrc/device_lower/utils.h +++ b/csrc/device_lower/utils.h @@ -322,8 +322,6 @@ Val* getNumThreadsInTensorView(TensorView* tv); //! Get the unit dimensions of A and B for the given MmaOp. 
std::array getMmaLayout(const MmaOp* expr); -bool isReductionInitExpr(const Expr* expr); - } // namespace lower_utils } // namespace nvfuser diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index c80b6c3548d..b5282d3f034 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -93,7 +93,7 @@ std::tuple, std::vector> getAllocationDomains( // - Local/Shared: loop domains to the right of the CA position const auto inlining_pos = tv->getComputeAtPosition(); if (tv->getMemoryType() == MemoryType::Global) { - allocation_domains = tv->getRFactorDomain(); + allocation_domains = tv->getLogicalDomain(); contiguity = tv->domain()->contiguity(); } else { for (const auto i : c10::irange(tv->nDims())) { diff --git a/csrc/id_model/indexing.h b/csrc/id_model/indexing.h index 1dae562c9c7..3e392e8e521 100644 --- a/csrc/id_model/indexing.h +++ b/csrc/id_model/indexing.h @@ -49,6 +49,9 @@ class TensorIndexer { // separate interface. Val* getLinearIndex(TensorView* tv, const Expr* expr); + // Get the index of a loop domain. Intended to be used only for testing. + Val* getLoopIndex(IterDomain* loop_id) const; + private: // The AlmostExact graph is used since size-1 splits and merges // should not affect actual index exprs. @@ -60,9 +63,6 @@ class TensorIndexer { // on loop_index_map_. void buildLoopIndexMap(); - // Get the index of a loop domain. - Val* getLoopIndex(IterDomain* loop_id) const; - // Propagate the loop indices of a given list of loop domains to the // traversal graph (i.e., the AlmostExact graph). Uses the loop // index map, which is built for the Loop graph. @@ -82,11 +82,6 @@ class TensorIndexer { const Expr* expr, const std::vector& index_domains) const; - // Check if the loop index of a a loop group should be always - // just zero. For example, a loop group with an extent of one, i.e., - // a broadcast-only loop group, should just use zero. 
- bool shouldUseZeroIndex(const ValGroup& loop_group) const; - private: const IdModel& id_model_; diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index 3750338d511..45344c06886 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -76,7 +76,7 @@ TEST_F(IndexingTest, SimplePointwise1) { SimplifyingIrBuilder::mulExpr( tv1_loop_indices.at(0), tv1->axis(1)->extent()), tv1_loop_indices.at(1)), - tv1->getRFactorDomain().at(1)->extent()), + tv1->getLogicalDomain().at(1)->extent()), IrBuilder::getItemExpr( IrBuilder::getAttrExpr( IrBuilder::metadataExpr(tv0), "alloc_stride"), @@ -87,7 +87,7 @@ TEST_F(IndexingTest, SimplePointwise1) { SimplifyingIrBuilder::mulExpr( tv1_loop_indices.at(0), tv1->axis(1)->extent()), tv1_loop_indices.at(1)), - tv1->getRFactorDomain().at(1)->extent()), + tv1->getLogicalDomain().at(1)->extent()), IrBuilder::getItemExpr( IrBuilder::getAttrExpr( IrBuilder::metadataExpr(tv0), "alloc_stride"), @@ -102,15 +102,15 @@ TEST_F(IndexingTest, SimplePointwise1) { SimplifyingIrBuilder::mulExpr( tv2_loop_indices.at(0), tv2->axis(1)->extent()), tv2_loop_indices.at(1)), - tv2->getRFactorDomain().at(1)->extent()), + tv2->getLogicalDomain().at(1)->extent()), SimplifyingIrBuilder::mulExpr( SimplifyingIrBuilder::divExpr( SimplifyingIrBuilder::addExpr( SimplifyingIrBuilder::mulExpr( tv2_loop_indices.at(0), tv2->axis(1)->extent()), tv2_loop_indices.at(1)), - tv2->getRFactorDomain().at(1)->extent()), - tv2->getRFactorDomain().at(1)->extent())); + tv2->getLogicalDomain().at(1)->extent()), + tv2->getLogicalDomain().at(1)->extent())); EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) << "Ref: " << tv0_producer_index_ref->toInlineString() @@ -186,11 +186,11 @@ TEST_F(IndexingTest, SimplePointwise2) { auto global_ref = SimplifyingIrBuilder::addExpr( SimplifyingIrBuilder::modExpr( - contig_idx, tv0->getRFactorDomain().at(1)->extent()), + contig_idx, tv0->getLogicalDomain().at(1)->extent()), 
SimplifyingIrBuilder::mulExpr( SimplifyingIrBuilder::divExpr( - contig_idx, tv0->getRFactorDomain().at(1)->extent()), - tv0->getRFactorDomain().at(1)->extent())); + contig_idx, tv0->getLogicalDomain().at(1)->extent()), + tv0->getLogicalDomain().at(1)->extent())); auto shared_ref = NamedScalar::getParallelIndex(ParallelType::TIDx); @@ -243,7 +243,7 @@ TEST_F(IndexingTest, SimpleReduction) { auto tv0_producer_index_ref = SimplifyingIrBuilder::addExpr( tv1_loop_indices.at(1), SimplifyingIrBuilder::mulExpr( - tv1_loop_indices.at(0), tv0->getRFactorDomain().at(1)->extent())); + tv1_loop_indices.at(0), tv0->getLogicalDomain().at(1)->extent())); auto tv1_consumer_index_ref = tv1_loop_indices.at(0); auto tv1_producer_index_ref = tv2_loop_indices.at(0); @@ -286,12 +286,12 @@ TEST_F(IndexingTest, AllocationDomain) { auto tv1_consumer_index_ref = SimplifyingIrBuilder::addExpr( tv1_loop_indices.at(0), SimplifyingIrBuilder::mulExpr( - tv1_loop_indices.at(1), tv1->getRFactorDomain().at(0)->extent())); + tv1_loop_indices.at(1), tv1->getLogicalDomain().at(0)->extent())); auto tv1_producer_index_ref = SimplifyingIrBuilder::addExpr( tv2_loop_indices.at(0), SimplifyingIrBuilder::mulExpr( - tv2_loop_indices.at(1), tv1->getRFactorDomain().at(0)->extent())); + tv2_loop_indices.at(1), tv1->getLogicalDomain().at(0)->extent())); EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)) << "Ref: " << tv1_consumer_index_ref->toInlineString() @@ -369,12 +369,12 @@ TEST_F(IndexingTest, Reshape) { SimplifyingIrBuilder::addExpr( tv5_loop_indices.at(2), SimplifyingIrBuilder::mulExpr( - tv5_loop_indices.at(1), tv5->getRFactorDomain().at(2)->extent())), + tv5_loop_indices.at(1), tv5->getLogicalDomain().at(2)->extent())), SimplifyingIrBuilder::mulExpr( tv5_loop_indices.at(0), SimplifyingIrBuilder::mulExpr( - tv5->getRFactorDomain().at(1)->extent(), - tv5->getRFactorDomain().at(2)->extent()))); + tv5->getLogicalDomain().at(1)->extent(), + tv5->getLogicalDomain().at(2)->extent()))); 
EXPECT_TRUE(tv5_consumer_index->sameAs(tv5_consumer_index_ref)) << "Ref: " << tv5_consumer_index_ref->toInlineString() From c8dfdf7dd82067ad45ab43e09a473bbb549cb1b1 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 00:35:59 -0700 Subject: [PATCH 07/22] cleanup --- csrc/id_model/utils.h | 55 ------------------------------------------- 1 file changed, 55 deletions(-) delete mode 100644 csrc/id_model/utils.h diff --git a/csrc/id_model/utils.h b/csrc/id_model/utils.h deleted file mode 100644 index 2d6327bf586..00000000000 --- a/csrc/id_model/utils.h +++ /dev/null @@ -1,55 +0,0 @@ -// clang-format off -/* - * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. - * All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - */ -// clang-format on -#pragma once - -#include - -#include -#include -#include - -#define VERBOSE() verbose(__LINE__) -#define WARN() warn(__LINE__) - -namespace nvfuser { - -// Temporary logging utility -class DebugStream { - public: - DebugStream() - : enabled_(getNvFuserEnv("ID_MODEL_VERBOSE")), out_(std::cerr) {} - - template - DebugStream& operator<<(const T& v) { - if (enabled_) { - out_ << v; - } - return *this; - } - - DebugStream& operator<<(std::ostream& (*endl)(std::ostream&)) { - if (enabled_) { - out_ << endl; - } - return *this; - } - - private: - bool enabled_ = false; - std::ostream& out_; -}; - -inline DebugStream verbose(int line) { - return DebugStream() << "[DEBUG@" << line << "] "; -} - -inline DebugStream warn(int line) { - return DebugStream() << "[WARN@" << line << "] "; -} - -} // namespace nvfuser From efa09bd5db6a88021a56264965902f8d47463571 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 00:45:07 -0700 Subject: [PATCH 08/22] fix --- csrc/id_model/indexing.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index b5282d3f034..d42091cf2c2 100644 --- a/csrc/id_model/indexing.cpp +++ 
b/csrc/id_model/indexing.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include From 471dcd7d97d78399b166945b2fdc08080e8a5729 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 00:55:45 -0700 Subject: [PATCH 09/22] clang-tidy --- csrc/id_model/indexing.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index d42091cf2c2..7c7ec65a7f8 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -175,8 +175,9 @@ class IdGraphIndexCompute : public OptOutDispatch { public: IdGraphIndexCompute( const ValGraph& traversal_graph, - const std::unordered_map& initial_index_map) - : traversal_graph_(traversal_graph), index_map_(initial_index_map) {} + std::unordered_map initial_index_map) + : traversal_graph_(traversal_graph), + index_map_(std::move(initial_index_map)) {} // Propagate the index map through a given expr of a specified // direction. From 81937bbfcf3b83ee83cf84d2a9b0572973ae01ab Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 09:46:14 -0700 Subject: [PATCH 10/22] cleanup --- csrc/id_model/indexing.cpp | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index 7c7ec65a7f8..ed6a765c8cb 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -183,7 +183,11 @@ class IdGraphIndexCompute : public OptOutDispatch { // direction. 
void propagate(const ExprGroup& expr_group, Direction direction) { NVF_ERROR(!expr_group->empty()); + // This looks a little ugly but the dispatch interface doesn't + // have a way to pass arguments + current_direction_ = direction; dispatch(expr_group->front()); + current_direction_ = Direction::Undefined; } const std::unordered_map indexMap() const { @@ -220,28 +224,11 @@ class IdGraphIndexCompute : public OptOutDispatch { private: const ValGraph& traversal_graph_; std::unordered_map index_map_; + Direction current_direction_ = Direction::Undefined; }; -// TODO: Should use the explicit direction bool IdGraphIndexCompute::isForward(Expr* expr) const { - bool ready = true; - for (const auto inp : ir_utils::filterByType(expr->inputs())) { - if (!hasIndex(inp)) { - ready = false; - break; - } - } - if (ready) { - return true; - } - - // Can just return false here. Just make sure the outputs are - // already processed - for (const auto out : ir_utils::filterByType(expr->outputs())) { - NVF_ERROR(hasIndex(out), "Output index not found: ", out->toString()); - } - - return false; + return current_direction_ == Direction::Forward; } void IdGraphIndexCompute::handle(Split* split) { From eb2bc2b34da23a92552c08f79a3a177690954a6a Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 16:55:49 -0700 Subject: [PATCH 11/22] disable idmodel --- csrc/device_lower/lower2device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index 1a464417631..c50f160eb51 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -391,7 +391,7 @@ void GpuLower::analysis(Fusion* fusion) { // functionality should be affected. 
New IterDomains may be created, // so it is expected that generated code may use diffrent variable // names - if (true || isOptionEnabled(EnableOption::IdModel)) { + if (isOptionEnabled(EnableOption::IdModel)) { IdModel id_model(fusion_); } From 1097cd596054eba83eddc632407bd921fe88faf2 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 18:06:22 -0700 Subject: [PATCH 12/22] Add broadcast tests --- tests/cpp/test_indexing.cpp | 113 ++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index 33e3d561f52..6ffefa6a004 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -381,4 +381,117 @@ TEST_F(IndexingTest, Reshape) { << ". Actual: " << tv5_consumer_index->toInlineString(); } +// Simple non-concretized broadcast +TEST_F(IndexingTest, SimpleBroadcast1) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + + auto tv1 = broadcast(tv0, {false, true}); + auto tv2 = add(tv1, IrBuilder::create(1.0)); + fusion.addOutput(tv2); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + + EXPECT_TRUE(tv1_loop_indices.at(1)->isZeroInt()); + + std::vector tv2_loop_indices = getLoopIndices(tv2, indexer); + + EXPECT_TRUE(tv2_loop_indices.at(1)->isZeroInt()); + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + auto tv1_producer_index = indexer.getLinearIndex(tv1, tv2->definition()); + auto tv2_consumer_index = indexer.getLinearIndex(tv2, tv2->definition()); + + EXPECT_EQ(tv0_producer_index, tv1_loop_indices.at(0)); + EXPECT_EQ(tv1_consumer_index, tv1_loop_indices.at(0)); + EXPECT_EQ(tv1_producer_index, tv2_loop_indices.at(0)); + EXPECT_EQ(tv2_consumer_index, tv2_loop_indices.at(0)); +} + +// SimpleBroadcast1 + scheduling 
+TEST_F(IndexingTest, SimpleBroadcast2) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + + auto tv1 = broadcast(tv0, {false, true}); + auto tv2 = add(tv1, IrBuilder::create(1.0)); + fusion.addOutput(tv2); + + tv2->flatten(); + tv2->split(0, 4); + + TransformPropagator propagator(tv2); + MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + // The first merge of the logical domains should be a trivial merge, + // i.e., a merge with a extent-one domain. Thus, the indexing + // travesal should return "x + y * 4", where x and y are the loop + // indices, respecitvely. + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + std::vector tv2_loop_indices = getLoopIndices(tv2, indexer); + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + auto tv1_producer_index = indexer.getLinearIndex(tv1, tv2->definition()); + auto tv2_consumer_index = indexer.getLinearIndex(tv2, tv2->definition()); + + // tv0 is a global memory tensor, so the indexing is done with its + // allocation domain, which is mapped with the merge of the two + // logical domains of tv1 on the AlmostExact graph. Traverse back to + // the merge output from the loop domains. + auto tv0_producer_index_ref = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + tv1_loop_indices.at(0), tv1->axis(1)->extent()), + tv1_loop_indices.at(1)); + + // tv1 is a Local tensor, so its allocation domains are just their + // loop domains. This index is mathematically equivalent to the tv0 + // index, but the order of linearizing the two loop domains is + // different from the order of computing the merge input index. 
+ auto tv1_consumer_index_ref = SimplifyingIrBuilder::addExpr( + tv1_loop_indices.at(1), + SimplifyingIrBuilder::mulExpr( + tv1_loop_indices.at(0), tv1->axis(1)->extent())); + + auto tv1_producer_index_ref = SimplifyingIrBuilder::addExpr( + tv2_loop_indices.at(1), + SimplifyingIrBuilder::mulExpr( + tv2_loop_indices.at(0), tv2->axis(1)->extent())); + + auto tv2_consumer_index_ref = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + tv2_loop_indices.at(0), tv2->axis(1)->extent()), + tv2_loop_indices.at(1)); + + EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) + << "Ref: " << tv0_producer_index_ref->toInlineString() + << ". Actual: " << tv0_producer_index->toInlineString(); + + EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)) + << "Ref: " << tv1_consumer_index_ref->toInlineString() + << ". Actual: " << tv1_consumer_index->toInlineString(); + + EXPECT_TRUE(tv1_producer_index->sameAs(tv1_producer_index_ref)) + << "Ref: " << tv1_producer_index_ref->toInlineString() + << ". Actual: " << tv1_producer_index->toInlineString(); + + EXPECT_TRUE(tv2_consumer_index->sameAs(tv2_consumer_index_ref)) + << "Ref: " << tv2_consumer_index_ref->toInlineString() + << ". Actual: " << tv2_consumer_index->toInlineString(); +} + } // namespace nvfuser From d57487a5c7691ec3c61dfeaa161d979b11b1a9bd Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 19:11:14 -0700 Subject: [PATCH 13/22] tests --- tests/cpp/test_indexing.cpp | 120 ++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index 6ffefa6a004..cdac61db5de 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -21,6 +21,8 @@ #include #include +#include + namespace nvfuser { using IndexingTest = NVFuserTest; @@ -35,6 +37,26 @@ std::vector getLoopIndices(TensorView* tv, const TensorIndexer& indexer) { return loop_indices; } +template +Val* addExpr(Args&&... 
args) { + return SimplifyingIrBuilder::addExpr(std::forward(args)...); +} + +template +Val* mulExpr(Args&&... args) { + return SimplifyingIrBuilder::mulExpr(std::forward(args)...); +} + +template +Val* divExpr(Args&&... args) { + return SimplifyingIrBuilder::divExpr(std::forward(args)...); +} + +template +Val* modExpr(Args&&... args) { + return SimplifyingIrBuilder::modExpr(std::forward(args)...); +} + } // namespace // Simple pointwise test with no parallelization @@ -494,4 +516,102 @@ TEST_F(IndexingTest, SimpleBroadcast2) { << ". Actual: " << tv2_consumer_index->toInlineString(); } +// Concretized broadcast +TEST_F(IndexingTest, SimpleBroadcast3) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + auto tv1 = makeContigTensor(2); + fusion.addInput(tv1); + + auto tv2 = broadcast(tv0, {false, true}); + auto tv3 = add(tv2, tv1); + fusion.addOutput(tv3); + + tv3->flatten(); + + TransformPropagator propagator(tv3); + MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); + + inlineMost(); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::vector tv3_loop_indices = getLoopIndices(tv3, indexer); + + // Start with tv3 index as it's most straightforward + auto tv3_consumer_index = indexer.getLinearIndex(tv3, tv3->definition()); + auto tv3_consumer_index_ref = addExpr( + modExpr(tv3_loop_indices.at(0), tv3->getLogicalDomain().at(1)->extent()), + mulExpr( + divExpr( + tv3_loop_indices.at(0), tv3->getLogicalDomain().at(1)->extent()), + tv3->getLogicalDomain().at(1)->extent())); + + EXPECT_TRUE(tv3_consumer_index->sameAs(tv3_consumer_index_ref)) + << "Ref: " << tv3_consumer_index_ref->toInlineString() + << ". 
Actual: " << tv3_consumer_index->toInlineString(); + + // Since tv2 is fully inlined, its index should be just zero + auto tv2_consumer_index = indexer.getLinearIndex(tv2, tv2->definition()); + auto tv2_producer_index = indexer.getLinearIndex(tv2, tv3->definition()); + + EXPECT_TRUE(tv2_consumer_index->isZeroInt()); + EXPECT_TRUE(tv2_producer_index->isZeroInt()); + + // tv0 is a 1D pre-broadcast input tensor, so it only needs the + // index that corresponds to the outer dimension of the tv3 (or tv2) + // logical domains + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv2->definition()); + auto tv0_producer_index_ref = + divExpr(tv3_loop_indices.at(0), tv3->getLogicalDomain().at(1)->extent()); + + EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) + << "Ref: " << tv0_producer_index_ref->toInlineString() + << ". Actual: " << tv0_producer_index->toInlineString(); + + // tv1 should have the same index as tv3 + auto tv1_producer_index = indexer.getLinearIndex(tv1, tv3->definition()); + EXPECT_TRUE(tv1_producer_index->sameAs(tv3_consumer_index_ref)) + << "Ref: " << tv3_consumer_index_ref->toInlineString() + << ". Actual: " << tv1_producer_index->toInlineString(); +} + +// Concretized broadcast with partial inlining. Loop promotion is +// required. Same fusion as IdModelTest.LoopPromotion4. See also +// Example 1 of the Loop Promotion doc. 
+TEST_F(IndexingTest, SimpleBroadcast4) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigConcreteTensor({1, 4}); + fusion.addInput(tv0); + auto tv1 = makeContigConcreteTensor({3, 4}); + fusion.addInput(tv1); + + auto tv2 = set(tv0); + auto tv3 = set(tv1); + auto tv4 = add(tv2, tv3); + fusion.addOutput(tv4); + + // [i0, i1] + tv4->merge(0); + // [i0*i1] + tv4->split(0, 4, false); // outer split + // [4, i0*i1/4] + + TransformPropagator propagator(tv4); + MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator); + + for (auto tv : ir_utils::allTvs(&fusion)) { + tv->inlineAt(-2); + } + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); +} + } // namespace nvfuser From 98e5fd26989a9c335255994c9cbb51c5f7964006 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 19:11:34 -0700 Subject: [PATCH 14/22] error check --- csrc/id_model/indexing.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index fc105f1db3c..b49d239b3f9 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -498,6 +498,18 @@ std::unordered_map TensorIndexer::getInitialIndexMap( } Val* TensorIndexer::getLinearIndex(TensorView* tv, const Expr* expr) { + NVF_ERROR(tv != nullptr); + NVF_ERROR(expr != nullptr); + NVF_ERROR( + (std::find(expr->inputs().begin(), expr->inputs().end(), tv) != + expr->inputs().end()) || + (std::find(expr->outputs().begin(), expr->outputs().end(), tv) != + expr->outputs().end()), + "Inconsistent tensor and expr. 
Tensor, ", + tv->toString(), + " not found in ", + expr->toString()); + VERBOSE() << "getIndex of " << tv->toString() << " in " << expr->toString(); const auto [allocation_domains, strides] = From 19c509e0ffbb86541862d26a7f152175f3f32bee Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 20:39:31 -0700 Subject: [PATCH 15/22] loop promotion --- csrc/id_model/indexing.cpp | 14 ++++++++++++++ tests/cpp/test_indexing.cpp | 16 ++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index b49d239b3f9..54a8990011b 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -167,6 +167,20 @@ std::tuple, std::vector> getAllocationDomains( } } + // Loop promotion may affect allocations. Promotions of intermediate + // domains may not be defined correctly. Only consider loop domains + // for now. + for (auto& allocation_domain : allocation_domains) { + bool is_loop = std::find( + tv->getLeafDomain().begin(), + tv->getLeafDomain().end(), + allocation_domain) != tv->getLeafDomain().end(); + if (!is_loop) { + continue; + } + allocation_domain = getLoopPromotion(allocation_domain, id_model); + } + // Compute the strides from innermost to outermost domains std::vector strides(allocation_domains.size(), nullptr); Val* cur_contig_stride = tv->fusion()->oneVal(); diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index cdac61db5de..e50bdf9bd13 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -612,6 +612,22 @@ TEST_F(IndexingTest, SimpleBroadcast4) { IdModel id_model(&fusion); TensorIndexer indexer(id_model); + + // As discussed in the doc, the inner domain of tv2 is promoted to + // a domain with the same extent as the inner domain of tv4. Since + // tv2 is a Local tensor, its allocation domain is also promoted to + // the same domain. 
Thus, its consumer index is just the loop index + // of the inner loop of the tv2 loop domains, and its producer index + // is also just the inner loop index of the loop domains of tv4. + + std::vector tv2_loop_indices = getLoopIndices(tv2, indexer); + std::vector tv4_loop_indices = getLoopIndices(tv4, indexer); + + auto tv2_consumer_index = indexer.getLinearIndex(tv2, tv2->definition()); + auto tv2_producer_index = indexer.getLinearIndex(tv2, tv4->definition()); + + EXPECT_EQ(tv2_consumer_index, tv2_loop_indices.at(1)); + EXPECT_EQ(tv2_producer_index, tv4_loop_indices.at(1)); } } // namespace nvfuser From b9f9108ae458c138c246b9df8ddd6d35ae5cff5d Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 20:58:07 -0700 Subject: [PATCH 16/22] cleanup --- csrc/device_lower/utils.cpp | 22 ---------- csrc/device_lower/utils.h | 2 - csrc/id_model/indexing.cpp | 88 ++----------------------------------- 3 files changed, 4 insertions(+), 108 deletions(-) diff --git a/csrc/device_lower/utils.cpp b/csrc/device_lower/utils.cpp index bbf0526e461..955c1513dd3 100644 --- a/csrc/device_lower/utils.cpp +++ b/csrc/device_lower/utils.cpp @@ -920,28 +920,6 @@ std::array getMmaLayout(const MmaOp* expr) { return layout; } -// Returns true if expr is an expression that initializes a reduction -// buffer. 
-bool isReductionInitExpr(const Expr* expr) { - // False if its output isn't a TensorView - if (!ir_utils::isTvOp(expr)) { - return false; - } - // False if it doesn't have any reduction axis - const auto out_tv = ir_utils::getTvOutput(expr); - if (!out_tv->domain()->hasReduction()) { - return false; - } - // False if it has have TensorView inputs as initialization should - // never use TensorViews - const auto tv_filter_inp_view = - ir_utils::filterByType(expr->inputs()); - if (tv_filter_inp_view.begin() != tv_filter_inp_view.end()) { - return false; - } - return true; -} - } // namespace lower_utils } // namespace nvfuser diff --git a/csrc/device_lower/utils.h b/csrc/device_lower/utils.h index cad0ae5b996..60f3459814a 100644 --- a/csrc/device_lower/utils.h +++ b/csrc/device_lower/utils.h @@ -322,8 +322,6 @@ Val* getNumThreadsInTensorView(TensorView* tv); //! Get the unit dimensions of A and B for the given MmaOp. std::array getMmaLayout(const MmaOp* expr); -bool isReductionInitExpr(const Expr* expr); - } // namespace lower_utils } // namespace nvfuser diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index 54a8990011b..df1964e8810 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -88,13 +88,6 @@ Val* getStrideOfGlobalMemoryTensor(TensorView* tv, int64_t alloc_dim) { stride_dim); } -// Currently it's only Shared or Local but Global can be the case -// too. -bool isAllocationBasedOnLeaf(TensorView* tv) { - return tv->getMemoryType() == MemoryType::Shared || - tv->getMemoryType() == MemoryType::Local; -} - // Get the allocation domains of a given tensor. Also returns its // strides. 
// @@ -107,40 +100,17 @@ std::tuple, std::vector> getAllocationDomains( std::vector allocation_domains; std::vector> contiguity; - auto inlining_pos = tv->getComputeAtPosition(); - - bool use_set_allocatin_domain = false; - if (tv->hasAllocation()) { - if (tv->getMemoryType() == MemoryType::Shared || - tv->getMemoryType() == MemoryType::Local) { - if (std::is_permutation( - tv->getLeafDomain().begin(), - tv->getLeafDomain().end(), - tv->getAllocationDomain().begin())) { - use_set_allocatin_domain = true; - } - } else { - use_set_allocatin_domain = true; - } - } - - // Ignore allocation of non-global tensors for now - if (use_set_allocatin_domain) { allocation_domains = tv->getAllocationDomain(); - NVF_ERROR(!tv->isDoubleBuffered()); contiguity = tv->domain()->contiguity(); } else { // If allocation domain is not set, assume that: - // Local/Shared: leaf domains to the right of the CA position - // Global: rfactor domains + // Global: logical domains + // Local/Shared: loop domains to the right of the CA position + const auto inlining_pos = tv->getComputeAtPosition(); if (tv->getMemoryType() == MemoryType::Global) { - VERBOSE() << "Tv does not have allocation of " << tv->toString() << ", " - << toDelimitedString(tv->getMaybeAllocationDomain()) - << std::endl; allocation_domains = tv->getLogicalDomain(); contiguity = tv->domain()->contiguity(); - NVF_ERROR(!tv->isDoubleBuffered()); } else { for (const auto i : c10::irange(tv->nDims())) { auto loop_id = tv->getLeafDomain().at(i); @@ -295,9 +265,6 @@ bool IdGraphIndexCompute::isForward(Expr* expr) const { void IdGraphIndexCompute::handle(Split* split) { const bool is_forward = isForward(split); - VERBOSE() << "IdGraphIndexCompute handle (" << (is_forward ? 
"fwd" : "bwd") - << "): " << split->toString(); - if (is_forward) { auto in_idx = getIndex(split->in()); auto inner_extent = split->inner()->extent(); @@ -319,9 +286,6 @@ void IdGraphIndexCompute::handle(Split* split) { void IdGraphIndexCompute::handle(Merge* merge) { const bool is_forward = isForward(merge); - VERBOSE() << "IdGraphIndexCompute handle (" << (is_forward ? "fwd" : "bwd") - << "): " << merge->toString(); - auto inner_ext = merge->inner()->extent(); if (is_forward) { @@ -399,30 +363,9 @@ void TensorIndexer::buildLoopIndexMap() { } else if (shouldUseZeroIndex(loop_group)) { loop_index = fusion->zeroVal(); } else { - // Everything now should be serial concrete loops. For the mean - // time, just use the same index integer val generated for - // ComputeAtMap if available. - if (GpuLower::hasCurrent()) { - const auto& ca_map = GpuLower::current()->caMap(); - for (const auto& id : - ir_utils::filterByType(loop_group->vector())) { - if (!ca_map->getIdSets(IdMappingMode::LOOP).mappingExists(id)) { - continue; - } - loop_index = ca_map->getIndexVariable(id); - break; - } - NVF_ERROR( - loop_index != nullptr, - "No existing index found for ", - nvfuser::toString(loop_group)); - } else { - // Not reusing the ComputeATMap index assignments - loop_index = IrBuilder::create(DataType::Index); - } + loop_index = IrBuilder::create(DataType::Index); } - NVF_ERROR(loop_index != nullptr); loop_index_map_[loop_group] = loop_index; } } @@ -524,14 +467,9 @@ Val* TensorIndexer::getLinearIndex(TensorView* tv, const Expr* expr) { " not found in ", expr->toString()); - VERBOSE() << "getIndex of " << tv->toString() << " in " << expr->toString(); - const auto [allocation_domains, strides] = getAllocationDomains(tv, id_model_); - VERBOSE() << "Allocation domains: " << toDelimitedString(allocation_domains) - << std::endl; - const auto& index_info = computeIndex(expr, allocation_domains); const auto& index_map = index_info.index_map; @@ -551,9 +489,6 @@ Val* 
TensorIndexer::getLinearIndex(TensorView* tv, const Expr* expr) { "Index not found for ", allocation_domain->toString()); Val* idx = idx_it->second; - VERBOSE() << "Index of " << allocation_domain->toString() << ": " - << idx->toInlineString() << std::endl; - index = SimplifyingIrBuilder::addExpr( index, SimplifyingIrBuilder::mulExpr(idx, stride)); } @@ -570,17 +505,6 @@ std::vector TensorIndexer::getLoopDomains(const Expr* expr) const { // scatter auto loop_domains = ir_utils::getTvOutput(expr)->getLeafDomain(); - // If this is an expr initializing a buffer for a reduction, there - // should be no loops for reduction domains - if (lower_utils::isReductionInitExpr(expr)) { - loop_domains.erase( - std::remove_if( - loop_domains.begin(), - loop_domains.end(), - [](IterDomain* id) -> bool { return id->isReduction(); }), - loop_domains.end()); - } - for (auto& loop_id : loop_domains) { loop_id = getLoopPromotion(loop_id, id_model_); } @@ -592,10 +516,6 @@ IndexingInfo TensorIndexer::computeIndex( const Expr* expr, const std::vector& index_domains) const { const auto loop_domains = getLoopDomains(expr); - VERBOSE() << "Loop domains: " << toDelimitedString(loop_domains) << std::endl; - - VERBOSE() << "Index domains: " << toDelimitedString(index_domains) - << std::endl; const ValGroups loop_groups = traversalGraph().toGroups(loop_domains); const ValGroups index_groups = traversalGraph().toGroups(index_domains); From 1ae83b3d7ff58f7cb054887756e38f0311ac8f6a Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 20:58:37 -0700 Subject: [PATCH 17/22] cleanup --- csrc/id_model/utils.h | 55 ------------------------------------------- 1 file changed, 55 deletions(-) delete mode 100644 csrc/id_model/utils.h diff --git a/csrc/id_model/utils.h b/csrc/id_model/utils.h deleted file mode 100644 index 2d6327bf586..00000000000 --- a/csrc/id_model/utils.h +++ /dev/null @@ -1,55 +0,0 @@ -// clang-format off -/* - * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA 
CORPORATION & AFFILIATES. - * All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - */ -// clang-format on -#pragma once - -#include - -#include -#include -#include - -#define VERBOSE() verbose(__LINE__) -#define WARN() warn(__LINE__) - -namespace nvfuser { - -// Temporary logging utility -class DebugStream { - public: - DebugStream() - : enabled_(getNvFuserEnv("ID_MODEL_VERBOSE")), out_(std::cerr) {} - - template - DebugStream& operator<<(const T& v) { - if (enabled_) { - out_ << v; - } - return *this; - } - - DebugStream& operator<<(std::ostream& (*endl)(std::ostream&)) { - if (enabled_) { - out_ << endl; - } - return *this; - } - - private: - bool enabled_ = false; - std::ostream& out_; -}; - -inline DebugStream verbose(int line) { - return DebugStream() << "[DEBUG@" << line << "] "; -} - -inline DebugStream warn(int line) { - return DebugStream() << "[WARN@" << line << "] "; -} - -} // namespace nvfuser From 1d9a67047ee79286bc4a36a5a2dd16419f7e0a4f Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 20:59:05 -0700 Subject: [PATCH 18/22] cleanup --- csrc/id_model/indexing.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index df1964e8810..e6b3faee853 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include From 358e54bd7363fd5f618ea702a131e63c327e0906 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 21:06:07 -0700 Subject: [PATCH 19/22] cleanup --- csrc/id_model/indexing.cpp | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index e6b3faee853..9bf70f84e49 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -104,8 +104,8 @@ std::tuple, std::vector> getAllocationDomains( contiguity = tv->domain()->contiguity(); } else { // If allocation 
domain is not set, assume that: - // Global: logical domains - // Local/Shared: loop domains to the right of the CA position + // - Global: logical domains + // - Local/Shared: loop domains to the right of the CA position const auto inlining_pos = tv->getComputeAtPosition(); if (tv->getMemoryType() == MemoryType::Global) { allocation_domains = tv->getLogicalDomain(); @@ -371,27 +371,10 @@ void TensorIndexer::buildLoopIndexMap() { } bool TensorIndexer::shouldUseZeroIndex(const ValGroup& loop_group) const { - // For parallelized domains that have index NamedScalar's such as - // threadIdx.x, just use the NamedScalar. It doesn't automatically - // mean such parallel indices are actually used in the final index - // expr. For example, TID-parallelized Local tensors won't have - // TID-parallelized iter domains as allocation domains, so threadIdx - // won't appear in the final index expr. - ParallelType ptype = getParallelType(loop_group); - if (isParallelTypeThread(ptype)) { - return false; - } - - // Note that the device paralle type is not included in - // "isThread". This is necessary because we don't have a NamedScalar - // for DID. Since it's always partitioned in any memory space - // currently supported, it's guaranteed to be zero. - if (isParallelTypeDeviceDim(ptype)) { - return true; - } - // All loops in this set are non-parallel, non-concretized broadcast - // iterdomains, their "index variable" should be zero. + // iterdomains, their "index variable" should be zero. This + // condition should be included in the next triviality check, but + // just checking isBroadcast should be more efficient. 
if (std::all_of(loop_group->begin(), loop_group->end(), [](Val* val) { return val->as()->isBroadcast(); })) { @@ -492,8 +475,6 @@ Val* TensorIndexer::getLinearIndex(TensorView* tv, const Expr* expr) { index, SimplifyingIrBuilder::mulExpr(idx, stride)); } - VERBOSE() << "Final index: " << index->toInlineString() << std::endl; - return index; } From 699cd53021c136e5cf21e24bb11408f09ff662f2 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 21:07:29 -0700 Subject: [PATCH 20/22] cleanup --- tests/cpp/test_indexing.cpp | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index e50bdf9bd13..a1d2fe68219 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -475,28 +475,24 @@ TEST_F(IndexingTest, SimpleBroadcast2) { // allocation domain, which is mapped with the merge of the two // logical domains of tv1 on the AlmostExact graph. Traverse back to // the merge output from the loop domains. - auto tv0_producer_index_ref = SimplifyingIrBuilder::addExpr( - SimplifyingIrBuilder::mulExpr( - tv1_loop_indices.at(0), tv1->axis(1)->extent()), + auto tv0_producer_index_ref = addExpr( + mulExpr(tv1_loop_indices.at(0), tv1->axis(1)->extent()), tv1_loop_indices.at(1)); // tv1 is a Local tensor, so its allocation domains are just their // loop domains. This index is mathematically equivalent to the tv0 // index, but the order of linearizing the two loop domains is // different from the order of computing the merge input index. 
- auto tv1_consumer_index_ref = SimplifyingIrBuilder::addExpr( + auto tv1_consumer_index_ref = addExpr( tv1_loop_indices.at(1), - SimplifyingIrBuilder::mulExpr( - tv1_loop_indices.at(0), tv1->axis(1)->extent())); + mulExpr(tv1_loop_indices.at(0), tv1->axis(1)->extent())); - auto tv1_producer_index_ref = SimplifyingIrBuilder::addExpr( + auto tv1_producer_index_ref = addExpr( tv2_loop_indices.at(1), - SimplifyingIrBuilder::mulExpr( - tv2_loop_indices.at(0), tv2->axis(1)->extent())); + mulExpr(tv2_loop_indices.at(0), tv2->axis(1)->extent())); - auto tv2_consumer_index_ref = SimplifyingIrBuilder::addExpr( - SimplifyingIrBuilder::mulExpr( - tv2_loop_indices.at(0), tv2->axis(1)->extent()), + auto tv2_consumer_index_ref = addExpr( + mulExpr(tv2_loop_indices.at(0), tv2->axis(1)->extent()), tv2_loop_indices.at(1)); EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) From de055b3811a3c9a1af86b798fb216da5445fa1dc Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Thu, 6 Jun 2024 13:48:42 -0700 Subject: [PATCH 21/22] cleanup --- csrc/id_model/indexing.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index beff8c41918..ad58cfe9aad 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -266,7 +266,7 @@ void IdGraphIndexCompute::handle(Split* split) { const bool is_forward = isForward(split); auto inner_extent = split->inner()->extent(); - + if (is_forward) { auto in_idx = getIndex(split->in()); auto outer_idx = SimplifyingIrBuilder::divExpr(in_idx, inner_extent); @@ -344,7 +344,6 @@ void TensorIndexer::buildLoopIndexMap() { if (!ir_utils::isTvOp(expr)) { continue; } - // It's assumed that all sibling outputs share the same for-loops, // thus only one of the outputs is considered. 
auto tv_output = ir_utils::getTvOutput(expr); From 1c6ad9e107666819b07f3bbde7dc046e752e2b7a Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Thu, 6 Jun 2024 17:36:43 -0700 Subject: [PATCH 22/22] Support indexing of DIDx parallelized tensors --- csrc/id_model/indexing.cpp | 7 +- tests/cpp/test_indexing.cpp | 203 ++++++++++++++++++++++++++++++++++++ 2 files changed, 209 insertions(+), 1 deletion(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index ad58cfe9aad..ebfc12b5d5d 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -361,7 +361,12 @@ void TensorIndexer::buildLoopIndexMap() { ParallelType ptype = getParallelType(loop_group); if (isParallelTypeThread(ptype)) { loop_index = NamedScalar::getParallelIndex(ptype); - } else if (shouldUseZeroIndex(loop_group)) { + } else if ( + // TODO: Cleanup needed. ir_utils::isMemoryPartitionedAcross + // should be used, but that means we would need to consider + // multiple outputs with different memory types, though it + // should be uncommon in practice. + shouldUseZeroIndex(loop_group) || isParallelTypeDeviceDim(ptype)) { loop_index = fusion->zeroVal(); } else { loop_index = IrBuilder::create(DataType::Index); diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index a1d2fe68219..372daf6a677 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -626,4 +626,207 @@ TEST_F(IndexingTest, SimpleBroadcast4) { EXPECT_EQ(tv2_producer_index, tv4_loop_indices.at(1)); } +// Trivial example. 1D shared tensor. Each device only has one +// element, so the index should be always just zero. 
+TEST_F(IndexingTest, MultiDevice1DNoSplitMerge) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(1); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + fusion.addOutput(tv1); + + tv0->axis(0)->parallelize(ParallelType::DIDx); + tv1->axis(0)->parallelize(ParallelType::DIDx); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + EXPECT_TRUE(indexer.getLinearIndex(tv0, tv1->definition())->isZeroInt()); + EXPECT_TRUE(indexer.getLinearIndex(tv1, tv1->definition())->isZeroInt()); +} + +// Same fusion as MultiDevice1DNoSplitMerge but with split. +TEST_F(IndexingTest, MultiDevice1DSplit) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + fusion.addOutput(tv1); + + Val* num_devices = IrBuilder::create(DataType::Index); + + tv0->split(0, num_devices, false); + tv1->split(0, num_devices, false); + + tv0->axis(0)->parallelize(ParallelType::DIDx); + tv1->axis(0)->parallelize(ParallelType::DIDx); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + + auto tv0_producer_index_ref = tv1_loop_indices.at(1); + auto tv1_consumer_index_ref = tv1_loop_indices.at(1); + + EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) + << "Ref: " << tv0_producer_index_ref->toInlineString() + << ". Actual: " << tv0_producer_index->toInlineString(); + + EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)) + << "Ref: " << tv1_consumer_index_ref->toInlineString() + << ". 
Actual: " << tv1_consumer_index->toInlineString(); +} + +TEST_F(IndexingTest, MultiDevice2D) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(2); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + fusion.addOutput(tv1); + + Val* num_devices = IrBuilder::create(DataType::Index); + + tv1->flatten(); + tv1->split(0, num_devices, false); + + TransformPropagator propagator(tv1); + MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator); + + tv0->axis(0)->parallelize(ParallelType::DIDx); + tv1->axis(0)->parallelize(ParallelType::DIDx); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + + auto inner_dim = tv1->getLogicalDomain().at(1)->extent(); + + // Note that the allocation domain is the logical domain. See the + // next test for a leaf allocation example + auto tv0_producer_index_ref = addExpr( + modExpr(tv1_loop_indices.at(1), inner_dim), + mulExpr(divExpr(tv1_loop_indices.at(1), inner_dim), inner_dim)); + + // Should use the same index + auto tv1_consumer_index_ref = tv0_producer_index_ref; + + EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) + << "Ref: " << tv0_producer_index_ref->toInlineString() + << ". Actual: " << tv0_producer_index->toInlineString(); + + EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)) + << "Ref: " << tv1_consumer_index_ref->toInlineString() + << ". 
Actual: " << tv1_consumer_index->toInlineString(); +} + +// Same fusion as MultiDevice2D but with leaf allocation +TEST_F(IndexingTest, MultiDevice2DLeafAllocation) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(2); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + fusion.addOutput(tv1); + + Val* num_devices = IrBuilder::create(DataType::Index); + + tv1->flatten(); + tv1->split(0, num_devices, false); + + TransformPropagator propagator(tv1); + MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator); + + tv0->axis(0)->parallelize(ParallelType::DIDx); + tv1->axis(0)->parallelize(ParallelType::DIDx); + + tv0->setAllocationDomain(tv0->getLeafDomain(), true); + tv1->setAllocationDomain(tv1->getLeafDomain(), true); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + + // Since the leaf domain is the allocation domain, the index should + // be just the non-parallelized loop index + auto tv0_producer_index_ref = tv1_loop_indices.at(1); + + // Should use the same index + auto tv1_consumer_index_ref = tv0_producer_index_ref; + + EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) + << "Ref: " << tv0_producer_index_ref->toInlineString() + << ". Actual: " << tv0_producer_index->toInlineString(); + + EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)) + << "Ref: " << tv1_consumer_index_ref->toInlineString() + << ". 
Actual: " << tv1_consumer_index->toInlineString(); +} + +TEST_F(IndexingTest, MultiDevice2DTranspose) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(2); + fusion.addInput(tv0); + + auto tv1 = transpose(tv0); + fusion.addOutput(tv1); + + Val* num_devices = IrBuilder::create(DataType::Index); + + tv0->split(0, num_devices, false); + tv1->split(0, num_devices, false); + + tv0->axis(0)->parallelize(ParallelType::DIDx); + tv1->axis(0)->parallelize(ParallelType::DIDx); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + + auto tv0_producer_index_ref = addExpr( + tv1_loop_indices.at(1), + mulExpr(tv1_loop_indices.at(2), tv0->getLogicalDomain().at(1)->extent())); + + // Should use the same index + auto tv1_consumer_index_ref = addExpr( + tv1_loop_indices.at(2), + mulExpr(tv1_loop_indices.at(1), tv1->getLogicalDomain().at(1)->extent())); + + EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) + << "Ref: " << tv0_producer_index_ref->toInlineString() + << ". Actual: " << tv0_producer_index->toInlineString(); + + EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)) + << "Ref: " << tv1_consumer_index_ref->toInlineString() + << ". Actual: " << tv1_consumer_index->toInlineString(); +} + } // namespace nvfuser