From 5be2421d3afb5e3987073d64507c04d5d0195b9b Mon Sep 17 00:00:00 2001
From: Xiang Gao <qasdfgtyuiop@gmail.com>
Date: Mon, 22 May 2023 16:13:33 -0700
Subject: [PATCH 01/12] Allocation domain support in cacheFork

---
 csrc/tensor_view.cpp            |   3 +-
 test/test_allocation_domain.cpp | 164 ++++++++++++++++++++++++++++++++
 2 files changed, 166 insertions(+), 1 deletion(-)
diff --git a/csrc/tensor_view.cpp b/csrc/tensor_view.cpp
index 2c0992aa5ca..bfeaf9414c8 100644
--- a/csrc/tensor_view.cpp
+++ b/csrc/tensor_view.cpp
@@ -1281,7 +1281,8 @@ TensorView* TensorView::cacheFork() {
   fusion()->replaceOutput(this, new_output);
 
   // Transform new output according to this TV
-  auto replayed_output_pair = TransformReplay::replayCasP(new_output, this, -1);
+  auto replayed_output_pair = TransformReplay::replayCasP(
+      new_output, this, -1, TransformReplayOptions().replayAllocation());
   new_output->setDomain(replayed_output_pair.first);
 
   return new_output;
diff --git a/test/test_allocation_domain.cpp b/test/test_allocation_domain.cpp
index 838807f9540..355fc761506 100644
--- a/test/test_allocation_domain.cpp
+++ b/test/test_allocation_domain.cpp
@@ -888,4 +888,168 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter_CUDA) {
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
 
+// Similar to NHWC4d_To_NHWC4d_CUDA, but does a cacheFork
+TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(4);
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  fusion.addOutput(tv1);
+  auto tv2 = set(tv1);
+  fusion.addOutput(tv2);
+
+  std::vector<IterDomain*> tv0_nhwc = {
+      tv0->axis(0), tv0->axis(2), tv0->axis(3), tv0->axis(1)};
+  tv0->setAllocationDomain(tv0_nhwc, true);
+
+  std::vector<IterDomain*> tv1_nhwc = {
+      tv1->axis(0), tv1->axis(2), tv1->axis(3), tv1->axis(1)};
+  tv1->setAllocationDomain(tv1_nhwc, true);
+
+  std::vector<IterDomain*> tv2_nhwc = {
+      tv2->axis(0), tv2->axis(2), tv2->axis(3), tv2->axis(1)};
+  tv2->setAllocationDomain(tv2_nhwc, true);
+
+  auto tv3 = tv1->cacheFork();
+
+  std::vector<IterDomain*> expected_new_allocation_domain{
+      tv3->axis(0), tv3->axis(2), tv3->axis(3), tv3->axis(1)};
+
+  ASSERT_EQ(tv0->getAllocationDomain(), tv0_nhwc);
+  ASSERT_EQ(tv1->getAllocationDomain(), tv1_nhwc);
+  ASSERT_EQ(tv2->getAllocationDomain(), tv2_nhwc);
+  ASSERT_EQ(tv3->getAllocationDomain(), expected_new_allocation_domain);
+
+  for (auto tv : {tv1, tv2, tv3}) {
+    // [N, C, H, W]
+    tv->reorder({{1, -1}});
+    // [N, H, W, C]
+    tv->merge(0);
+    tv->merge(0);
+    tv->merge(0);
+    // [N*H*W*C]
+    tv->split(0, 4);
+    tv->axis(1)->parallelize(ParallelType::Vectorize);
+    tv->split(0, 128);
+    tv->axis(1)->parallelize(ParallelType::TIDx);
+    tv->axis(0)->parallelize(ParallelType::BIDx);
+    // [BIDx, TIDx, V]
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  int n = 31, h = 64, w = 103, c = 21;
+
+  at::Tensor t0_wrong_format = at::randn({n, c, h, w}, options);
+  at::Tensor t0 =
+      t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion_ptr.get(), {t0});
+
+  EXPECT_THAT(
+      [&]() { fe.runFusion({t0_wrong_format}); },
+      ::testing::ThrowsMessage<c10::Error>(
+          ::testing::HasSubstr("Stride mismatch with contiguity info")));
+
+  auto cg_outputs = fe.runFusion({t0});
+
+  ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast));
+
+  testValidate(&fusion, cg_outputs, {t0}, {t0, t0}, __LINE__, __FILE__);
+}
+
+// Similar to NHWC2d_To_NHWC2d_CUDA, but does a cacheFork
+TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  int n = 31, h = 64, w = 103, c = 21;
+
+  auto tv0 = makeContigConcreteTensor({n * h / 8, 8 * w * c});
+  fusion.addInput(tv0);
+
+  std::vector<IterDomain*> tv0_2d = {tv0->axis(0), tv0->axis(1)};
+  tv0->setAllocationDomain(tv0_2d, true);
+  tv0->merge(0);
+  tv0->split(0, c);
+  tv0->split(0, w);
+  tv0->split(0, h);
+  // [N, H, W, C]
+  tv0->reorder({{-1, 1}});
+  // [N, C, H, W]
+  tv0->commitLeafToRFactor();
+
+  auto tv1 = set(tv0);
+  fusion.addOutput(tv1);
+
+  auto tv2 = set(tv1);
+  fusion.addOutput(tv2);
+
+  std::vector<IterDomain*> tv1_nhwc = {
+      tv1->axis(0), tv1->axis(2), tv1->axis(3), tv1->axis(1)};
+  tv1->setAllocationDomain(tv1_nhwc, true);
+
+  for (auto tv : {tv1, tv2}) {
+    // [N, C, H, W]
+    tv->reorder({{1, -1}});
+    // [N, H, W, C]
+    tv->merge(0);
+    tv->merge(1);
+    tv->merge(0);
+    // [N*H*W*C]
+
+    tv->split(0, 4);
+    // [N*H*W*C/4, 4]
+  }
+
+  std::vector<IterDomain*> tv2_2d = {tv2->axis(0), tv2->axis(1)};
+  tv2->setAllocationDomain(tv2_2d, true);
+
+  auto tv3 = tv1->cacheFork();
+
+  std::vector<IterDomain*> expected_new_allocation_domain{
+      tv3->getMaybeRFactorDomain().at(0),
+      tv3->getMaybeRFactorDomain().at(2),
+      tv3->getMaybeRFactorDomain().at(3),
+      tv3->getMaybeRFactorDomain().at(1)};
+
+  ASSERT_EQ(tv0->getAllocationDomain(), tv0_2d);
+  ASSERT_EQ(tv1->getAllocationDomain(), tv1_nhwc);
+  ASSERT_EQ(tv2->getAllocationDomain(), tv2_2d);
+  ASSERT_EQ(tv3->getAllocationDomain(), expected_new_allocation_domain);
+
+  for (auto tv : {tv1, tv2, tv3}) {
+    tv->split(0, 128);
+    tv->axis(0)->parallelize(ParallelType::BIDx);
+    tv->axis(1)->parallelize(ParallelType::TIDx);
+    tv->axis(2)->parallelize(ParallelType::Vectorize);
+    // [BIDx, TIDx, V]
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0_wrong_format = at::randn({n, c, h, w}, options);
+  at::Tensor t0 =
+      t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion_ptr.get(), {t0});
+
+  EXPECT_THAT(
+      [&]() { fe.runFusion({t0_wrong_format}); },
+      ::testing::ThrowsMessage<c10::Error>(::testing::HasSubstr(
+          "splitting one dimension into discontiguous dimensions is not allowed in allocation domain")));
+
+  auto cg_outputs = fe.runFusion({t0});
+
+  ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast));
+
+  testValidate(&fusion, cg_outputs, {t0}, {t0, t0}, __LINE__, __FILE__);
+}
+
 } // namespace nvfuser

From 2010de6822f28d8c4677291053b483bec2dd9098 Mon Sep 17 00:00:00 2001
From: Xiang Gao <qasdfgtyuiop@gmail.com>
Date: Tue, 23 May 2023 09:43:44 -0700
Subject: [PATCH 02/12] TensorArgAbstract allocation size

---
 csrc/executor_kernel_arg.h | 37 +++++++++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/csrc/executor_kernel_arg.h b/csrc/executor_kernel_arg.h
index 9c22f6c6785..b8977a82926 100644
--- a/csrc/executor_kernel_arg.h
+++ b/csrc/executor_kernel_arg.h
@@ -199,9 +199,19 @@ struct TensorArgAbstract : ArgAbstract {
     return tensor_.size(i);
   }
 
-  virtual int64_t getStride(int64_t i) const {
+  virtual int64_t getAllocRank() const {
     TORCH_INTERNAL_ASSERT(
-        false, "The stride of an abstract tensor arg is not known.");
+        false, "The allocation rank of an abstract tensor arg is not known.");
+  }
+
+  virtual int64_t getAllocSize(int64_t i) const {
+    TORCH_INTERNAL_ASSERT(
+        false, "The allocation shape of an abstract tensor arg is not known.");
+  }
+
+  virtual int64_t getAllocStride(int64_t i) const {
+    TORCH_INTERNAL_ASSERT(
+        false, "The allocation stride of an abstract tensor arg is not known.");
   }
 
   size_t getPointerAddress() const {
@@ -274,6 +284,7 @@ inferAndValidateAllocationSizesAndStrides(
 template <typename TENSOR_TYPE>
 struct TensorArg : public TensorArgAbstract {
   TENSOR_TYPE instance_;
+  std::array<int64_t, TENSOR_TYPE::nAllocationDims()> alloc_sizes;
 
   TensorArg(const at::Tensor& tensor, TensorView* tv, ExpressionEvaluator& eval)
       : TensorArgAbstract(tensor) {
@@ -293,12 +304,21 @@ struct TensorArg : public TensorArgAbstract {
     TORCH_INTERNAL_ASSERT(
         (size_t)instance_.nAllocationDims() == sizes_strides.size());
     for (auto i : c10::irange((int64_t)sizes_strides.size())) {
+      alloc_sizes.at(i) = sizes_strides.at(i).first;
       using stride_t = typename TENSOR_TYPE::index_type;
       instance_.setStride(i, (stride_t)sizes_strides.at(i).second);
     }
   }
 
-  int64_t getStride(int64_t i) const override {
+  int64_t getAllocRank() const override {
+    return instance_.nAllocationDims();
+  }
+
+  int64_t getAllocSize(int64_t i) const override {
+    return alloc_sizes.at(i);
+  }
+
+  int64_t getAllocStride(int64_t i) const override {
     return instance_.getStride(i);
   }
 
@@ -323,10 +343,15 @@ struct TensorArg : public TensorArgAbstract {
   std::string toString() const override {
     std::stringstream ss;
     ss << TensorArgAbstract::toString();
-    ss << " stride: (";
-    for (auto i = 0; i < getRank(); i++) {
-      ss << getStride(i) << ", ";
+    ss << " allocation size: (";
+    for (auto i = 0; i < getAllocRank(); i++) {
+      ss << getAllocSize(i) << ", ";
+    }
+    ss << ") allocation stride: (";
+    for (auto i = 0; i < getAllocRank(); i++) {
+      ss << getAllocStride(i) << ", ";
     }
+    ss << ")";
     return ss.str();
   }
 

From f498f3f66cf9fd93c8b813dc9291b69d7ea62612 Mon Sep 17 00:00:00 2001
From: Xiang Gao <qasdfgtyuiop@gmail.com>
Date: Tue, 23 May 2023 09:45:54 -0700
Subject: [PATCH 03/12] registry

---
 csrc/scheduler/registry.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/csrc/scheduler/registry.cpp b/csrc/scheduler/registry.cpp
index 9b84d87bf9d..06266584b99 100644
--- a/csrc/scheduler/registry.cpp
+++ b/csrc/scheduler/registry.cpp
@@ -970,14 +970,14 @@ SchedulerRuntimeInfo::SchedulerRuntimeInfo(
       // find and push discontiguous stride
       auto dtype_size = dataTypeSize(tensor_arg_abstract->getDataType());
       input_discontig_strides_[fusion_inp] = {};
-      auto dims = tensor_arg_abstract->getRank();
+      auto dims = tensor_arg_abstract->getAllocRank();
       int64_t expected_stride = 1;
       for (auto dim = dims - 1; dim >= 0; dim--) {
-        auto size = tensor_arg_abstract->getSize((int)dim);
+        auto size = tensor_arg_abstract->getAllocSize((int)dim);
         if (size <= 1) {
           continue;
         }
-        auto stride = tensor_arg_abstract->getStride((int)dim);
+        auto stride = tensor_arg_abstract->getAllocStride((int)dim);
         if (stride != expected_stride) {
           input_discontig_strides_[fusion_inp].push_back(stride * dtype_size);
           expected_stride = stride;

From 5dd984488960f12239da39c9815891d2889f5372 Mon Sep 17 00:00:00 2001
From: Xiang Gao <qasdfgtyuiop@gmail.com>
Date: Tue, 23 May 2023 09:46:12 -0700
Subject: [PATCH 04/12] Revert "Allocation domain support in cacheFork"

This reverts commit 5be2421d3afb5e3987073d64507c04d5d0195b9b.
---
 csrc/tensor_view.cpp            |   3 +-
 test/test_allocation_domain.cpp | 164 --------------------------------
 2 files changed, 1 insertion(+), 166 deletions(-)

diff --git a/csrc/tensor_view.cpp b/csrc/tensor_view.cpp
index bfeaf9414c8..2c0992aa5ca 100644
--- a/csrc/tensor_view.cpp
+++ b/csrc/tensor_view.cpp
@@ -1281,8 +1281,7 @@ TensorView* TensorView::cacheFork() {
   fusion()->replaceOutput(this, new_output);
 
   // Transform new output according to this TV
-  auto replayed_output_pair = TransformReplay::replayCasP(
-      new_output, this, -1, TransformReplayOptions().replayAllocation());
+  auto replayed_output_pair = TransformReplay::replayCasP(new_output, this, -1);
   new_output->setDomain(replayed_output_pair.first);
 
   return new_output;
diff --git a/test/test_allocation_domain.cpp b/test/test_allocation_domain.cpp
index 355fc761506..838807f9540 100644
--- a/test/test_allocation_domain.cpp
+++ b/test/test_allocation_domain.cpp
@@ -888,168 +888,4 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter_CUDA) {
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
 }
 
-// Similar to NHWC4d_To_NHWC4d_CUDA, but does a cacheFork
-TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork_CUDA) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(4);
-  fusion.addInput(tv0);
-  auto tv1 = set(tv0);
-  fusion.addOutput(tv1);
-  auto tv2 = set(tv1);
-  fusion.addOutput(tv2);
-
-  std::vector<IterDomain*> tv0_nhwc = {
-      tv0->axis(0), tv0->axis(2), tv0->axis(3), tv0->axis(1)};
-  tv0->setAllocationDomain(tv0_nhwc, true);
-
-  std::vector<IterDomain*> tv1_nhwc = {
-      tv1->axis(0), tv1->axis(2), tv1->axis(3), tv1->axis(1)};
-  tv1->setAllocationDomain(tv1_nhwc, true);
-
-  std::vector<IterDomain*> tv2_nhwc = {
-      tv2->axis(0), tv2->axis(2), tv2->axis(3), tv2->axis(1)};
-  tv2->setAllocationDomain(tv2_nhwc, true);
-
-  auto tv3 = tv1->cacheFork();
-
-  std::vector<IterDomain*> expected_new_allocation_domain{
-      tv3->axis(0), tv3->axis(2), tv3->axis(3), tv3->axis(1)};
-
-  ASSERT_EQ(tv0->getAllocationDomain(), tv0_nhwc);
-  ASSERT_EQ(tv1->getAllocationDomain(), tv1_nhwc);
-  ASSERT_EQ(tv2->getAllocationDomain(), tv2_nhwc);
-  ASSERT_EQ(tv3->getAllocationDomain(), expected_new_allocation_domain);
-
-  for (auto tv : {tv1, tv2, tv3}) {
-    // [N, C, H, W]
-    tv->reorder({{1, -1}});
-    // [N, H, W, C]
-    tv->merge(0);
-    tv->merge(0);
-    tv->merge(0);
-    // [N*H*W*C]
-    tv->split(0, 4);
-    tv->axis(1)->parallelize(ParallelType::Vectorize);
-    tv->split(0, 128);
-    tv->axis(1)->parallelize(ParallelType::TIDx);
-    tv->axis(0)->parallelize(ParallelType::BIDx);
-    // [BIDx, TIDx, V]
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  int n = 31, h = 64, w = 103, c = 21;
-
-  at::Tensor t0_wrong_format = at::randn({n, c, h, w}, options);
-  at::Tensor t0 =
-      t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion_ptr.get(), {t0});
-
-  EXPECT_THAT(
-      [&]() { fe.runFusion({t0_wrong_format}); },
-      ::testing::ThrowsMessage<c10::Error>(
-          ::testing::HasSubstr("Stride mismatch with contiguity info")));
-
-  auto cg_outputs = fe.runFusion({t0});
-
-  ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast));
-
-  testValidate(&fusion, cg_outputs, {t0}, {t0, t0}, __LINE__, __FILE__);
-}
-
-// Similar to NHWC2d_To_NHWC2d_CUDA, but does a cacheFork
-TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork_CUDA) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  int n = 31, h = 64, w = 103, c = 21;
-
-  auto tv0 = makeContigConcreteTensor({n * h / 8, 8 * w * c});
-  fusion.addInput(tv0);
-
-  std::vector<IterDomain*> tv0_2d = {tv0->axis(0), tv0->axis(1)};
-  tv0->setAllocationDomain(tv0_2d, true);
-  tv0->merge(0);
-  tv0->split(0, c);
-  tv0->split(0, w);
-  tv0->split(0, h);
-  // [N, H, W, C]
-  tv0->reorder({{-1, 1}});
-  // [N, C, H, W]
-  tv0->commitLeafToRFactor();
-
-  auto tv1 = set(tv0);
-  fusion.addOutput(tv1);
-
-  auto tv2 = set(tv1);
-  fusion.addOutput(tv2);
-
-  std::vector<IterDomain*> tv1_nhwc = {
-      tv1->axis(0), tv1->axis(2), tv1->axis(3), tv1->axis(1)};
-  tv1->setAllocationDomain(tv1_nhwc, true);
-
-  for (auto tv : {tv1, tv2}) {
-    // [N, C, H, W]
-    tv->reorder({{1, -1}});
-    // [N, H, W, C]
-    tv->merge(0);
-    tv->merge(1);
-    tv->merge(0);
-    // [N*H*W*C]
-
-    tv->split(0, 4);
-    // [N*H*W*C/4, 4]
-  }
-
-  std::vector<IterDomain*> tv2_2d = {tv2->axis(0), tv2->axis(1)};
-  tv2->setAllocationDomain(tv2_2d, true);
-
-  auto tv3 = tv1->cacheFork();
-
-  std::vector<IterDomain*> expected_new_allocation_domain{
-      tv3->getMaybeRFactorDomain().at(0),
-      tv3->getMaybeRFactorDomain().at(2),
-      tv3->getMaybeRFactorDomain().at(3),
-      tv3->getMaybeRFactorDomain().at(1)};
-
-  ASSERT_EQ(tv0->getAllocationDomain(), tv0_2d);
-  ASSERT_EQ(tv1->getAllocationDomain(), tv1_nhwc);
-  ASSERT_EQ(tv2->getAllocationDomain(), tv2_2d);
-  ASSERT_EQ(tv3->getAllocationDomain(), expected_new_allocation_domain);
-
-  for (auto tv : {tv1, tv2, tv3}) {
-    tv->split(0, 128);
-    tv->axis(0)->parallelize(ParallelType::BIDx);
-    tv->axis(1)->parallelize(ParallelType::TIDx);
-    tv->axis(2)->parallelize(ParallelType::Vectorize);
-    // [BIDx, TIDx, V]
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0_wrong_format = at::randn({n, c, h, w}, options);
-  at::Tensor t0 =
-      t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion_ptr.get(), {t0});
-
-  EXPECT_THAT(
-      [&]() { fe.runFusion({t0_wrong_format}); },
-      ::testing::ThrowsMessage<c10::Error>(::testing::HasSubstr(
-          "splitting one dimension into discontiguous dimensions is not allowed in allocation domain")));
-
-  auto cg_outputs = fe.runFusion({t0});
-
-  ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast));
-
-  testValidate(&fusion, cg_outputs, {t0}, {t0, t0}, __LINE__, __FILE__);
-}
-
 } // namespace nvfuser

From 3340235ee651c0ed8870f01632b72ae16627c58a Mon Sep 17 00:00:00 2001
From: Xiang Gao <qasdfgtyuiop@gmail.com>
Date: Tue, 23 May 2023 10:16:29 -0700
Subject: [PATCH 05/12] cleanup vectorize

---
 csrc/scheduler/registry.cpp         | 96 -----------------------------
 csrc/scheduler/registry.h           |  6 --
 csrc/scheduler/transpose.cpp        |  4 +-
 csrc/scheduler/vectorize_helper.cpp | 81 +++++++-----------------
 csrc/scheduler/vectorize_helper.h   | 11 ----
 5 files changed, 24 insertions(+), 174 deletions(-)

diff --git a/csrc/scheduler/registry.cpp b/csrc/scheduler/registry.cpp
index 06266584b99..a35a171b6c6 100644
--- a/csrc/scheduler/registry.cpp
+++ b/csrc/scheduler/registry.cpp
@@ -1149,102 +1149,6 @@ size_t SchedulerRuntimeInfo::getMaxVectorizableWidth(TensorView* tv) {
   return vector_size;
 }
 
-// Gets the vectorizable width of the inner most dimension of tv if it's
-// contiguous. Ignores inner most dimensions that are broadcast or reduction.
-size_t SchedulerRuntimeInfo::getInnerDimVectorizableWidth(TensorView* tv) {
-  auto inner_vectorword_map_it_ = inner_vectorword_map_.find(tv);
-  if (inner_vectorword_map_it_ != inner_vectorword_map_.end()) {
-    return inner_vectorword_map_it_->second;
-  }
-
-  // If we don't have an record, either it is a tv with innermost broadcast,
-  // or it is an intermediate tensor allocated by fuser. Logic copied to get
-  // root according to scheduler_utils::innerMostRootDim.
-  auto tv_root = tv->hasReduction() && tv->hasRFactor()
-      ? tv->getRootDomain()
-      : tv->getMaybeRFactorDomain();
-
-  auto tv_root_no_reductions = TensorDomain::noReductions(tv_root);
-
-  auto contiguity = tv->domain()->contiguity();
-  // Appears after reductions the reduction domain often has a contiguity entry.
-  // This only matters if the result of the reduction is an output
-  if (contiguity.size() == tv_root.size() &&
-      contiguity.size() != tv_root_no_reductions.size()) {
-    std::vector<std::optional<bool>> new_contiguity;
-    for (auto i : c10::irange(tv_root.size())) {
-      if (!tv_root[i]->isReduction()) {
-        new_contiguity.push_back(contiguity[i]);
-      }
-    }
-    contiguity = new_contiguity;
-  }
-  tv_root = tv_root_no_reductions;
-
-  auto tv_root_no_reductions_size = tv_root_no_reductions.size();
-
-  // Filter out 0-dim tensors
-  if (tv_root_no_reductions_size < 1) {
-    return 1;
-  }
-
-  // Filter out mismatched contiguity info
-  if (tv_root_no_reductions_size != contiguity.size()) {
-    return 1;
-  }
-
-  auto inner_most_dim = scheduler_utils::innerMostRootDim(tv);
-
-  int id_pos = -1;
-  for (auto root_i : c10::irange((int)tv_root_no_reductions_size)) {
-    if (tv_root_no_reductions[root_i] == inner_most_dim) {
-      id_pos = root_i;
-      break;
-    }
-  }
-
-  // Something went wrong with finding the inner most dimension, just
-  // return 1.
-  if (id_pos == -1) {
-    return 1;
-  }
-
-  // If the inner most dimension is not contiguous return 1
-  auto contiguity_opt = contiguity.at(id_pos);
-  TORCH_INTERNAL_ASSERT(contiguity_opt.has_value());
-  if (!*contiguity_opt) {
-    return 1;
-  }
-
-  size_t item_size = dataTypeSize(tv->dtype(), getIndexType());
-
-  // Alignment should always at least be the data type size
-  TORCH_INTERNAL_ASSERT(getAlignmentSize(tv) % item_size == 0);
-  size_t max_vector_size = getAlignmentSize(tv) / item_size;
-
-  // Assuming intermediate tensors have friendly alignment, and
-  //  all contiguity true. Determine the largest power of 2 below
-  //  innermost dimension size for the word size of vectorizaiton
-  size_t vector_size = 1;
-  size_t next_vector_size = 2;
-  auto maybe_inner_dimension_size =
-      expression_evaluator_->evaluate(inner_most_dim->extent());
-  TORCH_INTERNAL_ASSERT(maybe_inner_dimension_size.has_value());
-  size_t inner_dimension_size = maybe_inner_dimension_size->as<int64_t>();
-
-  while (next_vector_size <= max_vector_size &&
-         next_vector_size <= inner_dimension_size &&
-         inner_dimension_size % next_vector_size == 0) {
-    vector_size = next_vector_size;
-    next_vector_size *= 2;
-  }
-
-  // save output to avoid re-compute
-  inner_vectorword_map_[tv] = vector_size;
-
-  return vector_size;
-}
-
 bool SchedulerEntry::sameAs(const SchedulerEntry* other) {
   return heuristic_ == other->heuristic_ && params_->sameAs(other->params_);
 }
diff --git a/csrc/scheduler/registry.h b/csrc/scheduler/registry.h
index 81ce5f4b997..80332f0cd65 100644
--- a/csrc/scheduler/registry.h
+++ b/csrc/scheduler/registry.h
@@ -71,10 +71,6 @@ class TORCH_CUDA_CU_API SchedulerRuntimeInfo : public NonCopyable {
   // contiguity. Ignores dimensions that are broadcast or reduction.
   size_t getMaxVectorizableWidth(TensorView* tv);
 
-  // Gets the vectorizable width of the inner most dimension of tv if it's
-  // contiguous. Ignores inner most dimensions that are broadcast or reduction.
-  size_t getInnerDimVectorizableWidth(TensorView* tv);
-
   // Computes alignment size in bytes for provided ptr address
   static size_t computeAlignmentSize(size_t ptr_address);
 
@@ -129,8 +125,6 @@ class TORCH_CUDA_CU_API SchedulerRuntimeInfo : public NonCopyable {
   std::unordered_map<TensorView*, size_t> alignment_map_;
   // Cache for getMaxVectorizableWidth
   std::unordered_map<TensorView*, size_t> max_vectorword_map_;
-  // Cache for getInnerDimVectorizableWidth
-  std::unordered_map<TensorView*, size_t> inner_vectorword_map_;
 
   // Found index mode kernel needs to be run in
   PrimDataType index_type_ = PrimDataType::Int;
diff --git a/csrc/scheduler/transpose.cpp b/csrc/scheduler/transpose.cpp
index 69175bbe002..d0e2f445bfe 100644
--- a/csrc/scheduler/transpose.cpp
+++ b/csrc/scheduler/transpose.cpp
@@ -700,7 +700,7 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
 
   for (auto tv : grouped_inputs_outputs[0]) {
     const auto tv_vectorize_factor =
-        runtime_info.getInnerDimVectorizableWidth(tv);
+        runtime_info.getMaxVectorizableWidth(tv);
     vectorize_factor1 = std::min(vectorize_factor1, tv_vectorize_factor);
   }
   // TODO: Since group2 only has global->shared and shared->global set op, we
@@ -709,7 +709,7 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
   // group 2
   for (auto tv : grouped_inputs_outputs[1]) {
     const auto tv_vectorize_factor =
-        runtime_info.getInnerDimVectorizableWidth(tv);
+        runtime_info.getMaxVectorizableWidth(tv);
     vectorize_factor2 = std::min(vectorize_factor2, tv_vectorize_factor);
   }
 
diff --git a/csrc/scheduler/vectorize_helper.cpp b/csrc/scheduler/vectorize_helper.cpp
index ca2fcf27b86..d1ada990770 100644
--- a/csrc/scheduler/vectorize_helper.cpp
+++ b/csrc/scheduler/vectorize_helper.cpp
@@ -1201,13 +1201,29 @@ int64_t getVectorizationSize(
   return vectorize_size;
 }
 
-size_t getExpandedVectorization(
-    const std::vector<ContiguousInnerDimensionsMapper>& reference_maps,
+size_t getVectorizationFactor(
     SchedulerRuntimeInfo& runtime_info,
-    const std::vector<TensorView*> vectorizable_inputs_outputs,
     TensorView* reference_tv,
-    int break_point,
-    size_t default_word_size) {
+    HeuristicSummary* data_cache,
+    int break_point) {
+  auto vectorizable_inputs_outputs_entry =
+      HeuristicSummaryEntry<HeuristicCompileTime::VectorizableInputsAndOutputs>(
+          data_cache, [&reference_tv]() {
+            return std::make_unique<std::vector<TensorView*>>(
+                scheduler_utils::getInputsOutputsWithInnerDim(
+                    reference_tv, true, true));
+          });
+
+  auto& vectorizable_inputs_outputs = vectorizable_inputs_outputs_entry.get();
+
+  auto vectorize_maps_entry =
+      HeuristicSummaryEntry<HeuristicCompileTime::VectorizeMaps>(
+          data_cache, [&reference_tv]() {
+            return std::make_unique<
+                std::vector<vectorize_helper::ContiguousInnerDimensionsMapper>>(
+                vectorize_helper::getAllVectorizedMapsOf(reference_tv));
+          });
+
   if (vectorizable_inputs_outputs.empty()) {
     return 1;
   }
@@ -1229,14 +1245,7 @@ size_t getExpandedVectorization(
         common_alignment_size, runtime_info.getAlignmentSize(inp_or_out));
   }
 
-  // If there's no possibility to increase vector size of provided tensors,
-  // then don't bother doing a more complex analysis to try and do so, just
-  // return early.
-  if (max_expand_size == default_word_size) {
-    return default_word_size;
-  }
-
-  auto reference_map = reference_maps[break_point];
+  auto reference_map = vectorize_maps_entry.get().at(break_point);
   // Initialize to max the tensors could support.
   size_t max_supported_vector_size = max_expand_size;
   for (auto inp_or_out : vectorizable_inputs_outputs) {
@@ -1257,51 +1266,5 @@ size_t getExpandedVectorization(
   return max_supported_vector_size;
 }
 
-size_t getVectorizationFactor(
-    SchedulerRuntimeInfo& runtime_info,
-    TensorView* reference_tv,
-    HeuristicSummary* data_cache,
-    int break_point) {
-  auto vectorizable_inputs_outputs_entry =
-      HeuristicSummaryEntry<HeuristicCompileTime::VectorizableInputsAndOutputs>(
-          data_cache, [&reference_tv]() {
-            return std::make_unique<std::vector<TensorView*>>(
-                scheduler_utils::getInputsOutputsWithInnerDim(
-                    reference_tv, true, true));
-          });
-
-  auto& vectorizable_inputs_outputs = vectorizable_inputs_outputs_entry.get();
-
-  size_t vectorize_factor = std::numeric_limits<size_t>::max();
-
-  for (auto tv : vectorizable_inputs_outputs) {
-    const auto tv_vectorize_factor =
-        runtime_info.getInnerDimVectorizableWidth(tv);
-    vectorize_factor = std::min(vectorize_factor, tv_vectorize_factor);
-  }
-
-  if (vectorize_factor == std::numeric_limits<size_t>::max()) {
-    vectorize_factor = 1;
-  }
-
-  auto vectorize_maps_entry =
-      HeuristicSummaryEntry<HeuristicCompileTime::VectorizeMaps>(
-          data_cache, [&reference_tv]() {
-            return std::make_unique<
-                std::vector<vectorize_helper::ContiguousInnerDimensionsMapper>>(
-                vectorize_helper::getAllVectorizedMapsOf(reference_tv));
-          });
-
-  vectorize_factor = vectorize_helper::getExpandedVectorization(
-      vectorize_maps_entry.get(),
-      runtime_info,
-      vectorizable_inputs_outputs,
-      reference_tv,
-      break_point,
-      vectorize_factor);
-
-  return vectorize_factor;
-}
-
 } // namespace vectorize_helper
 } // namespace nvfuser
diff --git a/csrc/scheduler/vectorize_helper.h b/csrc/scheduler/vectorize_helper.h
index 0c8b6536195..2204782fe85 100644
--- a/csrc/scheduler/vectorize_helper.h
+++ b/csrc/scheduler/vectorize_helper.h
@@ -594,17 +594,6 @@ std::vector<std::pair<ProjectedExtent&, IterDomain*>> getContigVectorSizesOf(
     TensorView* of_tv,
     ContiguousInnerDimensionsMapper& mapper);
 
-// TODO: vectorizable_inputs_outputs should actually be known based on the
-// computed mappings. If nothing is mapped for a tensorview it's not
-// vectorizable.
-size_t getExpandedVectorization(
-    const std::vector<ContiguousInnerDimensionsMapper>& reference_maps,
-    SchedulerRuntimeInfo& runtime_info,
-    const std::vector<TensorView*> vectorizable_inputs_outputs,
-    TensorView* reference_tv,
-    int break_point,
-    size_t default_word_size);
-
 size_t getVectorizationFactor(
     SchedulerRuntimeInfo& runtime_info,
     TensorView* reference_tv,

From dd83215414be1c5ff0d212d3383e8f6f502cfae6 Mon Sep 17 00:00:00 2001
From: Xiang Gao <qasdfgtyuiop@gmail.com>
Date: Tue, 23 May 2023 10:17:53 -0700
Subject: [PATCH 06/12] Revert "registry"

This reverts commit f498f3f66cf9fd93c8b813dc9291b69d7ea62612.
---
 csrc/scheduler/registry.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/csrc/scheduler/registry.cpp b/csrc/scheduler/registry.cpp
index a35a171b6c6..e9145258230 100644
--- a/csrc/scheduler/registry.cpp
+++ b/csrc/scheduler/registry.cpp
@@ -970,14 +970,14 @@ SchedulerRuntimeInfo::SchedulerRuntimeInfo(
       // find and push discontiguous stride
       auto dtype_size = dataTypeSize(tensor_arg_abstract->getDataType());
       input_discontig_strides_[fusion_inp] = {};
-      auto dims = tensor_arg_abstract->getAllocRank();
+      auto dims = tensor_arg_abstract->getRank();
       int64_t expected_stride = 1;
       for (auto dim = dims - 1; dim >= 0; dim--) {
-        auto size = tensor_arg_abstract->getAllocSize((int)dim);
+        auto size = tensor_arg_abstract->getSize((int)dim);
         if (size <= 1) {
           continue;
         }
-        auto stride = tensor_arg_abstract->getAllocStride((int)dim);
+        auto stride = tensor_arg_abstract->getStride((int)dim);
         if (stride != expected_stride) {
           input_discontig_strides_[fusion_inp].push_back(stride * dtype_size);
           expected_stride = stride;

From 91bc0c1aa55ad3515868bab6ecef5207eac06172 Mon Sep 17 00:00:00 2001
From: Xiang Gao <qasdfgtyuiop@gmail.com>
Date: Tue, 23 May 2023 10:19:19 -0700
Subject: [PATCH 07/12] Revert "TensorArgAbstract allocation size"

This reverts commit 2010de6822f28d8c4677291053b483bec2dd9098.
---
 csrc/executor_kernel_arg.h | 37 ++++++-------------------------------
 1 file changed, 6 insertions(+), 31 deletions(-)

diff --git a/csrc/executor_kernel_arg.h b/csrc/executor_kernel_arg.h
index b8977a82926..9c22f6c6785 100644
--- a/csrc/executor_kernel_arg.h
+++ b/csrc/executor_kernel_arg.h
@@ -199,19 +199,9 @@ struct TensorArgAbstract : ArgAbstract {
     return tensor_.size(i);
   }
 
-  virtual int64_t getAllocRank() const {
+  virtual int64_t getStride(int64_t i) const {
     TORCH_INTERNAL_ASSERT(
-        false, "The allocation rank of an abstract tensor arg is not known.");
-  }
-
-  virtual int64_t getAllocSize(int64_t i) const {
-    TORCH_INTERNAL_ASSERT(
-        false, "The allocation shape of an abstract tensor arg is not known.");
-  }
-
-  virtual int64_t getAllocStride(int64_t i) const {
-    TORCH_INTERNAL_ASSERT(
-        false, "The allocation stride of an abstract tensor arg is not known.");
+        false, "The stride of an abstract tensor arg is not known.");
   }
 
   size_t getPointerAddress() const {
@@ -284,7 +274,6 @@ inferAndValidateAllocationSizesAndStrides(
 template <typename TENSOR_TYPE>
 struct TensorArg : public TensorArgAbstract {
   TENSOR_TYPE instance_;
-  std::array<int64_t, TENSOR_TYPE::nAllocationDims()> alloc_sizes;
 
   TensorArg(const at::Tensor& tensor, TensorView* tv, ExpressionEvaluator& eval)
       : TensorArgAbstract(tensor) {
@@ -304,21 +293,12 @@ struct TensorArg : public TensorArgAbstract {
     TORCH_INTERNAL_ASSERT(
         (size_t)instance_.nAllocationDims() == sizes_strides.size());
     for (auto i : c10::irange((int64_t)sizes_strides.size())) {
-      alloc_sizes.at(i) = sizes_strides.at(i).first;
       using stride_t = typename TENSOR_TYPE::index_type;
       instance_.setStride(i, (stride_t)sizes_strides.at(i).second);
     }
   }
 
-  int64_t getAllocRank() const override {
-    return instance_.nAllocationDims();
-  }
-
-  int64_t getAllocSize(int64_t i) const override {
-    return alloc_sizes.at(i);
-  }
-
-  int64_t getAllocStride(int64_t i) const override {
+  int64_t getStride(int64_t i) const override {
     return instance_.getStride(i);
   }
 
@@ -343,15 +323,10 @@ struct TensorArg : public TensorArgAbstract {
   std::string toString() const override {
     std::stringstream ss;
     ss << TensorArgAbstract::toString();
-    ss << " allocation size: (";
-    for (auto i = 0; i < getAllocRank(); i++) {
-      ss << getAllocSize(i) << ", ";
-    }
-    ss << ") allocation stride: (";
-    for (auto i = 0; i < getAllocRank(); i++) {
-      ss << getAllocStride(i) << ", ";
+    ss << " stride: (";
+    for (auto i = 0; i < getRank(); i++) {
+      ss << getStride(i) << ", ";
     }
-    ss << ")";
     return ss.str();
   }
 

From 6fcbbbcc3749281183c2801230573f7f0261ba48 Mon Sep 17 00:00:00 2001
From: Xiang Gao <qasdfgtyuiop@gmail.com>
Date: Tue, 23 May 2023 10:23:09 -0700
Subject: [PATCH 08/12] format

---
 csrc/scheduler/transpose.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/csrc/scheduler/transpose.cpp b/csrc/scheduler/transpose.cpp
index d0e2f445bfe..5d30bf81c49 100644
--- a/csrc/scheduler/transpose.cpp
+++ b/csrc/scheduler/transpose.cpp
@@ -699,8 +699,7 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
   size_t vectorize_factor2 = max_unroll_factor;
 
   for (auto tv : grouped_inputs_outputs[0]) {
-    const auto tv_vectorize_factor =
-        runtime_info.getMaxVectorizableWidth(tv);
+    const auto tv_vectorize_factor = runtime_info.getMaxVectorizableWidth(tv);
     vectorize_factor1 = std::min(vectorize_factor1, tv_vectorize_factor);
   }
   // TODO: Since group2 only has global->shared and shared->global set op, we
@@ -708,8 +707,7 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
   // We should not be using a single global vectorize factor for the entire
   // group 2
   for (auto tv : grouped_inputs_outputs[1]) {
-    const auto tv_vectorize_factor =
-        runtime_info.getMaxVectorizableWidth(tv);
+    const auto tv_vectorize_factor = runtime_info.getMaxVectorizableWidth(tv);
     vectorize_factor2 = std::min(vectorize_factor2, tv_vectorize_factor);
   }
 

From 3b928298868d5aec1d20ee46b23f9034ccc46375 Mon Sep 17 00:00:00 2001
From: Xiang Gao <qasdfgtyuiop@gmail.com>
Date: Wed, 24 May 2023 11:16:23 -0700
Subject: [PATCH 09/12] transpose fix

---
 csrc/scheduler/registry.cpp  | 8 +++++++-
 csrc/scheduler/registry.h    | 2 +-
 csrc/scheduler/transpose.cpp | 6 ++++--
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/csrc/scheduler/registry.cpp b/csrc/scheduler/registry.cpp
index 711e8911fc0..4bbea966fbf 100644
--- a/csrc/scheduler/registry.cpp
+++ b/csrc/scheduler/registry.cpp
@@ -1051,7 +1051,9 @@ size_t SchedulerRuntimeInfo::getAlignmentSize(TensorView* tv) {
 // Gets maximum vectorizable width of tv, assumes we can merge across all
 // iteration domains if contiguous. Cannot permute the dimensions to fix
 // contiguity.
-size_t SchedulerRuntimeInfo::getMaxVectorizableWidth(TensorView* tv) {
+size_t SchedulerRuntimeInfo::getMaxVectorizableWidth(
+    TensorView* tv,
+    bool contig_merge) {
   // Gets the vectorizable width of the tv starting from the inner most
   // dimension, working its way towards the outer most dimension, if they're
   // contiguous. Ignores broadcast and reduction domains.
@@ -1130,6 +1132,10 @@ size_t SchedulerRuntimeInfo::getMaxVectorizableWidth(TensorView* tv) {
 
     // Still contiguous
     numel *= dim_size->as<int64_t>();
+
+    if (!contig_merge) {
+      break;
+    }
   }
 
   // Assuming intermediate tensors have friendly alignment, and
diff --git a/csrc/scheduler/registry.h b/csrc/scheduler/registry.h
index 80332f0cd65..1763e87f1a7 100644
--- a/csrc/scheduler/registry.h
+++ b/csrc/scheduler/registry.h
@@ -69,7 +69,7 @@ class TORCH_CUDA_CU_API SchedulerRuntimeInfo : public NonCopyable {
   // Gets maximum vectorizable width of tv, assumes we can merge across all
   // iteration domains if contiguous. Cannot permute the dimensions to fix
   // contiguity. Ignores dimensions that are broadcast or reduction.
-  size_t getMaxVectorizableWidth(TensorView* tv);
+  size_t getMaxVectorizableWidth(TensorView* tv, bool contig_merge = true);
 
   // Computes alignment size in bytes for provided ptr address
   static size_t computeAlignmentSize(size_t ptr_address);
diff --git a/csrc/scheduler/transpose.cpp b/csrc/scheduler/transpose.cpp
index 5d30bf81c49..9c10f9865ec 100644
--- a/csrc/scheduler/transpose.cpp
+++ b/csrc/scheduler/transpose.cpp
@@ -699,7 +699,8 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
   size_t vectorize_factor2 = max_unroll_factor;
 
   for (auto tv : grouped_inputs_outputs[0]) {
-    const auto tv_vectorize_factor = runtime_info.getMaxVectorizableWidth(tv);
+    const auto tv_vectorize_factor =
+        runtime_info.getMaxVectorizableWidth(tv, false);
     vectorize_factor1 = std::min(vectorize_factor1, tv_vectorize_factor);
   }
   // TODO: Since group2 only has global->shared and shared->global set op, we
@@ -707,7 +708,8 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
   // We should not be using a single global vectorize factor for the entire
   // group 2
   for (auto tv : grouped_inputs_outputs[1]) {
-    const auto tv_vectorize_factor = runtime_info.getMaxVectorizableWidth(tv);
+    const auto tv_vectorize_factor =
+        runtime_info.getMaxVectorizableWidth(tv, false);
     vectorize_factor2 = std::min(vectorize_factor2, tv_vectorize_factor);
   }
 

From b740de247a3cb176321c27108fa5d28c3fdd2f92 Mon Sep 17 00:00:00 2001
From: Xiang Gao <qasdfgtyuiop@gmail.com>
Date: Wed, 24 May 2023 11:21:51 -0700
Subject: [PATCH 10/12] fix test

---
 test/test_gpu1.cpp | 12 ++++++------
 test/test_gpu2.cpp | 30 +++++++++++++++---------------
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/test/test_gpu1.cpp b/test/test_gpu1.cpp
index c809148c6c0..342d8b5aaad 100644
--- a/test/test_gpu1.cpp
+++ b/test/test_gpu1.cpp
@@ -1201,17 +1201,17 @@ TEST_F(NVFuserTest, FusionParser_CUDA) {
   // 2. use a fuzzy compare (ignore non-significant whitespaces for example)
   const std::string expected_kernel = R"(
 __global__ void CUDAGeneratedKernel(Tensor<float, 1, 1> T0, Tensor<float, 1, 1> T1, Tensor<float, 1, 1> T3) {
-  int64_t i86;
-  i86 = ((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x));
-  if ((i86 < T0.size[0])) {
+  int64_t i87;
+  i87 = ((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x));
+  if ((i87 < T0.size[0])) {
     float T5[1];
     T5[0] = 0;
     T5[0]
-       = T1[i86];
+       = T1[i87];
     float T4[1];
     T4[0] = 0;
     T4[0]
-       = T0[i86];
+       = T0[i87];
     float T2[1];
     T2[0]
       = T4[0]
@@ -1220,7 +1220,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 1, 1> T0, Tensor<float, 1, 1>
     T6[0]
       = T2[0]
       * T4[0];
-    T3[i86]
+    T3[i87]
        = T6[0];
   }
 }
diff --git a/test/test_gpu2.cpp b/test/test_gpu2.cpp
index 6bccc21b9ce..d69dac4c9d1 100644
--- a/test/test_gpu2.cpp
+++ b/test/test_gpu2.cpp
@@ -9026,27 +9026,27 @@ TEST_F(NVFuserTest, FusionChannelsLastParser_CUDA) {
   // 2. use a fuzzy compare (ignore non-significant whitespaces for example)
   const std::string expected_kernel = R"(
 __global__ void CUDAGeneratedKernel(Tensor<__half, 4, 4> T0, Tensor<__half, 4, 4> T2, Tensor<__half, 4, 4> T7) {
-  int64_t i1201;
-  i1201 = T0.size[2] * T0.size[1];
-  int64_t i1204;
-  i1204 = ((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x));
-  int64_t i1206;
-  i1206 = (T0.size[1] * T0.size[2]) * T0.size[3];
-  int64_t i1238;
-  i1238 = i1204 % i1206;
-  int64_t i1215;
-  i1215 = T0.size[2] * T0.size[3];
+  int64_t i1202;
+  i1202 = T0.size[2] * T0.size[1];
+  int64_t i1205;
+  i1205 = ((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x));
+  int64_t i1207;
+  i1207 = (T0.size[1] * T0.size[2]) * T0.size[3];
   int64_t i1239;
-  i1239 = i1238 % i1215;
-  if ((i1204 < (((T0.size[0] * T0.size[1]) * T0.size[2]) * T0.size[3]))) {
+  i1239 = i1205 % i1207;
+  int64_t i1216;
+  i1216 = T0.size[2] * T0.size[3];
+  int64_t i1240;
+  i1240 = i1239 % i1216;
+  if ((i1205 < (((T0.size[0] * T0.size[1]) * T0.size[2]) * T0.size[3]))) {
     __half T9[1];
     T9[0] = 0;
     T9[0]
-       = T2[(((((i1201 * T0.size[3]) * (i1204 / i1206)) + (i1201 * (i1239 % T0.size[3]))) + (T0.size[2] * (i1238 / i1215))) + (i1239 / T0.size[3]))];
+       = T2[(((((i1202 * T0.size[3]) * (i1205 / i1207)) + (i1202 * (i1240 % T0.size[3]))) + (T0.size[2] * (i1239 / i1216))) + (i1240 / T0.size[3]))];
     __half T8[1];
     T8[0] = 0;
     T8[0]
-       = T0[i1204];
+       = T0[i1205];
     float T3[1];
     T3[0]
        = __half2float(T9[0]);
@@ -9066,7 +9066,7 @@ __global__ void CUDAGeneratedKernel(Tensor<__half, 4, 4> T0, Tensor<__half, 4, 4
     __half T10[1];
     T10[0]
        = __float2half(T6[0]);
-    T7[i1204]
+    T7[i1205]
        = T10[0];
   }
 }

From a4099e5990f3513cdea0c7ea00b706ba5084277f Mon Sep 17 00:00:00 2001
From: Xiang Gao <qasdfgtyuiop@gmail.com>
Date: Wed, 24 May 2023 11:25:02 -0700
Subject: [PATCH 11/12] doc

---
 csrc/scheduler/registry.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/csrc/scheduler/registry.h b/csrc/scheduler/registry.h
index 1763e87f1a7..b1ffc5b1d84 100644
--- a/csrc/scheduler/registry.h
+++ b/csrc/scheduler/registry.h
@@ -67,8 +67,9 @@ class TORCH_CUDA_CU_API SchedulerRuntimeInfo : public NonCopyable {
   size_t getAlignmentSize(TensorView* tv);
 
   // Gets maximum vectorizable width of tv, assumes we can merge across all
-  // iteration domains if contiguous. Cannot permute the dimensions to fix
-  // contiguity. Ignores dimensions that are broadcast or reduction.
+  // iteration domains if contiguous, unless contig_merge=false. Cannot permute
+  // the dimensions to fix contiguity. Ignores dimensions that are broadcast or
+  // reduction.
   size_t getMaxVectorizableWidth(TensorView* tv, bool contig_merge = true);
 
   // Computes alignment size in bytes for provided ptr address

From 344ef497a3a71fb10fdb8ea4eb3bde0c68b27278 Mon Sep 17 00:00:00 2001
From: Xiang Gao <qasdfgtyuiop@gmail.com>
Date: Wed, 24 May 2023 12:53:32 -0700
Subject: [PATCH 12/12] more vectorization

---
 csrc/scheduler/vectorize_helper.cpp | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/csrc/scheduler/vectorize_helper.cpp b/csrc/scheduler/vectorize_helper.cpp
index d1ada990770..26f99d0d5bd 100644
--- a/csrc/scheduler/vectorize_helper.cpp
+++ b/csrc/scheduler/vectorize_helper.cpp
@@ -1157,6 +1157,11 @@ int64_t getVectorizationSize(
     auto denominator = denominator_optional->as<int64_t>();
     auto extent = extent_optional->as<int64_t>();
 
+    // TODO: we should clean this up with expr simplifier
+    auto gcd = std::gcd(numerator, denominator);
+    numerator = numerator / gcd;
+    denominator = denominator / gcd;
+
     if (denominator != 1) {
       break;
     }
@@ -1228,7 +1233,7 @@ size_t getVectorizationFactor(
     return 1;
   }
 
-  size_t max_expand_size = SchedulerRuntimeInfo::max_alignment_size_in_byte;
+  size_t max_vec_size = SchedulerRuntimeInfo::max_alignment_size_in_byte;
   size_t common_alignment_size =
       SchedulerRuntimeInfo::max_alignment_size_in_byte;
 
@@ -1236,18 +1241,18 @@ size_t getVectorizationFactor(
     auto dtype_size =
         dataTypeSize(inp_or_out->dtype(), runtime_info.getIndexType());
 
-    max_expand_size = std::min(
-        max_expand_size,
+    max_vec_size = std::min(
+        max_vec_size,
         SchedulerRuntimeInfo::max_alignment_size_in_byte / dtype_size);
-    max_expand_size = std::min(
-        max_expand_size, runtime_info.getMaxVectorizableWidth(inp_or_out));
+    max_vec_size = std::min(
+        max_vec_size, runtime_info.getMaxVectorizableWidth(inp_or_out));
     common_alignment_size = std::min(
         common_alignment_size, runtime_info.getAlignmentSize(inp_or_out));
   }
 
   auto reference_map = vectorize_maps_entry.get().at(break_point);
   // Initialize to max the tensors could support.
-  size_t max_supported_vector_size = max_expand_size;
+  size_t max_supported_vector_size = max_vec_size;
   for (auto inp_or_out : vectorizable_inputs_outputs) {
     size_t contig_dim_size = getVectorizationSize(
         getContigVectorSizesOf(inp_or_out, reference_map),
@@ -1255,7 +1260,7 @@ size_t getVectorizationFactor(
     size_t local_max_vec_size = 1;
 
     while (contig_dim_size > 1 && contig_dim_size % 2 == 0 &&
-           local_max_vec_size < max_expand_size) {
+           local_max_vec_size < max_vec_size) {
       contig_dim_size /= 2;
       local_max_vec_size *= 2;
     }