From c1761a5e6223cf4f161c8e2318dd80c2aa71ff7e Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Thu, 18 May 2023 10:39:48 -0400
Subject: [PATCH 1/6] Add C++ and python tests for multiple slices, some empty

---
 python_tests/test_python_frontend.py | 58 ++++++++++++++++++++++++++++
 test/test_resize.cpp                 | 48 +++++++++++++++++++++++
 2 files changed, 106 insertions(+)

diff --git a/python_tests/test_python_frontend.py b/python_tests/test_python_frontend.py
index 0d50d6631f9..4a548a55e02 100644
--- a/python_tests/test_python_frontend.py
+++ b/python_tests/test_python_frontend.py
@@ -2182,6 +2182,64 @@ def fusion_func(fd: FusionDefinition, inps, matmul_fn) -> None:
         fc = FusionCache.get()
         fc.reset()
 
+    def test_multiple_empty_slices(self):
+        inputs = [
+            torch.testing.make_tensor((4,), dtype=torch.float32, device="cuda"),
+        ]
+
+        def fusion_func(fd: FusionDefinition):
+            T2 = fd.define_tensor(
+                symbolic_sizes=[-1],
+                contiguity=[True],
+                dtype=DataType.Float,
+                is_cpu=False,
+            )
+            # Perform a size-1 slice and a size-0 slice on T2. The size-1 slice
+            # could be size >1 with no change in the error. The order does not
+            # matter. Performing only one of these slices does not trigger the
+            # error and the output is correct in that case. If there are
+            # multiple size-0 slices the error is not triggered. It only seems
+            # to appear when there are both size-0 and size non-zero slices of
+            # the same tensor.
+            T3 = fd.ops.slice(T2, start_indices=[2], end_indices=[3], strides=[1])
+            T4 = fd.ops.slice(T2, start_indices=[1], end_indices=[1], strides=[1])
+            fd.add_output(T3)
+            fd.add_output(T4)
+
+        nvf_out, _ = self.exec_nvfuser(fusion_func, inputs)
+
+        self.assertEqual(inputs[0][2:3], nvf_out[0])
+        self.assertEqual(inputs[0][1:1], nvf_out[1])
+
+    def test_thunder420(self):
+        inputs = [
+            torch.testing.make_tensor((4, 6, 7), dtype=torch.float32, device="cuda"),
+        ]
+
+        def fusion_func(fd: FusionDefinition):
+            T2 = fd.define_tensor(
+                symbolic_sizes=[-1],
+                contiguity=[True],
+                dtype=DataType.Float,
+                is_cpu=False,
+            )
+            # Slice T2 into a non-empty slice [0, 4) and an empty slice [1, 1).
+            # Like test_multiple_empty_slices above, this exercises a size-0
+            # slice and a non-empty slice of the same tensor.
+            # NOTE(review): T2 is declared 1-D (symbolic_sizes=[-1]) but the
+            # test input is 3-D (4, 6, 7), and the checks below compare
+            # nvf_out[0] against inputs[0][2:3] rather than inputs[0][0:4];
+            # confirm the intended repro before merging.
+            T3 = fd.ops.slice(T2, start_indices=[0], end_indices=[4], strides=[1])
+            T4 = fd.ops.slice(T2, start_indices=[1], end_indices=[1], strides=[1])
+            fd.add_output(T3)
+            fd.add_output(T4)
+
+        nvf_out, _ = self.exec_nvfuser(fusion_func, inputs)
+
+        self.assertEqual(inputs[0][2:3], nvf_out[0])
+        self.assertEqual(inputs[0][1:1], nvf_out[1])
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_resize.cpp b/test/test_resize.cpp
index d8fe4cffa76..7ba1d16c368 100644
--- a/test/test_resize.cpp
+++ b/test/test_resize.cpp
@@ -2053,4 +2053,52 @@ TEST_F(NVFuserTest, ResizePermuteAndSlice_CUDA) {
       __FILE__);
 }
 
+// Test issue with multiple slices, one of which has size 0
+TEST_F(NVFuserTest, FusionResizeMultiSliceEmpty_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  std::vector<int64_t> shape({9});
+
+  // concrete shapes to avoid dynamic Fusion
+  auto tv0 = makeConcreteTensor(shape);
+  fusion->addInput(tv0);
+
+  // Perform a size-1 slice and a size-0 slice on tv0. The size-1 slice
+  // could be size >1 with no change in the error. The order does not
+  // matter. Performing only one of these slices does not trigger the
+  // error and the output is correct in that case. If there are
+  // multiple size-0 slices the error is not triggered. It only seems
+  // to appear when there are both size-0 and size non-zero slices of
+  // the same tensor.
+  auto tv1 = slice(
+      tv0,
+      {{IrBuilder::create<Int>(0),
+        IrBuilder::create<Int>(1),
+        IrBuilder::create<Int>(1)}});
+  fusion->addOutput(tv1);
+  auto tv2 = slice(
+      tv0,
+      {{IrBuilder::create<Int>(0),
+        IrBuilder::create<Int>(0),
+        IrBuilder::create<Int>(1)}});
+  fusion->addOutput(tv2);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+
+  auto t0 = at::randn(shape, options);
+  std::vector<c10::IValue> aten_inputs({t0});
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
+  FusionExecutor fe;
+
+  auto ref0 = t0.index({at::indexing::Slice(0, 1)});
+  auto ref1 = t0.index({at::indexing::Slice(0, 0)});
+
+  TORCH_CHECK(ref0.equal(cg_outputs[0]));
+  TORCH_CHECK(ref1.equal(cg_outputs[1]));
+}
+
 } // namespace nvfuser

From 76c80113d31130a5d93b2efa8709985c2f0cde03 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Thu, 18 May 2023 10:56:23 -0400
Subject: [PATCH 2/6] Remove numerator!=0 check.

Note that this is not really safe as it will set zero_ = false even if
it's still zero because of zero input numerator.
---
 csrc/scheduler/vectorize_helper.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/csrc/scheduler/vectorize_helper.h b/csrc/scheduler/vectorize_helper.h
index afd8e601a1d..cb653f883f6 100644
--- a/csrc/scheduler/vectorize_helper.h
+++ b/csrc/scheduler/vectorize_helper.h
@@ -71,12 +71,6 @@ class TORCH_CUDA_CU_API ProjectedExtent {
   // Multiply numerator by provided value, or if currently zero set numerator to
   // provided value.
   void multiplyNumeratorValue(Val* new_numerator_val) {
-    TORCH_INTERNAL_ASSERT(
-        !new_numerator_val->isZeroInt() &&
-            (!new_numerator_val->isConstInt() ||
-                new_numerator_val->evaluateInt() > 0),
-        "Adding numerator value of zero not supported in ProjectedExtent.");
-
     zero_ = false;
     if (new_numerator_val->isConstInt()) {
       const_numerator_vals_.push_back(new_numerator_val->evaluateInt());

From 86da47c6644400db4a705adc54baca5e131aa6b3 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Thu, 18 May 2023 10:57:19 -0400
Subject: [PATCH 3/6] Remove split/merge size-0 checks. See #367.

---
 csrc/ir_nodes.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/csrc/ir_nodes.cpp b/csrc/ir_nodes.cpp
index 4c4634841a4..e42998b00d9 100644
--- a/csrc/ir_nodes.cpp
+++ b/csrc/ir_nodes.cpp
@@ -2250,9 +2250,6 @@ std::vector<IterDomain*> IterDomain::clone(
 // domains have valid start and stop, it's not possible to contiguous
 // predication.
 IterDomain* IterDomain::merge(IterDomain* outer, IterDomain* inner) {
-  TORCH_CHECK(
-      !outer->extent()->isZeroInt() && !inner->extent()->isZeroInt(),
-      "Merging IterDomains with ending values that are 0 is not supported at this time.");
   TORCH_CHECK(
       outer->isReduction() == inner->isReduction(),
       "Merging IterDomains requires that their iteration types match. ",
@@ -2330,10 +2327,6 @@ std::pair<IterDomain*, IterDomain*> IterDomain::split(
     bool inner_split,
     Val* start_offset,
     Val* stop_offset) {
-  TORCH_CHECK(
-      !in->extent()->isZeroInt(),
-      "Splitting IterDomains with ending values that are 0 is not supported at this time.");
-
   TORCH_CHECK(
       factor->isIntegralScalar(), "Cannot split by non-integer value ", factor);
 

From 8f18ffece71ffa2cb5b4d2d45e5b586accdd41c4 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Thu, 18 May 2023 12:09:25 -0400
Subject: [PATCH 4/6] Continue if constant zero in multiplyNumeratorValue

---
 csrc/scheduler/vectorize_helper.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/csrc/scheduler/vectorize_helper.h b/csrc/scheduler/vectorize_helper.h
index cb653f883f6..4fcfdcbd9e0 100644
--- a/csrc/scheduler/vectorize_helper.h
+++ b/csrc/scheduler/vectorize_helper.h
@@ -71,6 +71,13 @@ class TORCH_CUDA_CU_API ProjectedExtent {
   // Multiply numerator by provided value, or if currently zero set numerator to
   // provided value.
   void multiplyNumeratorValue(Val* new_numerator_val) {
+    if (new_numerator_val->isZeroInt() ||
+        (new_numerator_val->isConstInt() &&
+         new_numerator_val->evaluateInt() == 0)) {
+      // Multiplying by zero: return early so that zero_ is not cleared below
+      return;
+    }
+
     zero_ = false;
     if (new_numerator_val->isConstInt()) {
       const_numerator_vals_.push_back(new_numerator_val->evaluateInt());

From 9b77b776cc29c7c86c37fbb0dbb3a8b6fb11bf69 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Fri, 19 May 2023 14:43:00 -0400
Subject: [PATCH 5/6] Use FusionExecutor instead of FEC for easier debugging

---
 test/test_resize.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/test/test_resize.cpp b/test/test_resize.cpp
index fe269bfb53c..b27180b7fb4 100644
--- a/test/test_resize.cpp
+++ b/test/test_resize.cpp
@@ -2205,9 +2205,11 @@ TEST_F(NVFuserTest, FusionResizeMultiSliceEmpty_CUDA) {
   auto t0 = at::randn(shape, options);
   std::vector<c10::IValue> aten_inputs({t0});
 
-  FusionExecutorCache executor_cache(std::move(fusion));
-  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
+  auto params = *getPointwiseHeuristics(fusion.get(), aten_inputs);
+  schedulePointwise(fusion.get(), params);
   FusionExecutor fe;
+  fe.compileFusion(fusion.get(), aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
 
   auto ref0 = t0.index({at::indexing::Slice(0, 1)});
   auto ref1 = t0.index({at::indexing::Slice(0, 0)});
@@ -2216,5 +2218,4 @@ TEST_F(NVFuserTest, FusionResizeMultiSliceEmpty_CUDA) {
   TORCH_CHECK(ref0.equal(cg_outputs[0]));
   TORCH_CHECK(ref1.equal(cg_outputs[1]));
 }
-
 } // namespace nvfuser

From 5565688f04189e3943e34b7da0f251fef87a9856 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Tue, 23 May 2023 09:58:56 -0400
Subject: [PATCH 6/6] Guard against multiplying by zero extents in
 ContiguousInnerDimensionsMapper

---
 csrc/scheduler/vectorize_helper.cpp | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/csrc/scheduler/vectorize_helper.cpp b/csrc/scheduler/vectorize_helper.cpp
index ca2fcf27b86..62753c1f6c8 100644
--- a/csrc/scheduler/vectorize_helper.cpp
+++ b/csrc/scheduler/vectorize_helper.cpp
@@ -218,9 +218,16 @@ ContiguousInnerDimensionsMapper::ContiguousInnerDimensionsMapper(
     if (std::find(reference_ids.begin(), reference_ids.end(), id) !=
         reference_ids.end()) {
       reordered_rfactor.push_back(id);
-      // Initiailze the extent for the mapped iter domain
+      // Initialize the extent for the mapped iter domain
       ProjectedExtent pe;
-      pe.multiplyNumeratorValue(commonOrConstExtent(ca_map_, id));
+      auto ext = commonOrConstExtent(ca_map_, id);
+      if (ext->isConstInt() && ext->evaluateInt() == 0) {
+        // A size-zero extent ID will be predicated out always, so it should
+        // not affect the projected extent calculation
+        continue;
+      } else {
+        pe.multiplyNumeratorValue(ext);
+      }
       addProjectedExtent(id, pe);
     } else if (!id->isBroadcast()) {
       // Ignore broadcasts in the reference. Otherwise, remove non-contiguous
@@ -239,9 +246,16 @@ ContiguousInnerDimensionsMapper::ContiguousInnerDimensionsMapper(
     if (std::find(reference_ids.begin(), reference_ids.end(), id) !=
         reference_ids.end()) {
       reordered_root.push_back(id);
-      // Initiailze the extent for the mapped iter domain
+      // Initialize the extent for the mapped iter domain
       ProjectedExtent pe;
-      pe.multiplyNumeratorValue(commonOrConstExtent(ca_map_, id));
+      auto ext = commonOrConstExtent(ca_map_, id);
+      if (ext->isConstInt() && ext->evaluateInt() == 0) {
+        // A size-zero extent ID will be predicated out always, so it should
+        // not affect the projected extent calculation
+        continue;
+      } else {
+        pe.multiplyNumeratorValue(ext);
+      }
       addProjectedExtent(id, pe);
     } else if (!id->isBroadcast()) {
       // Ignore broadcasts in the reference. Otherwise, remove non-contiguous