From 4ae4f09a5d87f168408027b0a1231a8624ff4ff0 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 28 Apr 2023 15:01:14 -0400 Subject: [PATCH 01/49] First draft of handling Resize in DynamicTransformInfoBuilder --- csrc/dynamic_transform.cpp | 45 ++++++++++++++++++++++++++++++++++++++ csrc/dynamic_transform.h | 7 ++++++ 2 files changed, 52 insertions(+) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index c7e9823d2c9..a5de7698c68 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -30,6 +30,9 @@ class DynamicTransformInfoBuilder : public IterVisitor { // Analyze a dynamic reshape and generate AnalyzeViewResult void handle(ViewOp* op) override; + // Analyze a dynamic resize + void handle(Resize* op) override; + const auto& getInfo() const { return info_; } @@ -186,6 +189,48 @@ void DynamicTransformInfoBuilder::handle(ViewOp* op) { info_.reshape_transforms_.emplace_back(out_tv, view_result); } +void DynamicTransformInfoBuilder::handle(Resize* op) { + auto out_id = op->out()->as(); + + // If the input is not symbolic, and the expansion sizes are static, this is + // a static resize + if (out_id->getIterType() != IterType::Symbolic) { + return; + } + + auto out_extent_val = expr_eval_->evaluate(out_id->extent()); + TORCH_INTERNAL_ASSERT( + out_extent_val.has_value(), + "Cannot evaluate the extent of a resized IterDomain: ", + out_id->toString()); + + auto in_id = op->in()->as(); + auto in_extent_val = expr_eval_->evaluate(in_id->extent()); + TORCH_INTERNAL_ASSERT( + in_extent_val.has_value(), + "Cannot evaluate the extent of input to an IterDomain resize: ", + in_id->toString()); + + auto left = op->leftExpand()->as(); + auto left_val = expr_eval_->evaluate(left); + TORCH_INTERNAL_ASSERT( + left_val.has_value(), + "Cannot evaluate the left expansion of an IterDomain resize: ", + left_val->toString()); + + auto right = op->rightExpand()->as(); + auto right_val = expr_eval_->evaluate(right); + TORCH_INTERNAL_ASSERT( + 
right_val.has_value(), + "Cannot evaluate the right expansion of an IterDomain resize: ", + right_val->toString()); + + auto out_itertype = out_extent_val->as() == 1 ? IterType::Broadcast + : IterType::Iteration; + + info_.resize_transforms_.emplace_back(out_id, out_itertype); +} + //! Concretize a symbolic fusion with concrete transformation info class DynamicTransformConcretizer : public OptOutMutator { public: diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index 4d4b6477218..a115eb6f092 100644 --- a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -53,8 +53,15 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { private: Fusion* fusion_ = nullptr; + + // Holds, for each dynamic reshape, the output TensorView, and the result of + // analyzeView std::vector> reshape_transforms_; + // Holds the resized IterDomain along with the concretized left and right + // expansion sizes + std::vector> resize_transforms_; + friend class DynamicTransformInfoBuilder; }; From 4d75a37254eee3553a0a9b6a7a04f8bd2d7b0ea2 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 28 Apr 2023 15:49:14 -0400 Subject: [PATCH 02/49] Concretize resizes, clone resize transforms --- csrc/dynamic_transform.cpp | 36 +++++++++++++++++++++++++++++++++--- csrc/dynamic_transform.h | 7 ++++++- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index a5de7698c68..6cb5e0e2a7d 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -95,6 +95,13 @@ DynamicTransformConcretizationInfo DynamicTransformConcretizationInfo::clone( // Statements that would need cloning, only integer indices of axes. 
pair.second); } + for (auto& pair : resize_transforms_) { + cloned_info.resize_transforms_.emplace_back( + ir_cloner.clone(pair.first), + // Similar to reshape_transforms_, we only clone the IterDomains in + // resize_transforms_ + pair.second); + } return cloned_info; } @@ -216,14 +223,14 @@ void DynamicTransformInfoBuilder::handle(Resize* op) { TORCH_INTERNAL_ASSERT( left_val.has_value(), "Cannot evaluate the left expansion of an IterDomain resize: ", - left_val->toString()); + left->toString()); auto right = op->rightExpand()->as(); auto right_val = expr_eval_->evaluate(right); TORCH_INTERNAL_ASSERT( right_val.has_value(), "Cannot evaluate the right expansion of an IterDomain resize: ", - right_val->toString()); + right->toString()); auto out_itertype = out_extent_val->as() == 1 ? IterType::Broadcast : IterType::Iteration; @@ -249,6 +256,8 @@ class DynamicTransformConcretizer : public OptOutMutator { void concretizeReshape(); + void concretizeResize(); + using OptOutMutator::mutate; void mutate(TensorView* tv) final; @@ -268,7 +277,10 @@ void DynamicTransformConcretizer::concretize() { // First, concretize all dynamic reshape ops concretizeReshape(); - // Second, propagate concretized domains + // Set output IterTypes for dynamic resize ops + concretizeResize(); + + // Finally, propagate concretized domains auto all_stmts = StmtSort::getStmts(info_.fusion(), false); for (auto stmt : all_stmts) { if (stmt->isA()) { @@ -302,6 +314,20 @@ void DynamicTransformConcretizer::concretizeReshape() { } } +void DynamicTransformConcretizer::concretizeResize() { + // Concretize each resize op. + for (const auto& kv : info_.getResizeTransforms()) { + auto id = kv.first; + auto iter_type = kv.second; + + // swap in new IterDomain as output of the resize Expr + ir_utils::replaceValInExpr( + id->definition(), + id, + IterDomainBuilder(id).iter_type(iter_type).build()); + } +} + // Concretizes inherited symbolic domains. 
Note that when this is // called, it is assumed that all dynamic ops themselves are // concretized. Since symbolic IDs may be propagated down to @@ -501,6 +527,10 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( } } + TORCH_INTERNAL_ASSERT( + id_type.has_value(), + "Did not find id_type. Perhaps TensorView def has no inputs."); + TORCH_INTERNAL_ASSERT( id_type != IterType::Symbolic, "Failed to concretize ", diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index a115eb6f092..ed0cb6a0cf3 100644 --- a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -32,6 +32,11 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { return reshape_transforms_; } + const std::vector> getResizeTransforms() + const { + return resize_transforms_; + } + bool operator==(const DynamicTransformConcretizationInfo& other) const; bool operator!=(const DynamicTransformConcretizationInfo& other) const { @@ -60,7 +65,7 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { // Holds the resized IterDomain along with the concretized left and right // expansion sizes - std::vector> resize_transforms_; + std::vector> resize_transforms_; friend class DynamicTransformInfoBuilder; }; From a027f2b98795ddf6d0f9886d0aaa3daf29b64911 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 28 Apr 2023 15:58:02 -0400 Subject: [PATCH 03/49] Set resized IterDomains to symbolic if extent is non-constant --- csrc/ir_nodes.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/csrc/ir_nodes.cpp b/csrc/ir_nodes.cpp index 9626fa3e16e..19676266d4a 100644 --- a/csrc/ir_nodes.cpp +++ b/csrc/ir_nodes.cpp @@ -2478,10 +2478,12 @@ IterDomain* IterDomain::resize( right_expansion); } + auto is_symbolic = !resized_id_size->isConstInt(); + auto resized_id = IterDomainBuilder(in->container()->zeroVal(), resized_id_size->as()) .is_rfactor_domain(mark_as_rfactor) - .iter_type(in->getIterType()) + .iter_type(is_symbolic ? 
IterType::Symbolic : in->getIterType()) .build(); IrBuilder::create( From 5de8af56f8fffcaca0437c61e8a873051c0bc4ab Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 28 Apr 2023 16:01:02 -0400 Subject: [PATCH 04/49] Set resized id to symbolic, broadcast, or iteration --- csrc/ir_nodes.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/csrc/ir_nodes.cpp b/csrc/ir_nodes.cpp index 19676266d4a..94d75f19bc4 100644 --- a/csrc/ir_nodes.cpp +++ b/csrc/ir_nodes.cpp @@ -2478,12 +2478,19 @@ IterDomain* IterDomain::resize( right_expansion); } + auto iter_type = in->getIterType(); auto is_symbolic = !resized_id_size->isConstInt(); + if (symbolic) { + iter_type = IterType::Symbolic; + } else { + auto extent_val = resized_id_size->getInt().value(); + iter_type = extent_val == 1 ? IterType::Broadcast : IterType::Iteration; + } auto resized_id = IterDomainBuilder(in->container()->zeroVal(), resized_id_size->as()) .is_rfactor_domain(mark_as_rfactor) - .iter_type(is_symbolic ? IterType::Symbolic : in->getIterType()) + .iter_type(iter_type) .build(); IrBuilder::create( From 8aa38e5ce21802f426866e92a0c7f51d55ff1b38 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 28 Apr 2023 20:56:46 -0400 Subject: [PATCH 05/49] Return references for getRe{shape,size}Transforms() --- csrc/dynamic_transform.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index ed0cb6a0cf3..13c2c03c169 100644 --- a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -27,12 +27,12 @@ class DynamicTransformInfoBuilder; //! 
of the fusion inputs class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { public: - const std::vector> + const std::vector>& getReshapeTransforms() const { return reshape_transforms_; } - const std::vector> getResizeTransforms() + const std::vector>& getResizeTransforms() const { return resize_transforms_; } From 840b5eef33c331654b12ec10dc65f68358af1002 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 28 Apr 2023 20:57:08 -0400 Subject: [PATCH 06/49] Improve logic for determining concrete resize IterType This should probably be refactored and reused in the original op in case the inputs are all constant at definition. --- csrc/ir_nodes.cpp | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/csrc/ir_nodes.cpp b/csrc/ir_nodes.cpp index 94d75f19bc4..fc34b567367 100644 --- a/csrc/ir_nodes.cpp +++ b/csrc/ir_nodes.cpp @@ -2479,12 +2479,25 @@ IterDomain* IterDomain::resize( } auto iter_type = in->getIterType(); - auto is_symbolic = !resized_id_size->isConstInt(); - if (symbolic) { - iter_type = IterType::Symbolic; + if (resized_id_size->isConstInt()) { + auto in_extent = in->extent()->getInt().value(); + auto out_extent = resized_id_size->getInt().value(); + auto left = left_expansion->getInt().value(); + auto right = right_expansion->getInt().value(); + TORCH_CHECK(out_extent >= 0, "Resized extent must be non-negative."); + if ( + // negative padding sums to input extent. Output is zero-dimensional + out_extent == 0 || + // input overlaps output + left + in_extent > 0 || right + in_extent > 0) { + iter_type = IterType::Iteration; + } else { + // Result is zero-dimensional, broadcast, or input doesn't overlap output + // In these cases, the output is just the broadcasted pad value + iter_type = IterType::Broadcast; + } } else { - auto extent_val = resized_id_size->getInt().value(); - iter_type = extent_val == 1 ? 
IterType::Broadcast : IterType::Iteration; + iter_type = IterType::Symbolic; } auto resized_id = From acfd7ee628211b8b8276b72bfc008c11d87aef82 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 1 May 2023 08:29:56 -0400 Subject: [PATCH 07/49] Add note about -1 issue with link --- test/test_dynamic_transform.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 2c6f9491e98..8a293169ec3 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -838,7 +838,9 @@ TEST_F(NVFuserTest, FusionDynamicReshapeReductionShmoo_CUDA) { {8, 3, 4 * 7, 5}, false}, // merge(1) merge(2) osplit(1, 3) {{8, 3 * 5, 7, 9}, {8, 3, 5 * 7, 9}, false}, // merge(1) osplit(1, 3) + // test passing -1 dynamically for dimension size + // This currently fails. see https://github.com/NVIDIA/Fuser/issues/249 //{{8, 3 * 5, 7, 9}, {8, 3, -1, 9}, false} // merge(1) osplit(1, 3) }; reductionDynamicViewAddFusion( From c60a13d2e1a886992b0368c57d0b90073baab7a7 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 1 May 2023 10:36:29 -0400 Subject: [PATCH 08/49] Add dynamic pad shmoo test --- test/test_dynamic_transform.cpp | 134 ++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 8a293169ec3..05dd52e2c6c 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -847,4 +847,138 @@ TEST_F(NVFuserTest, FusionDynamicReshapeReductionShmoo_CUDA) { invocations, true /* reshape_before_reduction */); } +using dynamic_pad_invocation = std::tuple< + std::vector, // input_shape + std::vector, // pad_widths + bool // expect miss + >; + +void reductionDynamicPadAddFusion( + std::vector& invocations, + bool pad_before_reduction) { + constexpr int kReductionAxis = -1; + + auto input_shape = std::get<0>(invocations[0]); + auto pad_widths = std::get<1>(invocations[0]); + + std::unique_ptr 
fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); + FusionGuard fg(&fusion); + + auto bias_dims = + pad_before_reduction ? input_shape.size() : input_shape.size() - 1; + + // TODO: change symbolic size for padded dimension if start size is 1 + TensorView* x = makeSymbolicTensor(input_shape.size()); + TensorView* bias = makeSymbolicTensor(bias_dims); + fusion.addInput(x); + fusion.addInput(bias); + + auto tv1 = (pad_before_reduction) ? add(x, bias) : sum(x, {kReductionAxis}); + std::vector pad_width_vals(pad_widths.size()); + for (auto i : c10::irange(pad_widths.size())) { + pad_width_vals[i] = IrBuilder::create(); + fusion.addInput(pad_width_vals[i]); + } + auto x_pad = pad(tv1, pad_width_vals); + auto y = + (pad_before_reduction) ? sum(x_pad, {kReductionAxis}) : add(x_pad, bias); + fusion.addOutput(y); + + FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + + // Return pair of: number of concretizations & total number of kernel runtimes + auto countConcretizations = [&fusion_executor_cache]() { + std::unordered_set>*> + concs; + for (auto& it : fusion_executor_cache.getKernelRuntimes()) { + concs.insert(&it.first); + } + return concs.size(); + }; + size_t num_concretizations = countConcretizations(); + // Check that concretizations and runtimes are cache misses only when they + // should be + auto checkCache = [&countConcretizations, + &num_concretizations](bool expect_miss) { + auto current = countConcretizations(); + ASSERT_EQ(current, num_concretizations + (size_t)expect_miss); + num_concretizations = current; + }; + + for (auto& inv : invocations) { + auto pad_widths = std::get<0>(inv); + auto start_extent = std::get<1>(inv); + auto expect_miss = std::get<2>(inv); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor at_x = at::randn(input_shape, options); + std::vector bias_shape(input_shape); + if (!pad_before_reduction) { + // remove last dimension due to reduction + 
bias_shape.resize(bias_shape.size() - 1); + } + if (!pad_before_reduction) { + // When bias_shape = output_shape, it may contain -1s + // concretize bias_shape so that we can properly initialize at_bias + size_t other_numel = 1; + ssize_t negone_dim = -1; // negative if no -1 shape is provided + for (auto i : c10::irange(bias_shape.size())) { + if (bias_shape[i] == -1) { + ASSERT_EQ(negone_dim, -1); // test cases should not have multiple -1s + negone_dim = -1; + } else { + other_numel *= bias_shape[i]; + } + } + if (negone_dim >= 0) { + bias_shape[negone_dim] = at_x.numel() / other_numel; + } + } + at::Tensor at_bias = at::randn(bias_shape, options); + std::vector aten_inputs = {at_x, at_bias}; + // Add input scalars describing the reshape size for concretization + for (int i : c10::irange(pad_widths.size())) { + aten_inputs.push_back(pad_widths[i]); + } + + auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + checkCache(expect_miss); + + auto at_tv1 = (pad_before_reduction) ? (at_x + at_bias) + : at::sum(at_x, kReductionAxis); + auto at_x_reshape = at::native::view(at_tv1, bias_shape); + auto at_y = (pad_before_reduction) ? at::sum(at_x_reshape, kReductionAxis) + : at::add(at_x_reshape, at_bias); + + testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); + } +} + +// Test dynamic pad for various inputs +TEST_F(NVFuserTest, DynamicPadShmoo_CUDA) { + auto invocations = std::vector{ + {{3, 5}, {0, 0}, true}, // trivial + //{{-1, 1}, 5, false}, // shift by one. 
Re-uses Iteration + /* + {{8, 3 * 4, 7, 9}, {8, 3 * 4, 7, 9}, true}, // trivial + {{8, 3 * 4, 7, 5}, {8, 3 * 4, 7, 5}, false}, // trivial + {{8, 3 * 4, 7, 9}, {8, 3, 4, 7 * 9}, true}, // merge(2) osplit(1, 3) + {{8, 3 * 4, 7, 9}, + {8, 3, 4 * 7, 9}, + true}, // merge(1) merge(2) osplit(1, 3) + {{8, 3 * 4, 7, 5}, + {8, 3, 4 * 7, 5}, + false}, // merge(1) merge(2) osplit(1, 3) + {{8, 3 * 5, 7, 9}, {8, 3, 5 * 7, 9}, false}, // merge(1) osplit(1, 3) + // test passing -1 dynamically for dimension size + //{{8, 3 * 5, 7, 9}, {8, 3, -1, 9}, false} // merge(1) osplit(1, 3) + */ + }; + reductionDynamicPadAddFusion(invocations, true /* pad_before_reduction */); +} + } // namespace nvfuser From 45b60e6fe64749fe9d01c16cbb07c3db65490073 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 1 May 2023 10:39:09 -0400 Subject: [PATCH 09/49] Hold TensorDomain for resizes in order to replace in output rootdomain --- csrc/dynamic_transform.cpp | 142 ++++++++++++++++++++++--------------- csrc/dynamic_transform.h | 11 +-- 2 files changed, 91 insertions(+), 62 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 6cb5e0e2a7d..8a4354674e8 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -30,8 +30,9 @@ class DynamicTransformInfoBuilder : public IterVisitor { // Analyze a dynamic reshape and generate AnalyzeViewResult void handle(ViewOp* op) override; - // Analyze a dynamic resize - void handle(Resize* op) override; + // We handle IterDomain "Resize" ops at TensorDomain level + void handle(TensorDomain* td) override; + void handle(TensorView* tv) override; const auto& getInfo() const { return info_; @@ -95,12 +96,13 @@ DynamicTransformConcretizationInfo DynamicTransformConcretizationInfo::clone( // Statements that would need cloning, only integer indices of axes. 
pair.second); } - for (auto& pair : resize_transforms_) { + for (auto& tx : resize_transforms_) { cloned_info.resize_transforms_.emplace_back( - ir_cloner.clone(pair.first), - // Similar to reshape_transforms_, we only clone the IterDomains in - // resize_transforms_ - pair.second); + ir_cloner.clone(std::get<0>(tx)), + ir_cloner.clone(std::get<1>(tx)), + // Similar to reshape_transforms_, we only clone the TensorDomains and + // IterDomains in resize_transforms_ + std::get<2>(tx)); } return cloned_info; } @@ -114,9 +116,71 @@ std::string DynamicTransformConcretizationInfo::toString() const { ss << indent << indent << kv.first->toString() << ", " << kv.second.toString() << "\n"; } + ss << indent << "Resize:\n"; + for (const auto& kv : resize_transforms_) { + ss << indent << indent << std::get<0>(kv)->toString() << ", " + << std::get<1>(kv)->toString() << ", " << std::get<2>(kv) << "\n"; + } return ss.str(); } +void DynamicTransformInfoBuilder::handle(TensorDomain* td) { + std::cout << "Handling TensorDomain " << td->toString() << std::endl; + auto rootd = td->getRootDomain(); + for (auto id : rootd) { + if (id->getIterType() == IterType::Symbolic && id->definition()) { + auto def = id->definition(); + if (def->isA()) { + auto op = def->as(); + + auto out_extent_val = expr_eval_->evaluate(id->extent()); + TORCH_INTERNAL_ASSERT( + out_extent_val.has_value(), + "Cannot evaluate the extent of a resized IterDomain: ", + id->toString()); + + auto in_id = op->in()->as(); + auto in_extent_val = expr_eval_->evaluate(in_id->extent()); + TORCH_INTERNAL_ASSERT( + in_extent_val.has_value(), + "Cannot evaluate the extent of input to an IterDomain resize: ", + in_id->toString()); + + auto left = op->leftExpand()->as(); + auto left_val = expr_eval_->evaluate(left); + TORCH_INTERNAL_ASSERT( + left_val.has_value(), + "Cannot evaluate the left expansion of an IterDomain resize: ", + left->toString()); + + auto right = op->rightExpand()->as(); + auto right_val = 
expr_eval_->evaluate(right); + TORCH_INTERNAL_ASSERT( + right_val.has_value(), + "Cannot evaluate the right expansion of an IterDomain resize: ", + right->toString()); + + auto out_itertype = out_extent_val->as() == 1 + ? IterType::Broadcast + : IterType::Iteration; + + info_.resize_transforms_.emplace_back( + // std::make_tuple( + td, + id, + out_itertype + //) + ); + } + } + } +} + +void DynamicTransformInfoBuilder::handle(TensorView* tv) { + std::cout << "Handling TensorView " << tv->toString() << std::endl; + handle(tv->domain()); +} + void DynamicTransformInfoBuilder::handle(ViewOp* op) { auto inp_tv = op->in()->as(); auto out_tv = op->out()->as(); @@ -196,48 +260,6 @@ void DynamicTransformInfoBuilder::handle(ViewOp* op) { info_.reshape_transforms_.emplace_back(out_tv, view_result); } -void DynamicTransformInfoBuilder::handle(Resize* op) { - auto out_id = op->out()->as(); - - // If the input is not symbolic, and the expansion sizes are static, this is - // a static resize - if (out_id->getIterType() != IterType::Symbolic) { - return; - } - - auto out_extent_val = expr_eval_->evaluate(out_id->extent()); - TORCH_INTERNAL_ASSERT( - out_extent_val.has_value(), - "Cannot evaluate the extent of a resized IterDomain: ", - out_id->toString()); - - auto in_id = op->in()->as(); - auto in_extent_val = expr_eval_->evaluate(in_id->extent()); - TORCH_INTERNAL_ASSERT( - in_extent_val.has_value(), - "Cannot evaluate the extent of input to an IterDomain resize: ", - in_id->toString()); - - auto left = op->leftExpand()->as(); - auto left_val = expr_eval_->evaluate(left); - TORCH_INTERNAL_ASSERT( - left_val.has_value(), - "Cannot evaluate the left expansion of an IterDomain resize: ", - left->toString()); - - auto right = op->rightExpand()->as(); - auto right_val = expr_eval_->evaluate(right); - TORCH_INTERNAL_ASSERT( - right_val.has_value(), - "Cannot evaluate the right expansion of an IterDomain resize: ", - right->toString()); - - auto out_itertype = out_extent_val->as() == 1 
? IterType::Broadcast - : IterType::Iteration; - - info_.resize_transforms_.emplace_back(out_id, out_itertype); -} - //! Concretize a symbolic fusion with concrete transformation info class DynamicTransformConcretizer : public OptOutMutator { public: @@ -270,7 +292,6 @@ class DynamicTransformConcretizer : public OptOutMutator { private: const DynamicTransformConcretizationInfo& info_; - std::unordered_map update_map_; }; void DynamicTransformConcretizer::concretize() { @@ -316,15 +337,22 @@ void DynamicTransformConcretizer::concretizeReshape() { void DynamicTransformConcretizer::concretizeResize() { // Concretize each resize op. - for (const auto& kv : info_.getResizeTransforms()) { - auto id = kv.first; - auto iter_type = kv.second; + for (const auto& resize_info : info_.getResizeTransforms()) { + auto td = std::get<0>(resize_info); + auto id = std::get<1>(resize_info); + auto iter_type = std::get<2>(resize_info); + + auto new_id = IterDomainBuilder(id).iter_type(iter_type).build(); // swap in new IterDomain as output of the resize Expr - ir_utils::replaceValInExpr( - id->definition(), - id, - IterDomainBuilder(id).iter_type(iter_type).build()); + ir_utils::replaceValInExpr(id->definition(), id, new_id); + + // replace id with new_id in root domain of output TensorDomain + auto rootd = td->getRootDomain(); + for (auto root_id : td->getRootDomain()) { + if (root_id == id) { + } + } } } diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index 13c2c03c169..be4cee92f02 100644 --- a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -32,8 +32,8 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { return reshape_transforms_; } - const std::vector>& getResizeTransforms() - const { + const std::vector>& + getResizeTransforms() const { return resize_transforms_; } @@ -63,9 +63,10 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { // analyzeView std::vector> reshape_transforms_; - // Holds the resized IterDomain along with the 
concretized left and right - // expansion sizes - std::vector> resize_transforms_; + // Holds the resized IterDomain (output of the Resize op) along with the + // TensorDomain where it appears, and the concretized IterType + std::vector> + resize_transforms_; friend class DynamicTransformInfoBuilder; }; From 1b84942f9f210efc868481229b28294bbf5fc6e7 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 1 May 2023 12:03:46 -0400 Subject: [PATCH 10/49] Fix issues in concretization of resize Resized IterDomains are now replaced, but I am hitting a scheduling error: ``` exception with description "!replay_has_rfactor_inp INTERNAL ASSERT FAILED at "/opt/pytorch/nvfuser/csrc/transform_iter.cpp":519, please report a bug to PyTorch. Error during replay, a transformation was called that conflicts with an rfactor call. Exception raised from BestEffortReplay at /opt/pytorch/nvfuser/csrc/transform_iter.cpp:519 (most recent call first): ``` --- csrc/dynamic_transform.cpp | 73 +++++++++++++++++++++------------ csrc/dynamic_transform.h | 6 +-- csrc/kernel_cache.cpp | 9 ++++ test/test_dynamic_transform.cpp | 58 ++++++-------------------- 4 files changed, 71 insertions(+), 75 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 8a4354674e8..7a00cfd5102 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -30,8 +30,7 @@ class DynamicTransformInfoBuilder : public IterVisitor { // Analyze a dynamic reshape and generate AnalyzeViewResult void handle(ViewOp* op) override; - // We handle IterDomain "Resize" ops at TensorDomain level - void handle(TensorDomain* td) override; + // We handle IterDomain "Resize" ops at TensorView level void handle(TensorView* tv) override; const auto& getInfo() const { @@ -100,7 +99,7 @@ DynamicTransformConcretizationInfo DynamicTransformConcretizationInfo::clone( cloned_info.resize_transforms_.emplace_back( ir_cloner.clone(std::get<0>(tx)), ir_cloner.clone(std::get<1>(tx)), - // Similar to 
reshape_transforms_, we only clone the TensorDomains and + // Similar to reshape_transforms_, we only clone the TensorViews and // IterDomains in resize_transforms_ std::get<2>(tx)); } @@ -124,10 +123,9 @@ std::string DynamicTransformConcretizationInfo::toString() const { return ss.str(); } -void DynamicTransformInfoBuilder::handle(TensorDomain* td) { - std::cout << "Handling TensorDomain " << td->toString() << std::endl; - auto rootd = td->getRootDomain(); - for (auto id : rootd) { +void DynamicTransformInfoBuilder::handle(TensorView* tv) { + auto rfd = tv->domain()->getMaybeRFactorDomain(); + for (auto id : rfd) { if (id->getIterType() == IterType::Symbolic && id->definition()) { auto def = id->definition(); if (def->isA()) { @@ -164,23 +162,12 @@ void DynamicTransformInfoBuilder::handle(TensorDomain* td) { ? IterType::Broadcast : IterType::Iteration; - info_.resize_transforms_.emplace_back( - // std::make_tuple( - td, - id, - out_itertype - //) - ); + info_.resize_transforms_.emplace_back(tv, id, out_itertype); } } } } -void DynamicTransformInfoBuilder::handle(TensorView* tv) { - std::cout << "Handling TensorView " << tv->toString() << std::endl; - handle(tv->domain()); -} - void DynamicTransformInfoBuilder::handle(ViewOp* op) { auto inp_tv = op->in()->as(); auto out_tv = op->out()->as(); @@ -302,7 +289,7 @@ void DynamicTransformConcretizer::concretize() { concretizeResize(); // Finally, propagate concretized domains - auto all_stmts = StmtSort::getStmts(info_.fusion(), false); + auto all_stmts = StmtSort::getStmts(info_.fusion(), true); for (auto stmt : all_stmts) { if (stmt->isA()) { mutate(stmt); @@ -338,7 +325,7 @@ void DynamicTransformConcretizer::concretizeReshape() { void DynamicTransformConcretizer::concretizeResize() { // Concretize each resize op. 
for (const auto& resize_info : info_.getResizeTransforms()) { - auto td = std::get<0>(resize_info); + auto incomplete_out_tv = std::get<0>(resize_info); auto id = std::get<1>(resize_info); auto iter_type = std::get<2>(resize_info); @@ -347,12 +334,46 @@ void DynamicTransformConcretizer::concretizeResize() { // swap in new IterDomain as output of the resize Expr ir_utils::replaceValInExpr(id->definition(), id, new_id); - // replace id with new_id in root domain of output TensorDomain - auto rootd = td->getRootDomain(); - for (auto root_id : td->getRootDomain()) { - if (root_id == id) { - } + // We need to replace the TensorDomain of incomplete_out_tv with one where + // we've replaced id with new_id in the r-factor domain + auto old_rfactor_domain = + incomplete_out_tv->domain()->getMaybeRFactorDomain(); + std::vector new_rfactor_domain(old_rfactor_domain.size()); + for (auto i : c10::irange(old_rfactor_domain.size())) { + new_rfactor_domain[i] = + old_rfactor_domain[i] == id ? new_id : old_rfactor_domain[i]; + std::cout << "new_rfactor_domain[" << i << "] = " << new_rfactor_domain[i] + << std::endl; + } + + auto new_td = IrBuilder::create( + incomplete_out_tv->container(), + incomplete_out_tv->domain()->getRootDomain(), + new_rfactor_domain, + new_rfactor_domain, + incomplete_out_tv->domain()->getContiguityFilledWith( + new_rfactor_domain, true)); + auto new_out_tv = IrBuilder::create( + new_td, incomplete_out_tv->dtype(), incomplete_out_tv->getMemoryType()); + + TORCH_INTERNAL_ASSERT( + incomplete_out_tv->definition(), + "Cannot replace TensorView with resized IterDomain if it has no definition"); + + // This should set the definition of new_out_tv + ir_utils::replaceValInExpr( + incomplete_out_tv->definition(), incomplete_out_tv, new_out_tv); + + // Replace the old tensor with the new concretized tensor + for (auto use_of_old_tv : incomplete_out_tv->uses()) { + ir_utils::replaceValInExpr(use_of_old_tv, incomplete_out_tv, new_out_tv); } + + if 
(incomplete_out_tv->isFusionOutput()) { + incomplete_out_tv->fusion()->replaceOutput(incomplete_out_tv, new_out_tv); + } + + incomplete_out_tv->fusion()->removeVal(incomplete_out_tv); } } diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index be4cee92f02..681a2a0693e 100644 --- a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -32,7 +32,7 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { return reshape_transforms_; } - const std::vector>& + const std::vector>& getResizeTransforms() const { return resize_transforms_; } @@ -64,8 +64,8 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { std::vector> reshape_transforms_; // Holds the resized IterDomain (output of the Resize op) along with the - // TensorDomain where it appears, and the concretized IterType - std::vector> + // TensorView where it appears, and its concretized IterType + std::vector> resize_transforms_; friend class DynamicTransformInfoBuilder; diff --git a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp index fbb7c4a68a4..bc75d708b73 100644 --- a/csrc/kernel_cache.cpp +++ b/csrc/kernel_cache.cpp @@ -389,6 +389,9 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( std::any_cast(data); return orig_conc_info.clone(ir_cloner); }); + + std::cout << "Concretization info: " << conc_info.value().toString() + << std::endl; } // Initialize or fetch vector of FusionKernelRuntime objects associated with @@ -426,6 +429,9 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( // concretize fusion_ for use in this runtime auto fusion = std::make_unique(*fusion_); FusionGuard fg(fusion.get()); + std::cout << "Before concretization: " << std::endl; + fusion->printMath(); + fusion->printTransforms(); if (has_dynamic_reshape_) { const auto& cloned_conc_info = fusion->getManagedSafe( @@ -442,6 +448,9 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( // here, effectively as if it now describes a non-dynamic Fusion. 
// cloned_conc_info.clear(); fusion->stopManaging(conc_info_index); + + std::cout << "\nAfter concretization: " << std::endl; + fusion->printTransforms(); } kernel_runtimes.emplace_back(std::make_unique( std::move(fusion), args, forced_index_type)); diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 05dd52e2c6c..b9d7c5cf9fb 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -747,7 +747,7 @@ void reductionDynamicViewAddFusion( (reshape_before_reduction) ? add(x, bias) : sum(x, {kReductionAxis}); // create vectors of input scalars describing this reshape std::vector output_shape(output_dims); - for (int i : c10::irange(output_dims)) { + for (size_t i : c10::irange(output_dims)) { output_shape[i] = IrBuilder::create(); fusion.addInput(output_shape[i]); } @@ -808,8 +808,8 @@ void reductionDynamicViewAddFusion( at::Tensor at_bias = at::randn(bias_shape, options); std::vector aten_inputs = {at_x, at_bias}; // Add input scalars describing the reshape size for concretization - for (int i : c10::irange(output_dims)) { - aten_inputs.push_back(output_shape[i]); + for (size_t i : c10::irange(output_dims)) { + aten_inputs.emplace_back(output_shape[i]); } auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); @@ -854,8 +854,7 @@ using dynamic_pad_invocation = std::tuple< >; void reductionDynamicPadAddFusion( - std::vector& invocations, - bool pad_before_reduction) { + std::vector& invocations) { constexpr int kReductionAxis = -1; auto input_shape = std::get<0>(invocations[0]); @@ -865,24 +864,17 @@ void reductionDynamicPadAddFusion( Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); - auto bias_dims = - pad_before_reduction ? 
input_shape.size() : input_shape.size() - 1; - // TODO: change symbolic size for padded dimension if start size is 1 TensorView* x = makeSymbolicTensor(input_shape.size()); - TensorView* bias = makeSymbolicTensor(bias_dims); fusion.addInput(x); - fusion.addInput(bias); - auto tv1 = (pad_before_reduction) ? add(x, bias) : sum(x, {kReductionAxis}); std::vector pad_width_vals(pad_widths.size()); for (auto i : c10::irange(pad_widths.size())) { pad_width_vals[i] = IrBuilder::create(); fusion.addInput(pad_width_vals[i]); } - auto x_pad = pad(tv1, pad_width_vals); - auto y = - (pad_before_reduction) ? sum(x_pad, {kReductionAxis}) : add(x_pad, bias); + auto x_pad = pad(x, pad_width_vals); + auto y = sum(x_pad, {kReductionAxis}); fusion.addOutput(y); FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); @@ -916,43 +908,17 @@ void reductionDynamicPadAddFusion( auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::randn(input_shape, options); - std::vector bias_shape(input_shape); - if (!pad_before_reduction) { - // remove last dimension due to reduction - bias_shape.resize(bias_shape.size() - 1); - } - if (!pad_before_reduction) { - // When bias_shape = output_shape, it may contain -1s - // concretize bias_shape so that we can properly initialize at_bias - size_t other_numel = 1; - ssize_t negone_dim = -1; // negative if no -1 shape is provided - for (auto i : c10::irange(bias_shape.size())) { - if (bias_shape[i] == -1) { - ASSERT_EQ(negone_dim, -1); // test cases should not have multiple -1s - negone_dim = -1; - } else { - other_numel *= bias_shape[i]; - } - } - if (negone_dim >= 0) { - bias_shape[negone_dim] = at_x.numel() / other_numel; - } - } - at::Tensor at_bias = at::randn(bias_shape, options); - std::vector aten_inputs = {at_x, at_bias}; + std::vector aten_inputs = {at_x}; // Add input scalars describing the reshape size for concretization - for (int i : c10::irange(pad_widths.size())) { - 
aten_inputs.push_back(pad_widths[i]); + for (size_t i : c10::irange(pad_widths.size())) { + aten_inputs.emplace_back(pad_widths[i]); } auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); checkCache(expect_miss); - auto at_tv1 = (pad_before_reduction) ? (at_x + at_bias) - : at::sum(at_x, kReductionAxis); - auto at_x_reshape = at::native::view(at_tv1, bias_shape); - auto at_y = (pad_before_reduction) ? at::sum(at_x_reshape, kReductionAxis) - : at::add(at_x_reshape, at_bias); + auto at_x_pad = at::pad(at_x, pad_widths); + auto at_y = at::sum(at_x_pad, kReductionAxis); testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); } @@ -978,7 +944,7 @@ TEST_F(NVFuserTest, DynamicPadShmoo_CUDA) { //{{8, 3 * 5, 7, 9}, {8, 3, -1, 9}, false} // merge(1) osplit(1, 3) */ }; - reductionDynamicPadAddFusion(invocations, true /* pad_before_reduction */); + reductionDynamicPadAddFusion(invocations); } } // namespace nvfuser From cdb0f272c2f5184f10d14b1be450a89af918a8d7 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 1 May 2023 12:50:39 -0400 Subject: [PATCH 11/49] Remove replaced vals after resize concretization --- csrc/dynamic_transform.cpp | 23 +++++++++++++---------- csrc/kernel_cache.cpp | 9 --------- test/test_dynamic_transform.cpp | 20 +++++++++++++------- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 7a00cfd5102..00fe7c0b277 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -335,22 +335,23 @@ void DynamicTransformConcretizer::concretizeResize() { ir_utils::replaceValInExpr(id->definition(), id, new_id); // We need to replace the TensorDomain of incomplete_out_tv with one where - // we've replaced id with new_id in the r-factor domain - auto old_rfactor_domain = - incomplete_out_tv->domain()->getMaybeRFactorDomain(); - std::vector new_rfactor_domain(old_rfactor_domain.size()); - for (auto i : c10::irange(old_rfactor_domain.size())) { 
- new_rfactor_domain[i] = - old_rfactor_domain[i] == id ? new_id : old_rfactor_domain[i]; - std::cout << "new_rfactor_domain[" << i << "] = " << new_rfactor_domain[i] - << std::endl; + // we've replaced id with new_id in the r-factor domain. + std::vector new_rfactor_domain; + if (incomplete_out_tv->domain()->hasRFactor()) { + auto old_rfactor_domain = + incomplete_out_tv->domain()->getMaybeRFactorDomain(); + new_rfactor_domain.resize(old_rfactor_domain.size()); + for (auto i : c10::irange(old_rfactor_domain.size())) { + new_rfactor_domain[i] = + old_rfactor_domain[i] == id ? new_id : old_rfactor_domain[i]; + } } auto new_td = IrBuilder::create( incomplete_out_tv->container(), incomplete_out_tv->domain()->getRootDomain(), new_rfactor_domain, - new_rfactor_domain, + new_rfactor_domain, // TODO: add check that we don't have leaf transforms incomplete_out_tv->domain()->getContiguityFilledWith( new_rfactor_domain, true)); auto new_out_tv = IrBuilder::create( @@ -373,7 +374,9 @@ void DynamicTransformConcretizer::concretizeResize() { incomplete_out_tv->fusion()->replaceOutput(incomplete_out_tv, new_out_tv); } + incomplete_out_tv->fusion()->removeVal(incomplete_out_tv->domain()); incomplete_out_tv->fusion()->removeVal(incomplete_out_tv); + incomplete_out_tv->fusion()->removeVal(id); } } diff --git a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp index bc75d708b73..fbb7c4a68a4 100644 --- a/csrc/kernel_cache.cpp +++ b/csrc/kernel_cache.cpp @@ -389,9 +389,6 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( std::any_cast(data); return orig_conc_info.clone(ir_cloner); }); - - std::cout << "Concretization info: " << conc_info.value().toString() - << std::endl; } // Initialize or fetch vector of FusionKernelRuntime objects associated with @@ -429,9 +426,6 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( // concretize fusion_ for use in this runtime auto fusion = std::make_unique(*fusion_); FusionGuard fg(fusion.get()); - std::cout << "Before 
concretization: " << std::endl; - fusion->printMath(); - fusion->printTransforms(); if (has_dynamic_reshape_) { const auto& cloned_conc_info = fusion->getManagedSafe( @@ -448,9 +442,6 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( // here, effectively as if it now describes a non-dynamic Fusion. // cloned_conc_info.clear(); fusion->stopManaging(conc_info_index); - - std::cout << "\nAfter concretization: " << std::endl; - fusion->printTransforms(); } kernel_runtimes.emplace_back(std::make_unique( std::move(fusion), args, forced_index_type)); diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index b9d7c5cf9fb..ea215b84895 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -784,6 +784,9 @@ void reductionDynamicViewAddFusion( auto output_shape = std::get<1>(inv); auto expect_miss = std::get<2>(inv); + TORCH_INTERNAL_ASSERT(input_shape.size() == input_dims); + TORCH_INTERNAL_ASSERT(output_shape.size() == output_dims); + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::randn(input_shape, options); @@ -857,19 +860,19 @@ void reductionDynamicPadAddFusion( std::vector& invocations) { constexpr int kReductionAxis = -1; - auto input_shape = std::get<0>(invocations[0]); - auto pad_widths = std::get<1>(invocations[0]); + auto input_dims = std::get<0>(invocations[0]).size(); + auto num_pad_widths = std::get<1>(invocations[0]).size(); std::unique_ptr fusion_ptr = std::make_unique(); Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); // TODO: change symbolic size for padded dimension if start size is 1 - TensorView* x = makeSymbolicTensor(input_shape.size()); + TensorView* x = makeSymbolicTensor(input_dims); fusion.addInput(x); - std::vector pad_width_vals(pad_widths.size()); - for (auto i : c10::irange(pad_widths.size())) { + std::vector pad_width_vals(num_pad_widths); + for (auto i : c10::irange(num_pad_widths)) { pad_width_vals[i] = 
IrBuilder::create(); fusion.addInput(pad_width_vals[i]); } @@ -901,10 +904,13 @@ void reductionDynamicPadAddFusion( }; for (auto& inv : invocations) { - auto pad_widths = std::get<0>(inv); - auto start_extent = std::get<1>(inv); + auto input_shape = std::get<0>(inv); + auto pad_widths = std::get<1>(inv); auto expect_miss = std::get<2>(inv); + TORCH_INTERNAL_ASSERT(input_shape.size() == input_dims); + TORCH_INTERNAL_ASSERT(pad_widths.size() == num_pad_widths); + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::randn(input_shape, options); From 6dbdbb1f8fc489e06de89c2b27052dc9ab875b18 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 1 May 2023 13:11:12 -0400 Subject: [PATCH 12/49] Add more cases to pad shmoo test --- test/test_dynamic_transform.cpp | 41 +++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index ea215b84895..9df69026594 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -934,21 +934,32 @@ void reductionDynamicPadAddFusion( TEST_F(NVFuserTest, DynamicPadShmoo_CUDA) { auto invocations = std::vector{ {{3, 5}, {0, 0}, true}, // trivial - //{{-1, 1}, 5, false}, // shift by one. 
Re-uses Iteration - /* - {{8, 3 * 4, 7, 9}, {8, 3 * 4, 7, 9}, true}, // trivial - {{8, 3 * 4, 7, 5}, {8, 3 * 4, 7, 5}, false}, // trivial - {{8, 3 * 4, 7, 9}, {8, 3, 4, 7 * 9}, true}, // merge(2) osplit(1, 3) - {{8, 3 * 4, 7, 9}, - {8, 3, 4 * 7, 9}, - true}, // merge(1) merge(2) osplit(1, 3) - {{8, 3 * 4, 7, 5}, - {8, 3, 4 * 7, 5}, - false}, // merge(1) merge(2) osplit(1, 3) - {{8, 3 * 5, 7, 9}, {8, 3, 5 * 7, 9}, false}, // merge(1) osplit(1, 3) - // test passing -1 dynamically for dimension size - //{{8, 3 * 5, 7, 9}, {8, 3, -1, 9}, false} // merge(1) osplit(1, 3) - */ + + {{3, 5}, {2, 1}, false}, // simple pad of both sides + {{3, 5}, {-1, 1}, false}, // shift by one + // TODO: The following fails with a SIGFPE in innerReductionHeuristic + //{{3, 5}, {-3, -2}, false}, // output is zero-dimensional + + // Output has size 1 so is set to broadcast + // Currently fails since: IterDomain cannot be both a broadcast and + // rfactor domain. Exception raised from IterDomain at + // /opt/pytorch/nvfuser/csrc/ir_nodes.cpp:2080 + //{{3, 5}, {0, -4}, true}, + + // Test full negative shifts, so output doesn't overlap input + {{3, 5}, + {-5, 2}, + false}, // TODO: why doesn't this miss due to concretize to broadcast? + {{3, 5}, {2, -5}, false}, // full shift the other direction, re-use + + // The following reuses the schedule of {3, 5} inputs, and does not set + // broadcast on the second input dimension. 
+ {{3, 1}, {1, 1}, false}, + + // Test zero-dimensional input + //{{3, 0}, {0, 0}, false}, // TODO: SIGFPE (see above) + {{3, 0}, {1, 1}, false}, + //{{3, 0}, {-1, 1}, false}, // TODO: SIGFPE (see above) }; reductionDynamicPadAddFusion(invocations); } From c663ece6ac164e096ef95e005d0f4c728c249124 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 1 May 2023 14:53:15 -0400 Subject: [PATCH 13/49] Re-use IterType computation in concretization --- csrc/dynamic_transform.cpp | 11 +++++++---- csrc/dynamic_transform.h | 23 +++++++++++++++++++++++ csrc/ir_nodes.cpp | 14 ++------------ 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 00fe7c0b277..e321268105e 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -158,9 +158,11 @@ void DynamicTransformInfoBuilder::handle(TensorView* tv) { "Cannot evaluate the right expansion of an IterDomain resize: ", right->toString()); - auto out_itertype = out_extent_val->as() == 1 - ? IterType::Broadcast - : IterType::Iteration; + auto out_itertype = resize_output_itertype( + in_extent_val->as(), + out_extent_val->as(), + left_val->as(), + right_val->as()); info_.resize_transforms_.emplace_back(tv, id, out_itertype); } @@ -351,7 +353,8 @@ void DynamicTransformConcretizer::concretizeResize() { incomplete_out_tv->container(), incomplete_out_tv->domain()->getRootDomain(), new_rfactor_domain, - new_rfactor_domain, // TODO: add check that we don't have leaf transforms + new_rfactor_domain, // TODO: add check that we don't have leaf + // transforms incomplete_out_tv->domain()->getContiguityFilledWith( new_rfactor_domain, true)); auto new_out_tv = IrBuilder::create( diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index 681a2a0693e..851bc8d339e 100644 --- a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -23,6 +23,29 @@ class Fusion; class ExpressionEvaluator; class DynamicTransformInfoBuilder; +//! 
Compute the IterType of an IterDomain that has been resized. If the output +//! is size 1, or the output uses no input elements, this function returns +//! Broadcast. Otherwise, it returns Iteration. +inline IterType resize_output_itertype( + int64_t in_extent, + int64_t out_extent, + int64_t left, + int64_t right) { + TORCH_CHECK(out_extent >= 0, "Resized extent must be non-negative."); + if ( + // negative padding sums to input extent. Output is zero-dimensional + out_extent == 0 || + // input overlaps output + left + in_extent > 0 || right + in_extent > 0) { + return IterType::Iteration; + } else { + // Result is size-1 or input doesn't overlap output. + // In these cases, the output is just a broadcast of either the used input + // value, or the pad value. + return IterType::Broadcast; + } +} + //! A set of transformations for a symbolic fusion with concrete sizes //! of the fusion inputs class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { diff --git a/csrc/ir_nodes.cpp b/csrc/ir_nodes.cpp index fc34b567367..f96ca198188 100644 --- a/csrc/ir_nodes.cpp +++ b/csrc/ir_nodes.cpp @@ -6,6 +6,7 @@ */ // clang-format on #include +#include #include #include #include @@ -2484,18 +2485,7 @@ IterDomain* IterDomain::resize( auto out_extent = resized_id_size->getInt().value(); auto left = left_expansion->getInt().value(); auto right = right_expansion->getInt().value(); - TORCH_CHECK(out_extent >= 0, "Resized extent must be non-negative."); - if ( - // negative padding sums to input extent. 
Output is zero-dimensional - out_extent == 0 || - // input overlaps output - left + in_extent > 0 || right + in_extent > 0) { - iter_type = IterType::Iteration; - } else { - // Result is zero-dimensional, broadcast, or input doesn't overlap output - // In these cases, the output is just the broadcasted pad value - iter_type = IterType::Broadcast; - } + iter_type = resize_output_itertype(in_extent, out_extent, left, right); } else { iter_type = IterType::Symbolic; } From 4efc74cfae0665617ea211ccfc663a0dfd24157b Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 2 May 2023 13:36:12 -0400 Subject: [PATCH 14/49] Add zero-element reduction test --- test/test_gpu3.cpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/test/test_gpu3.cpp b/test/test_gpu3.cpp index 432d54d6f9a..1c8756d1214 100644 --- a/test/test_gpu3.cpp +++ b/test/test_gpu3.cpp @@ -8274,6 +8274,32 @@ TEST_F(NVFuserTest, FusionClearGmemBetweenSegments_CUDA) { testValidate( executor_cache.fusion(), outputs, {at_x}, {t4}, __LINE__, __FILE__); } + +// Test that 0-dimensional tensors do not break reduction scheduler +TEST_F(NVFuserTest, FusionReduceZeroElementTensor_CUDA) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + std::vector input_shape{3, 4, 0, 5}; + auto tv0 = makeSymbolicTensor(4); + fusion->addInput(tv0); + auto tv1 = sum(tv0, {1}); + fusion->addOutput(tv1); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_x = at::randn(input_shape, options); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs({at_x}); + auto t2 = at_x.sum({2}); + + auto reduction_params = getReductionHeuristics(fusion.get(), {at_x}); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + scheduleReduction(fusion.get(), *reduction_params); + + testValidate( + executor_cache.fusion(), outputs, {at_x}, {t2}, __LINE__, __FILE__); +} + // Test file size should 
be up to 10K LoC. Create a new file for more tests. } // namespace nvfuser From 81a6f109def065df7c76cf95b95336b9053663b8 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 2 May 2023 13:36:40 -0400 Subject: [PATCH 15/49] Short-circuit outerReductionHeuristic on numel==0 --- csrc/scheduler/reduction.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/csrc/scheduler/reduction.cpp b/csrc/scheduler/reduction.cpp index 6986b2f7628..643ac83ff3c 100644 --- a/csrc/scheduler/reduction.cpp +++ b/csrc/scheduler/reduction.cpp @@ -516,6 +516,25 @@ std::shared_ptr outerReductionHeuristic( const int64_t n_tensor_inputs, const int64_t max_input_dtype_size, const size_t vectorize_factor) { + if (total_reduction_numel + total_iteration_numel == 0) { + // Number of elements is zero + + if (isDebugDumpEnabled(DebugDumpOption::SchedulerDebug)) { + auto rparams = std::make_unique(); + std::cerr << "\nNumber of elements in reduction input buffer is zero\n" + << std::endl; + std::cerr << "\n===== Reduction Stats ========\n" + << "total_reduction_numel: " << total_reduction_numel << "\n" + << "total_iteration_numel: " << total_iteration_numel << "\n" + << "vectorize_factor: " << vectorize_factor << "\n" + << "n_tensor_inputs: " << n_tensor_inputs << "\n" + << "max_input_dtype_size: " << max_input_dtype_size << "\n" + << "block(" << rparams->lparams.bdimx() << ", " + << rparams->lparams.bdimy() << ", 1)" << std::endl; + std::cerr << rparams->toString() << std::endl; + return std::move(rparams); + } + } // WARNING: Current device for codegen may not be the target device const int64_t device_max_threads_per_multiprocessor = (int64_t)at::cuda::getCurrentDeviceProperties() From 4174a63b50179cbc63b32e2d0ad4e16a159f375a Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 2 May 2023 14:08:22 -0400 Subject: [PATCH 16/49] Short-circuit if numel of input is zero i.e. if total_reduction_numel OR total_iteration_numel is zero. 
In these cases we just return the default ReductionParams, which will launch a single block. --- csrc/scheduler/reduction.cpp | 15 ++++++++++----- test/test_gpu3.cpp | 37 +++++++++++++++++++----------------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/csrc/scheduler/reduction.cpp b/csrc/scheduler/reduction.cpp index 643ac83ff3c..0769fc2d655 100644 --- a/csrc/scheduler/reduction.cpp +++ b/csrc/scheduler/reduction.cpp @@ -516,11 +516,10 @@ std::shared_ptr outerReductionHeuristic( const int64_t n_tensor_inputs, const int64_t max_input_dtype_size, const size_t vectorize_factor) { - if (total_reduction_numel + total_iteration_numel == 0) { - // Number of elements is zero - + if (total_reduction_numel == 0 || total_iteration_numel == 0) { + // Number of elements in input is zero + auto rparams = std::make_unique(); if (isDebugDumpEnabled(DebugDumpOption::SchedulerDebug)) { - auto rparams = std::make_unique(); std::cerr << "\nNumber of elements in reduction input buffer is zero\n" << std::endl; std::cerr << "\n===== Reduction Stats ========\n" @@ -532,9 +531,15 @@ std::shared_ptr outerReductionHeuristic( << "block(" << rparams->lparams.bdimx() << ", " << rparams->lparams.bdimy() << ", 1)" << std::endl; std::cerr << rparams->toString() << std::endl; - return std::move(rparams); } + return std::move(rparams); } + std::cerr << "\n===== Reduction Stats ========\n" + << "total_reduction_numel: " << total_reduction_numel << "\n" + << "total_iteration_numel: " << total_iteration_numel << "\n" + << "vectorize_factor: " << vectorize_factor << "\n" + << "n_tensor_inputs: " << n_tensor_inputs << "\n" + << "max_input_dtype_size: " << max_input_dtype_size << std::endl; // WARNING: Current device for codegen may not be the target device const int64_t device_max_threads_per_multiprocessor = (int64_t)at::cuda::getCurrentDeviceProperties() diff --git a/test/test_gpu3.cpp b/test/test_gpu3.cpp index 1c8756d1214..232354a029b 100644 --- a/test/test_gpu3.cpp +++ 
b/test/test_gpu3.cpp @@ -8277,27 +8277,30 @@ TEST_F(NVFuserTest, FusionClearGmemBetweenSegments_CUDA) { // Test that 0-dimensional tensors do not break reduction scheduler TEST_F(NVFuserTest, FusionReduceZeroElementTensor_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); + for (int reduction_dim : {1, 2}) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); - std::vector input_shape{3, 4, 0, 5}; - auto tv0 = makeSymbolicTensor(4); - fusion->addInput(tv0); - auto tv1 = sum(tv0, {1}); - fusion->addOutput(tv1); + std::vector input_shape{3, 4, 0, 5}; + auto tv0 = makeSymbolicTensor(4); + fusion->addInput(tv0); + auto tv1 = sum(tv0, {reduction_dim}); + fusion->addOutput(tv1); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - FusionExecutorCache executor_cache(std::move(fusion)); - auto outputs = executor_cache.runFusionWithInputs({at_x}); - auto t2 = at_x.sum({2}); + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_x = at::randn(input_shape, options); + auto t2 = at_x.sum({reduction_dim}); - auto reduction_params = getReductionHeuristics(fusion.get(), {at_x}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(fusion.get(), *reduction_params); + auto reduction_params = getReductionHeuristics(fusion.get(), {at_x}); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + scheduleReduction(fusion.get(), *reduction_params); - testValidate( - executor_cache.fusion(), outputs, {at_x}, {t2}, __LINE__, __FILE__); + FusionExecutor fe; + fe.compileFusion(fusion.get(), {at_x}); + auto cg_outputs = fe.runFusion({at_x}); + + testValidate(fusion.get(), cg_outputs, {at_x}, {t2}, __LINE__, __FILE__); + } } // Test file size should be up to 10K LoC. Create a new file for more tests. 
From 37fc7ce776c2769fa97efcdb6d12496a8c7a543c Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 2 May 2023 14:22:04 -0400 Subject: [PATCH 17/49] Also guard innerReductionHeuristic, update test --- csrc/scheduler/reduction.cpp | 25 ++++++++++++++++++++++--- test/test_gpu3.cpp | 2 +- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/csrc/scheduler/reduction.cpp b/csrc/scheduler/reduction.cpp index 0769fc2d655..4c27f3b24ef 100644 --- a/csrc/scheduler/reduction.cpp +++ b/csrc/scheduler/reduction.cpp @@ -77,6 +77,25 @@ std::shared_ptr innerReductionHeuristic( const int64_t n_elems = total_reduction_numel * total_iteration_numel; + if (n_elems == 0) { + // Number of elements in input is zero + auto rparams = std::make_unique(); + if (isDebugDumpEnabled(DebugDumpOption::SchedulerDebug)) { + std::cerr << "\nNumber of elements in reduction input buffer is zero\n" + << std::endl; + std::cerr << "\n===== Reduction Stats ========\n" + << "total_reduction_numel: " << total_reduction_numel << "\n" + << "total_iteration_numel: " << total_iteration_numel << "\n" + << "vectorize_factor: " << vectorize_factor << "\n" + << "n_tensor_inputs: " << n_tensor_inputs << "\n" + << "max_input_dtype_size: " << max_input_dtype_size << "\n" + << "block(" << rparams->lparams.bdimx() << ", " + << rparams->lparams.bdimy() << ", 1)" << std::endl; + std::cerr << rparams->toString() << std::endl; + } + return std::move(rparams); + } + // WARNING: At some point we may want to generate heuristics for another // device that is not the current device. 
const int64_t device_max_threads_per_multiprocessor = @@ -516,7 +535,9 @@ std::shared_ptr outerReductionHeuristic( const int64_t n_tensor_inputs, const int64_t max_input_dtype_size, const size_t vectorize_factor) { - if (total_reduction_numel == 0 || total_iteration_numel == 0) { + const int64_t n_elems = total_reduction_numel * total_iteration_numel; + + if (n_elems == 0) { // Number of elements in input is zero auto rparams = std::make_unique(); if (isDebugDumpEnabled(DebugDumpOption::SchedulerDebug)) { @@ -554,8 +575,6 @@ std::shared_ptr outerReductionHeuristic( // Reduce unrolling if we have many inputs, start reduction at 4 inputs scheduler_utils::lastPow2( std::max((int64_t)n_tensor_inputs >> 2, (int64_t)1))); - - const int64_t n_elems = total_reduction_numel * total_iteration_numel; const int64_t n_waves = 8; // if data fits in l2 and we need more parallelization in the iter dim, diff --git a/test/test_gpu3.cpp b/test/test_gpu3.cpp index 232354a029b..b6ec836c75d 100644 --- a/test/test_gpu3.cpp +++ b/test/test_gpu3.cpp @@ -8277,7 +8277,7 @@ TEST_F(NVFuserTest, FusionClearGmemBetweenSegments_CUDA) { // Test that 0-dimensional tensors do not break reduction scheduler TEST_F(NVFuserTest, FusionReduceZeroElementTensor_CUDA) { - for (int reduction_dim : {1, 2}) { + for (int reduction_dim : {0, 1, 2, 3}) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); From 4e2c170956e4cab410f8bdde6cf7f05980e5816b Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 3 May 2023 12:55:14 -0400 Subject: [PATCH 18/49] Add more test cases for zero-element reduction This also checks that we do not parallelize in this case. 
--- test/test_gpu3.cpp | 68 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/test/test_gpu3.cpp b/test/test_gpu3.cpp index b6ec836c75d..48b1c544303 100644 --- a/test/test_gpu3.cpp +++ b/test/test_gpu3.cpp @@ -8276,30 +8276,56 @@ TEST_F(NVFuserTest, FusionClearGmemBetweenSegments_CUDA) { } // Test that 0-dimensional tensors do not break reduction scheduler -TEST_F(NVFuserTest, FusionReduceZeroElementTensor_CUDA) { - for (int reduction_dim : {0, 1, 2, 3}) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - std::vector input_shape{3, 4, 0, 5}; - auto tv0 = makeSymbolicTensor(4); - fusion->addInput(tv0); - auto tv1 = sum(tv0, {reduction_dim}); - fusion->addOutput(tv1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - auto t2 = at_x.sum({reduction_dim}); +TEST_F(NVFuserTest, FusionScheduleReduceZeroElementTensor_CUDA) { + for (auto input_shape : std::vector>{ + {3, 4, 0, 5}, // Warp-reduce in all dim pairs (ignoring zero) + {33, 40, 0, 50}, // Require block reduction (ignoring zero) + {300, 400, 0, 500}, // Require grid reduction (ignoring zero) + }) { + for (auto reduction_dims : std::vector>{ + {0}, // outermost only + {3}, // innermost only + {2}, // only zero-dim + {1, 2}, // zero-dim and non-zero + {2, 3}, // zero-dim and non-zero (innermost) + }) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + auto tv0 = makeSymbolicTensor(4); + fusion->addInput(tv0); + auto tv1 = sum(tv0, reduction_dims); + fusion->addOutput(tv1); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_x = at::randn( + std::vector(input_shape.begin(), input_shape.end()), + options); + auto t2 = at_x.sum( + std::vector(reduction_dims.begin(), reduction_dims.end())); + + auto reduction_params = getReductionHeuristics(fusion.get(), {at_x}); + 
TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + scheduleReduction(fusion.get(), *reduction_params); - auto reduction_params = getReductionHeuristics(fusion.get(), {at_x}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(fusion.get(), *reduction_params); + FusionExecutor fe; + fe.compileFusion(fusion.get(), {at_x}); + auto cg_outputs = fe.runFusion({at_x}); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {at_x}); - auto cg_outputs = fe.runFusion({at_x}); + testValidate(fusion.get(), cg_outputs, {at_x}, {t2}, __LINE__, __FILE__); - testValidate(fusion.get(), cg_outputs, {at_x}, {t2}, __LINE__, __FILE__); + // verify that the scheduler does not parallelize any IterDomains + for (auto tv : ir_utils::allTvs(fusion.get())) { + for (auto id : tv->domain()->leaf()) { + TORCH_CHECK( + id->getParallelType() == ParallelType::Serial || + id->getParallelType() == ParallelType::Unswitch || + id->getParallelType() == ParallelType::Unroll, + "No IterDomains should be parallelized in zero-element reduction but found ", + id->toString()); + } + } + } } } From bf87c191924ce2ed60d4a8c7b0483b1443818152 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 9 May 2023 19:28:52 -0400 Subject: [PATCH 19/49] Move resizeOutputIterType to ir_utils --- csrc/dynamic_transform.cpp | 2 +- csrc/dynamic_transform.h | 23 ----------------------- csrc/ir_nodes.cpp | 3 ++- csrc/ir_utils.h | 23 +++++++++++++++++++++++ 4 files changed, 26 insertions(+), 25 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index e321268105e..3101e6efc4a 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -158,7 +158,7 @@ void DynamicTransformInfoBuilder::handle(TensorView* tv) { "Cannot evaluate the right expansion of an IterDomain resize: ", right->toString()); - auto out_itertype = resize_output_itertype( + auto out_itertype = resizeOutputItertype( in_extent_val->as(), 
out_extent_val->as(), left_val->as(), diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index 851bc8d339e..681a2a0693e 100644 --- a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -23,29 +23,6 @@ class Fusion; class ExpressionEvaluator; class DynamicTransformInfoBuilder; -//! Compute the IterType of an IterDomain that has been resized. If the output -//! is size 1, or the output uses no input elements, this function returns -//! Broadcast. Otherwise, it returns Iteration. -inline IterType resize_output_itertype( - int64_t in_extent, - int64_t out_extent, - int64_t left, - int64_t right) { - TORCH_CHECK(out_extent >= 0, "Resized extent must be non-negative."); - if ( - // negative padding sums to input extent. Output is zero-dimensional - out_extent == 0 || - // input overlaps output - left + in_extent > 0 || right + in_extent > 0) { - return IterType::Iteration; - } else { - // Result is size-1 or input doesn't overlap output. - // In these cases, the output is just a broadcast of either the used input - // value, or the pad value. - return IterType::Broadcast; - } -} - //! A set of transformations for a symbolic fusion with concrete sizes //! 
of the fusion inputs class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { diff --git a/csrc/ir_nodes.cpp b/csrc/ir_nodes.cpp index 831cb9fa4a8..e4752ef72cb 100644 --- a/csrc/ir_nodes.cpp +++ b/csrc/ir_nodes.cpp @@ -2501,7 +2501,8 @@ IterDomain* IterDomain::resize( auto out_extent = resized_id_size->getInt().value(); auto left = left_expansion->getInt().value(); auto right = right_expansion->getInt().value(); - iter_type = resize_output_itertype(in_extent, out_extent, left, right); + iter_type = + ir_utils::resizeOutputIterType(in_extent, out_extent, left, right); } else { iter_type = IterType::Symbolic; } diff --git a/csrc/ir_utils.h b/csrc/ir_utils.h index 31c8e7edad3..0516b1a04b1 100644 --- a/csrc/ir_utils.h +++ b/csrc/ir_utils.h @@ -424,5 +424,28 @@ void validateDomainEquivalence( const std::vector& initial_domain, const std::vector& derived_domain); +//! Compute the IterType of an IterDomain that has been resized. If the output +//! is size 1, or the output uses no input elements, this function returns +//! Broadcast. Otherwise, it returns Iteration. +inline IterType resizeOutputIterType( + int64_t in_extent, + int64_t out_extent, + int64_t left, + int64_t right) { + TORCH_CHECK(out_extent >= 0, "Resized extent must be non-negative."); + if ( + // negative padding sums to input extent. Output is zero-dimensional + out_extent == 0 || + // input overlaps output + left + in_extent > 0 || right + in_extent > 0) { + return IterType::Iteration; + } else { + // Result is size-1 or input doesn't overlap output. + // In these cases, the output is just a broadcast of either the used input + // value, or the pad value. 
+ return IterType::Broadcast; + } +} + } // namespace ir_utils } // namespace nvfuser From 3371303112f48d1eb75102b1647a123042508c36 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle <1454944+jacobhinkle@users.noreply.github.com> Date: Tue, 9 May 2023 19:40:45 -0400 Subject: [PATCH 20/49] Use initializer-if in csrc/dynamic_transform.cpp Co-authored-by: Naoya Maruyama --- csrc/dynamic_transform.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 3101e6efc4a..6d0b3dc3793 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -126,9 +126,7 @@ std::string DynamicTransformConcretizationInfo::toString() const { void DynamicTransformInfoBuilder::handle(TensorView* tv) { auto rfd = tv->domain()->getMaybeRFactorDomain(); for (auto id : rfd) { - if (id->getIterType() == IterType::Symbolic && id->definition()) { - auto def = id->definition(); - if (def->isA()) { + if (auto op = dynamic_cast(id->definition()); id->getIterType() == IterType::Symbolic && op != nullptr) { auto op = def->as(); auto out_extent_val = expr_eval_->evaluate(id->extent()); From d5f521389438eebdcb28b01b22597c98608c13d5 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 10 May 2023 10:09:37 -0400 Subject: [PATCH 21/49] Add iter_type option to resize(), other fixes --- csrc/dynamic_transform.cpp | 209 +++++++++++++++---------------------- csrc/ir_internal_nodes.h | 3 +- csrc/ir_nodes.cpp | 19 ++-- 3 files changed, 97 insertions(+), 134 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 3101e6efc4a..61cc0715715 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -124,7 +124,7 @@ std::string DynamicTransformConcretizationInfo::toString() const { } void DynamicTransformInfoBuilder::handle(TensorView* tv) { - auto rfd = tv->domain()->getMaybeRFactorDomain(); + const auto& rfd = tv->domain()->getMaybeRFactorDomain(); for (auto id : rfd) { if 
(id->getIterType() == IterType::Symbolic && id->definition()) { auto def = id->definition(); @@ -158,7 +158,7 @@ void DynamicTransformInfoBuilder::handle(TensorView* tv) { "Cannot evaluate the right expansion of an IterDomain resize: ", right->toString()); - auto out_itertype = resizeOutputItertype( + auto out_itertype = ir_utils::resizeOutputIterType( in_extent_val->as(), out_extent_val->as(), left_val->as(), @@ -327,59 +327,21 @@ void DynamicTransformConcretizer::concretizeReshape() { void DynamicTransformConcretizer::concretizeResize() { // Concretize each resize op. for (const auto& resize_info : info_.getResizeTransforms()) { - auto incomplete_out_tv = std::get<0>(resize_info); auto id = std::get<1>(resize_info); auto iter_type = std::get<2>(resize_info); - auto new_id = IterDomainBuilder(id).iter_type(iter_type).build(); - - // swap in new IterDomain as output of the resize Expr - ir_utils::replaceValInExpr(id->definition(), id, new_id); - - // We need to replace the TensorDomain of incomplete_out_tv with one where - // we've replaced id with new_id in the r-factor domain. - std::vector new_rfactor_domain; - if (incomplete_out_tv->domain()->hasRFactor()) { - auto old_rfactor_domain = - incomplete_out_tv->domain()->getMaybeRFactorDomain(); - new_rfactor_domain.resize(old_rfactor_domain.size()); - for (auto i : c10::irange(old_rfactor_domain.size())) { - new_rfactor_domain[i] = - old_rfactor_domain[i] == id ? 
new_id : old_rfactor_domain[i]; - } - } - - auto new_td = IrBuilder::create( - incomplete_out_tv->container(), - incomplete_out_tv->domain()->getRootDomain(), - new_rfactor_domain, - new_rfactor_domain, // TODO: add check that we don't have leaf - // transforms - incomplete_out_tv->domain()->getContiguityFilledWith( - new_rfactor_domain, true)); - auto new_out_tv = IrBuilder::create( - new_td, incomplete_out_tv->dtype(), incomplete_out_tv->getMemoryType()); - - TORCH_INTERNAL_ASSERT( - incomplete_out_tv->definition(), - "Cannot replace TensorView with resized IterDomain if it has no definition"); - - // This should set the definition of new_out_tv - ir_utils::replaceValInExpr( - incomplete_out_tv->definition(), incomplete_out_tv, new_out_tv); - - // Replace the old tensor with the new concretized tensor - for (auto use_of_old_tv : incomplete_out_tv->uses()) { - ir_utils::replaceValInExpr(use_of_old_tv, incomplete_out_tv, new_out_tv); - } - - if (incomplete_out_tv->isFusionOutput()) { - incomplete_out_tv->fusion()->replaceOutput(incomplete_out_tv, new_out_tv); - } - - incomplete_out_tv->fusion()->removeVal(incomplete_out_tv->domain()); - incomplete_out_tv->fusion()->removeVal(incomplete_out_tv); - incomplete_out_tv->fusion()->removeVal(id); + TORCH_CHECK( + id->definition() && id->definition()->isA(), + "Resized IterDomain must have a Resize definition"); + auto def = id->definition()->as(); + auto new_id = IterDomain::resize( + def->in(), + def->leftExpand(), + def->rightExpand(), + id->isRFactorProduct(), + iter_type); + + registerMutation(id, new_id); } } @@ -396,80 +358,81 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { // axes inherited from the producers auto propagated = propagateFromProducerToConsumer(tv); - // If no root domain is altered, nothing to do further - if (!propagated) { - return; - } + if (propagated) { + // Root IDs are altered. Need to propagate the changes to rfactor + // domain - // Root IDs are altered. 
Need to propagate the changes to rfactor - // domain - - // At this point, there should be no expr beyond rfactor root - TORCH_INTERNAL_ASSERT( - tv->domain()->leaf() == tv->getMaybeRFactorDomain(), - "Invalid tensor: ", - tv->toString()); - - // If it has an rfactor root domain, the IterTypes of the rfactor - // IDs may need to be updated as well. Traverse the rfactor exprs - // and mutate the IterTypes of output IDs if symbolic. - if (tv->hasRFactor()) { - // Note that it is assumed that theres's no further expression - // beyond the rfactor domain as asserted above - auto all_id_exprs = StmtSort::getExprsBetween( - tv->fusion(), - {tv->getRootDomain().begin(), tv->getRootDomain().end()}, - {tv->getMaybeRFactorDomain().begin(), - tv->getMaybeRFactorDomain().end()}); - for (auto expr : all_id_exprs) { - // Assume outputs of IterDomain exprs are always IterDomains. If - // the assumption is invalidated, the logic here would need to - // be updated. Assert the assumption to immediately detect such - // a case if happened. - for (auto out_val : expr->outputs()) { + // At this point, there should be no expr beyond rfactor root + TORCH_INTERNAL_ASSERT( + tv->domain()->leaf() == tv->getMaybeRFactorDomain(), + "Invalid tensor: ", + tv->toString()); + + // If it has an rfactor root domain, the IterTypes of the rfactor + // IDs may need to be updated as well. Traverse the rfactor exprs + // and mutate the IterTypes of output IDs if symbolic. + if (tv->hasRFactor()) { + // Note that it is assumed that theres's no further expression + // beyond the rfactor domain as asserted above + auto all_id_exprs = StmtSort::getExprsBetween( + tv->fusion(), + {tv->getRootDomain().begin(), tv->getRootDomain().end()}, + {tv->getMaybeRFactorDomain().begin(), + tv->getMaybeRFactorDomain().end()}); + for (auto expr : all_id_exprs) { + // Assume outputs of IterDomain exprs are always IterDomains. If + // the assumption is invalidated, the logic here would need to + // be updated. 
Assert the assumption to immediately detect such + // a case if happened. + for (auto out_val : expr->outputs()) { + TORCH_INTERNAL_ASSERT( + out_val->isA(), + "Unexpected output: ", + out_val->toString(), + ". IterDomain was expected."); + } + + // If none of the output IDs is symbolic, nothing to concretize + if (std::all_of( + expr->outputs().begin(), + expr->outputs().end(), + [](Val* output) { + return output->as()->getIterType() != + IterType::Symbolic; + })) { + continue; + } + // If any of output IDs is symbolic, all outputs should be symbolic + TORCH_INTERNAL_ASSERT(std::all_of( + expr->outputs().begin(), expr->outputs().end(), [](Val* output) { + return output->as()->getIterType() == + IterType::Symbolic; + })); + + // Determine the output IterType + IterType iter_type = IterType::Symbolic; + for (auto inp_id : ir_utils::filterByType(expr->inputs())) { + auto updated_id = maybeMutated(inp_id)->as(); + iter_type = + ops::promoteIterType(iter_type, updated_id->getIterType()); + } TORCH_INTERNAL_ASSERT( - out_val->isA(), - "Unexpected output: ", - out_val->toString(), - ". IterDomain was expected."); + iter_type != IterType::Symbolic, + "Failed to concretize an output IterType for expression: ", + expr->toString()); + + // Update the IterType of each output + for (auto out_id : + ir_utils::filterByType(expr->outputs())) { + auto concreteized_out_id = + IterDomainBuilder(out_id).iter_type(iter_type).build(); + registerMutation(out_id, concreteized_out_id); + } + + // Outputs are mutated. 
The expr itself needs to be mutated as + // well, which can be done by the mutate method + OptOutMutator::mutate(expr); } - - // If none of the output IDs is symbolic, nothing to concretize - if (std::all_of( - expr->outputs().begin(), expr->outputs().end(), [](Val* output) { - return output->as()->getIterType() != - IterType::Symbolic; - })) { - continue; - } - // If any of output IDs is symbolic, all outputs should be symbolic - TORCH_INTERNAL_ASSERT(std::all_of( - expr->outputs().begin(), expr->outputs().end(), [](Val* output) { - return output->as()->getIterType() == - IterType::Symbolic; - })); - - // Determine the output IterType - IterType iter_type = IterType::Symbolic; - for (auto inp_id : ir_utils::filterByType(expr->inputs())) { - auto updated_id = maybeMutated(inp_id)->as(); - iter_type = ops::promoteIterType(iter_type, updated_id->getIterType()); - } - TORCH_INTERNAL_ASSERT( - iter_type != IterType::Symbolic, - "Failed to concretize an output IterType for expression: ", - expr->toString()); - - // Update the IterType of each output - for (auto out_id : ir_utils::filterByType(expr->outputs())) { - auto concreteized_out_id = - IterDomainBuilder(out_id).iter_type(iter_type).build(); - registerMutation(out_id, concreteized_out_id); - } - - // Outputs are mutated. 
The expr itself needs to be mutated as - // well, which can be done by the mutate method - OptOutMutator::mutate(expr); } } diff --git a/csrc/ir_internal_nodes.h b/csrc/ir_internal_nodes.h index 366d1c8ba02..8aecf89769b 100644 --- a/csrc/ir_internal_nodes.h +++ b/csrc/ir_internal_nodes.h @@ -1505,7 +1505,8 @@ class TORCH_CUDA_CU_API IterDomain : public Val { IterDomain* in, Val* left_expansion, Val* right_expansion, - bool mark_as_rfactor = false); + bool mark_as_rfactor = false, + std::optional iter_type = std::nullopt); bool isReduction() const { return getIterType() == IterType::Reduction; diff --git a/csrc/ir_nodes.cpp b/csrc/ir_nodes.cpp index e4752ef72cb..7735ee5bf48 100644 --- a/csrc/ir_nodes.cpp +++ b/csrc/ir_nodes.cpp @@ -2452,7 +2452,8 @@ IterDomain* IterDomain::resize( IterDomain* in, Val* left_expansion, Val* right_expansion, - bool mark_as_rfactor) { + bool mark_as_rfactor, + std::optional iter_type_opt) { TORCH_CHECK( left_expansion->isIntegralScalar(), "Expansion factor must be an integer scalar: ", @@ -2495,16 +2496,14 @@ IterDomain* IterDomain::resize( right_expansion); } - auto iter_type = in->getIterType(); - if (resized_id_size->isConstInt()) { - auto in_extent = in->extent()->getInt().value(); + // Output IterType is Symbolic unless provided, or if extent is a const Int, + // in which case we set it to either Broadcast or Iteration. + IterType iter_type = IterType::Symbolic; + if (iter_type_opt.has_value()) { + iter_type = iter_type_opt.value(); + } else if (resized_id_size->isConstInt()) { auto out_extent = resized_id_size->getInt().value(); - auto left = left_expansion->getInt().value(); - auto right = right_expansion->getInt().value(); - iter_type = - ir_utils::resizeOutputIterType(in_extent, out_extent, left, right); - } else { - iter_type = IterType::Symbolic; + iter_type = out_extent == 1 ? 
IterType::Broadcast : IterType::Iteration; } auto resized_id = From 88bb23991441e27512b48cd5bae114e1fd2e0f59 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 10 May 2023 10:54:04 -0400 Subject: [PATCH 22/49] Minor fixup for initializer-if --- csrc/dynamic_transform.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 3463036c589..0bf105c3cca 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -124,12 +124,13 @@ std::string DynamicTransformConcretizationInfo::toString() const { } void DynamicTransformInfoBuilder::handle(TensorView* tv) { - const auto& rfd = tv->domain()->getMaybeRFactorDomain(); + const auto& rfd = tv->getMaybeRFactorDomain(); for (auto id : rfd) { - if (auto op = dynamic_cast(id->definition()); + if (!id->definition()) { + continue; + } + if (auto op = dynamic_cast(id->definition()); id->getIterType() == IterType::Symbolic && op != nullptr) { - auto op = def->as(); - auto out_extent_val = expr_eval_->evaluate(id->extent()); TORCH_INTERNAL_ASSERT( out_extent_val.has_value(), @@ -167,7 +168,6 @@ void DynamicTransformInfoBuilder::handle(TensorView* tv) { } } } -} void DynamicTransformInfoBuilder::handle(ViewOp* op) { auto inp_tv = op->in()->as(); From fde271dbb1b05296dd3120fb0ba3d76272c7a70e Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 10 May 2023 12:57:54 -0400 Subject: [PATCH 23/49] Remove stale TODO comment --- test/test_dynamic_transform.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 9df69026594..f10c1a8d1de 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -867,7 +867,6 @@ void reductionDynamicPadAddFusion( Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); - // TODO: change symbolic size for padded dimension if start size is 1 TensorView* x = makeSymbolicTensor(input_dims); fusion.addInput(x); 
From 2d98beb14b4254985c7307cbf179e5f4869130c9 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 10 May 2023 13:04:45 -0400 Subject: [PATCH 24/49] Silence clang-tidy --- csrc/scheduler/reduction.cpp | 2 +- test/test_dynamic_transform.cpp | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/csrc/scheduler/reduction.cpp b/csrc/scheduler/reduction.cpp index eb12c4fbd70..6b7022a502f 100644 --- a/csrc/scheduler/reduction.cpp +++ b/csrc/scheduler/reduction.cpp @@ -113,7 +113,7 @@ std::shared_ptr innerReductionHeuristic( std::max((int64_t)n_tensor_inputs >> 2, (int64_t)1))); // Conservative value, could be set to larger based on arch if necessary. - constexpr int64_t l1_cache = 32 * 1024; + constexpr int64_t l1_cache = (int64_t)32 * 1024; // Could change per generation, but for l1 we want to consider active threads, // not resident constexpr int64_t active_threads = 1024; diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index f10c1a8d1de..97c314c9ebf 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -805,7 +805,7 @@ void reductionDynamicViewAddFusion( } } if (negone_dim >= 0) { - bias_shape[negone_dim] = at_x.numel() / other_numel; + bias_shape[negone_dim] = (int64_t)at_x.numel() / (int64_t)other_numel; } } at::Tensor at_bias = at::randn(bias_shape, options); @@ -931,6 +931,7 @@ void reductionDynamicPadAddFusion( // Test dynamic pad for various inputs TEST_F(NVFuserTest, DynamicPadShmoo_CUDA) { + // NOLINTBEGIN(bugprone-implicit-widening-of-multiplication-result) auto invocations = std::vector{ {{3, 5}, {0, 0}, true}, // trivial @@ -960,6 +961,7 @@ TEST_F(NVFuserTest, DynamicPadShmoo_CUDA) { {{3, 0}, {1, 1}, false}, //{{3, 0}, {-1, 1}, false}, // TODO: SIGFPE (see above) }; + // NOLINTEND(bugprone-implicit-widening-of-multiplication-result) reductionDynamicPadAddFusion(invocations); } From db5b8fce26e5c5598e96ff06a60590d4e8d33ac7 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 
10 May 2023 13:24:46 -0400 Subject: [PATCH 25/49] Remove unguarded reduction stats printing This was either a merge error or a copy-paste error from earlier commits --- csrc/scheduler/reduction.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/csrc/scheduler/reduction.cpp b/csrc/scheduler/reduction.cpp index 6b7022a502f..c6e624d33b9 100644 --- a/csrc/scheduler/reduction.cpp +++ b/csrc/scheduler/reduction.cpp @@ -555,12 +555,6 @@ std::shared_ptr outerReductionHeuristic( } return std::move(rparams); } - std::cerr << "\n===== Reduction Stats ========\n" - << "total_reduction_numel: " << total_reduction_numel << "\n" - << "total_iteration_numel: " << total_iteration_numel << "\n" - << "vectorize_factor: " << vectorize_factor << "\n" - << "n_tensor_inputs: " << n_tensor_inputs << "\n" - << "max_input_dtype_size: " << max_input_dtype_size << std::endl; // WARNING: Current device for codegen may not be the target device const int64_t device_max_threads_per_multiprocessor = (int64_t)at::cuda::getCurrentDeviceProperties() From 5db7cb4b58abc46dde69f1ce9032a29d3c61f6ad Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 10 May 2023 15:07:25 -0400 Subject: [PATCH 26/49] Try harder for static IterDomain::resize If we can do a simple proof that out extent is >1, use static resize instead of Symbolic. Note that there are plenty of cases where this might be provable that we do not cover. In those cases it's best to pass the iter_type_opt argument. --- csrc/ir_nodes.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/csrc/ir_nodes.cpp b/csrc/ir_nodes.cpp index 75cddfdea5f..641d5ea48cd 100644 --- a/csrc/ir_nodes.cpp +++ b/csrc/ir_nodes.cpp @@ -2504,14 +2504,23 @@ IterDomain* IterDomain::resize( right_expansion); } - // Output IterType is Symbolic unless provided, or if extent is a const Int, - // in which case we set it to either Broadcast or Iteration. + // If output IterType is provided, use it. 
Otherwise, if we can prove the + // resized extent is 1, set to Broadcast, if we can prove it is >1 set to + // Iteration, and otherwise fall back to Symbolic. IterType iter_type = IterType::Symbolic; if (iter_type_opt.has_value()) { iter_type = iter_type_opt.value(); - } else if (resized_id_size->isConstInt()) { - auto out_extent = resized_id_size->getInt().value(); - iter_type = out_extent == 1 ? IterType::Broadcast : IterType::Iteration; + } else if (left_expansion->isConstInt() && right_expansion->isConstInt()) { + if (resized_id_size->isConstInt()) { + // Means input extent is also known + auto out_extent = resized_id_size->getInt().value(); + iter_type = out_extent == 1 ? IterType::Broadcast : IterType::Iteration; + } else if ( + left_expansion->getInt().value() + right_expansion->getInt().value() > + 1) { + // Input extent is non-negative, so we know out_extent > 1 + iter_type = IterType::Iteration; + } } auto resized_id = From a02fbb987e72665477964244186304943f9b08a6 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 10 May 2023 12:55:49 -0700 Subject: [PATCH 27/49] Fix skip resize ops in BestEffortReplay Concretization introduces multiple exprs of an IterDomain, so just looking at all `uses()` is not correct. 
--- csrc/transform_iter.cpp | 29 ++++++++++++++++++++--------- csrc/transform_iter.h | 4 +++- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/csrc/transform_iter.cpp b/csrc/transform_iter.cpp index 99ea4e377cc..cd7d835f9a3 100644 --- a/csrc/transform_iter.cpp +++ b/csrc/transform_iter.cpp @@ -386,7 +386,7 @@ BestEffortReplay::BestEffortReplay( } if (skip_resize) { - skipResizes(); + skipResizes(target_exprs, replay_exprs); } std::string err_str( @@ -626,7 +626,7 @@ BestEffortReplay::BestEffortReplay( } if (skip_resize) { - skipResizes(); + skipResizes(target_exprs, replay_exprs); } } } @@ -1089,9 +1089,18 @@ void BestEffortReplay::skipSwizzles( } // Same logic as skipSwizzles -void BestEffortReplay::skipResizes() { - auto isResizeInput = [](IterDomain* id) -> bool { - return id->uses().size() == 1 && id->uses().front()->isA(); +void BestEffortReplay::skipResizes( + const std::vector& target_exprs, + const std::vector& replay_exprs) { + auto getResizeUse = [](IterDomain* id, + const std::vector& exprs) -> Resize* { + for (auto id_use : id->uses()) { + if (std::find(exprs.begin(), exprs.end(), id_use) == exprs.end()) { + continue; + } + return dynamic_cast(id_use); + } + return nullptr; }; bool updated = true; @@ -1103,11 +1112,13 @@ void BestEffortReplay::skipResizes() { auto new_target_id = target_id; auto replay_id = it.second; auto new_replay_id = replay_id; - if (isResizeInput(target_id)) { - new_target_id = target_id->uses().front()->as()->out(); + if (auto target_resize = getResizeUse(target_id, target_exprs); + target_resize != nullptr) { + new_target_id = target_resize->out(); } - if (isResizeInput(replay_id)) { - new_replay_id = replay_id->uses().front()->as()->out(); + if (auto replay_resize = getResizeUse(replay_id, replay_exprs); + replay_resize != nullptr) { + new_replay_id = replay_resize->out(); } if (new_target_id == target_id && new_replay_id == replay_id) { diff --git a/csrc/transform_iter.h b/csrc/transform_iter.h index 
076ece21f95..9ad78b46980 100644 --- a/csrc/transform_iter.h +++ b/csrc/transform_iter.h @@ -306,7 +306,9 @@ class TORCH_CUDA_CU_API BestEffortReplay { const std::unordered_map& replay_id2expr); // Skip resize in both target and replay domains - void skipResizes(); + void skipResizes( + const std::vector& target_exprs, + const std::vector& replay_exprs); public: // When skip_resize is true, resize is ignored or in other words forwarded From 23f383ba9d2f767432a11ae8ef919c21d449d101 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 10 May 2023 19:48:28 -0400 Subject: [PATCH 28/49] Replace getInt with evaluateInt in IterDomain::resize() --- csrc/ir_nodes.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/csrc/ir_nodes.cpp b/csrc/ir_nodes.cpp index 641d5ea48cd..432d924d9cd 100644 --- a/csrc/ir_nodes.cpp +++ b/csrc/ir_nodes.cpp @@ -2513,11 +2513,10 @@ IterDomain* IterDomain::resize( } else if (left_expansion->isConstInt() && right_expansion->isConstInt()) { if (resized_id_size->isConstInt()) { // Means input extent is also known - auto out_extent = resized_id_size->getInt().value(); + auto out_extent = resized_id_size->evaluateInt(); iter_type = out_extent == 1 ? IterType::Broadcast : IterType::Iteration; } else if ( - left_expansion->getInt().value() + right_expansion->getInt().value() > - 1) { + left_expansion->evaluateInt() + right_expansion->evaluateInt() > 1) { // Input extent is non-negative, so we know out_extent > 1 iter_type = IterType::Iteration; } From 6cd6932bba74866f99a8dfe73cf0ed8c148c7cf7 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 10 May 2023 20:10:16 -0400 Subject: [PATCH 29/49] Use concrete sizes in cat and slice tests. This fixes errors that popped up when those became dynamic Fusions. 
--- test/test_resize.cpp | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/test/test_resize.cpp b/test/test_resize.cpp index 0214d8f2e62..d8fe4cffa76 100644 --- a/test/test_resize.cpp +++ b/test/test_resize.cpp @@ -567,10 +567,11 @@ TEST_F(NVFuserTest, FusionResizeCat3_CUDA) { std::vector shape0({4, 2}); std::vector shape1({4, 3}); - auto tv0 = makeSymbolicTensor(2); + // concrete shapes to avoid dynamic Fusion + auto tv0 = makeConcreteTensor(shape0); fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(2); + auto tv1 = makeConcreteTensor(shape1); fusion.addInput(tv1); auto tv2 = cat({tv0, tv1}, 1); @@ -608,10 +609,11 @@ TEST_F(NVFuserTest, FusionResizeCat4_CUDA) { std::vector shape0({11, 12}); std::vector shape1({11, 13}); - auto tv0 = makeSymbolicTensor(2); + // concrete shapes to avoid dynamic Fusion + auto tv0 = makeConcreteTensor(shape0); fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(2); + auto tv1 = makeConcreteTensor(shape1); fusion.addInput(tv1); auto tv2 = cat({tv0, tv1}, 1); @@ -649,11 +651,12 @@ TEST_F(NVFuserTest, FusionResizeCat5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - auto tv0 = makeSymbolicTensor(2); + // concrete shapes to avoid dynamic Fusion + auto tv0 = makeConcreteTensor({11, 12}); fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(2); + auto tv1 = makeConcreteTensor({11, 13}); fusion.addInput(tv1); - auto tv2 = makeSymbolicTensor(2); + auto tv2 = makeConcreteTensor({11, 25}); fusion.addInput(tv2); auto tv3 = cat({tv0, tv1}, 1); @@ -743,6 +746,7 @@ TEST_F(NVFuserTest, FusionResizeCat6_CUDA) { // Cat many tensors TEST_F(NVFuserTest, FusionResizeCat7_CUDA) { int num_tensors_to_concat = 10; + std::vector base_shape({11, 13}); for (int concat_dim : {0, 1}) { Fusion fusion; @@ -751,7 +755,10 @@ TEST_F(NVFuserTest, FusionResizeCat7_CUDA) { std::vector inputs; for (const auto i : c10::irange(num_tensors_to_concat)) { (void)i; - auto tv = makeSymbolicTensor(2); + // concrete 
shapes to avoid dynamic Fusion + auto shape = base_shape; + shape[concat_dim] = 10 + (i % 5); + auto tv = makeConcreteTensor(shape); fusion.addInput(tv); inputs.push_back(tv); } @@ -774,7 +781,6 @@ TEST_F(NVFuserTest, FusionResizeCat7_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(0); - std::vector base_shape({11, 13}); std::vector aten_inputs; for (const auto i : c10::irange(num_tensors_to_concat)) { auto shape = base_shape; @@ -914,7 +920,8 @@ TEST_F(NVFuserTest, FusionResizeSlice1_CUDA) { std::vector shape({9}); - auto tv0 = makeSymbolicTensor(1); + // concrete shapes to avoid dynamic Fusion + auto tv0 = makeConcreteTensor(shape); fusion.addInput(tv0); auto tv1 = slice( @@ -1004,7 +1011,8 @@ TEST_F(NVFuserTest, FusionResizeSlice4_CUDA) { std::vector shape({5, 100}); - auto tv0 = makeSymbolicTensor(2); + // concrete shapes to avoid dynamic Fusion + auto tv0 = makeConcreteTensor(shape); fusion.addInput(tv0); // Consider a fusion of: @@ -1083,7 +1091,10 @@ TEST_F(NVFuserTest, FusionResizeSlice5_CUDA) { auto& fusion = *fusion_ptr; FusionGuard fg(fusion_ptr.get()); - auto tv0 = makeSymbolicTensor(2); + std::vector shape({11, 1000}); + + // concrete shapes to avoid dynamic Fusion + auto tv0 = makeConcreteTensor(shape); fusion.addInput(tv0); auto tv1 = slice( @@ -1118,7 +1129,6 @@ TEST_F(NVFuserTest, FusionResizeSlice5_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(0); - std::vector shape({11, 1000}); auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); From aec043d187fd5d0f82e284c05aa43218ded63051 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 11 May 2023 05:39:16 -0400 Subject: [PATCH 30/49] Stop holding TensorView in resize_transforms_ --- csrc/dynamic_transform.cpp | 29 ++++++++++++----------------- csrc/dynamic_transform.h | 7 +++---- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/csrc/dynamic_transform.cpp 
b/csrc/dynamic_transform.cpp index 0bf105c3cca..8c9182f8245 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -87,21 +87,20 @@ DynamicTransformConcretizationInfo DynamicTransformConcretizationInfo::clone( IrCloner& ir_cloner) const { DynamicTransformConcretizationInfo cloned_info( (Fusion*)ir_cloner.container()); - for (auto& pair : reshape_transforms_) { + for (const auto& [tv, analyze_result] : reshape_transforms_) { cloned_info.reshape_transforms_.emplace_back( - ir_cloner.clone(pair.first), + ir_cloner.clone(tv), // reshape_transforms_ holds pairs of TensorView* and AnalyzeViewResult // AnalyzeViewResult can be copied directly as it holds no references to // Statements that would need cloning, only integer indices of axes. - pair.second); + analyze_result); } - for (auto& tx : resize_transforms_) { + for (const auto& [id, iter_type] : resize_transforms_) { cloned_info.resize_transforms_.emplace_back( - ir_cloner.clone(std::get<0>(tx)), - ir_cloner.clone(std::get<1>(tx)), - // Similar to reshape_transforms_, we only clone the TensorViews and - // IterDomains in resize_transforms_ - std::get<2>(tx)); + ir_cloner.clone(id), + // Similar to reshape_transforms_, we only clone the IterDomains in + // resize_transforms_ + iter_type); } return cloned_info; } @@ -116,9 +115,8 @@ std::string DynamicTransformConcretizationInfo::toString() const { << kv.second.toString() << "\n"; } ss << indent << "Resize:\n"; - for (const auto& kv : resize_transforms_) { - ss << indent << indent << std::get<0>(kv)->toString() << ", " - << std::get<1>(kv)->toString() << ", " << std::get<2>(kv) << "\n"; + for (const auto& [id, iter_type] : resize_transforms_) { + ss << indent << indent << id->toString() << ", " << iter_type << "\n"; } return ss.str(); } @@ -164,7 +162,7 @@ void DynamicTransformInfoBuilder::handle(TensorView* tv) { left_val->as(), right_val->as()); - info_.resize_transforms_.emplace_back(tv, id, out_itertype); + 
info_.resize_transforms_.emplace_back(id, out_itertype); } } } @@ -325,10 +323,7 @@ void DynamicTransformConcretizer::concretizeReshape() { void DynamicTransformConcretizer::concretizeResize() { // Concretize each resize op. - for (const auto& resize_info : info_.getResizeTransforms()) { - auto id = std::get<1>(resize_info); - auto iter_type = std::get<2>(resize_info); - + for (const auto& [id, iter_type] : info_.getResizeTransforms()) { TORCH_CHECK( id->definition() && id->definition()->isA(), "Resized IterDomain must have a Resize definition"); diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index 681a2a0693e..8dc21f60c3a 100644 --- a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -32,8 +32,8 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { return reshape_transforms_; } - const std::vector>& - getResizeTransforms() const { + const std::vector>& getResizeTransforms() + const { return resize_transforms_; } @@ -65,8 +65,7 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { // Holds the resized IterDomain (output of the Resize op) along with the // TensorView where it appears, and its concretized IterType - std::vector> - resize_transforms_; + std::vector> resize_transforms_; friend class DynamicTransformInfoBuilder; }; From c76c2ee822d12926b88ac7d440ece406809f9806 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 11 May 2023 20:33:14 -0400 Subject: [PATCH 31/49] Mutate all IterDomain expressions from root --- csrc/dynamic_transform.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 8c9182f8245..d1af3519e8b 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -386,6 +386,9 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { ". 
IterDomain was expected."); } + // Mutate the expression, in case inputs or outputs are symbolic + OptOutMutator::mutate(expr); + // If none of the output IDs is symbolic, nothing to concretize if (std::all_of( expr->outputs().begin(), From 678e0eca7124b9e8985836c6451ece65b7eb4160 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 11 May 2023 20:50:55 -0400 Subject: [PATCH 32/49] Don't bail if no ID expr outputs are symbolic But still check that they all are symbolic if any are. This should fix cases like DynamicTransform5 where a root ID is symbolic but its rfactor is not. --- csrc/dynamic_transform.cpp | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index d1af3519e8b..c48754e38be 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -386,25 +386,21 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { ". IterDomain was expected."); } - // Mutate the expression, in case inputs or outputs are symbolic - OptOutMutator::mutate(expr); - // If none of the output IDs is symbolic, nothing to concretize - if (std::all_of( + if (std::any_of( expr->outputs().begin(), expr->outputs().end(), [](Val* output) { - return output->as()->getIterType() != + return output->as()->getIterType() == IterType::Symbolic; })) { - continue; + // If any of output IDs is symbolic, all outputs should be symbolic + TORCH_INTERNAL_ASSERT(std::all_of( + expr->outputs().begin(), expr->outputs().end(), [](Val* output) { + return output->as()->getIterType() == + IterType::Symbolic; + })); } - // If any of output IDs is symbolic, all outputs should be symbolic - TORCH_INTERNAL_ASSERT(std::all_of( - expr->outputs().begin(), expr->outputs().end(), [](Val* output) { - return output->as()->getIterType() == - IterType::Symbolic; - })); // Determine the output IterType IterType iter_type = IterType::Symbolic; From b6b085f0c92688c7ad753c551a805f8b2c097933 Mon Sep 17 00:00:00 2001 
From: Jacob Hinkle Date: Fri, 12 May 2023 09:20:53 -0400 Subject: [PATCH 33/49] Update ID symbolic check --- csrc/dynamic_transform.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index c48754e38be..70bccff408c 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -386,7 +386,7 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { ". IterDomain was expected."); } - // If none of the output IDs is symbolic, nothing to concretize + // If any output ID is symbolic, all output IDs should be symbolic if (std::any_of( expr->outputs().begin(), expr->outputs().end(), @@ -394,12 +394,20 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { return output->as()->getIterType() == IterType::Symbolic; })) { - // If any of output IDs is symbolic, all outputs should be symbolic TORCH_INTERNAL_ASSERT(std::all_of( expr->outputs().begin(), expr->outputs().end(), [](Val* output) { return output->as()->getIterType() == IterType::Symbolic; })); + } else if (std::all_of( + expr->inputs().begin(), + expr->inputs().end(), + [](Val* output) { + return output->as()->getIterType() != + IterType::Symbolic; + })) { + // If no inputs or outputs are symbolic, nothing to concretize + continue; } // Determine the output IterType From 38269679ae5493b17ad502c244db89f1553a1204 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 12 May 2023 09:28:25 -0400 Subject: [PATCH 34/49] De-indent to clarify diff --- csrc/dynamic_transform.cpp | 153 ++++++++++++++++++------------------- 1 file changed, 76 insertions(+), 77 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 70bccff408c..139e2574c9e 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -352,88 +352,87 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { // axes inherited from the producers auto propagated = propagateFromProducerToConsumer(tv); - if 
(propagated) { - // Root IDs are altered. Need to propagate the changes to rfactor - // domain + // If no root domain is altered, nothing to do further + if (!propagated) { + return; + } - // At this point, there should be no expr beyond rfactor root - TORCH_INTERNAL_ASSERT( - tv->getLeafDomain() == tv->getMaybeRFactorDomain(), - "Invalid tensor: ", - tv->toString()); - - // If it has an rfactor root domain, the IterTypes of the rfactor - // IDs may need to be updated as well. Traverse the rfactor exprs - // and mutate the IterTypes of output IDs if symbolic. - if (tv->hasRFactor()) { - // Note that it is assumed that theres's no further expression - // beyond the rfactor domain as asserted above - auto all_id_exprs = StmtSort::getExprsBetween( - tv->fusion(), - {tv->getRootDomain().begin(), tv->getRootDomain().end()}, - {tv->getMaybeRFactorDomain().begin(), - tv->getMaybeRFactorDomain().end()}); - for (auto expr : all_id_exprs) { - // Assume outputs of IterDomain exprs are always IterDomains. If - // the assumption is invalidated, the logic here would need to - // be updated. Assert the assumption to immediately detect such - // a case if happened. - for (auto out_val : expr->outputs()) { - TORCH_INTERNAL_ASSERT( - out_val->isA(), - "Unexpected output: ", - out_val->toString(), - ". IterDomain was expected."); - } - - // If any output ID is symbolic, all output IDs should be symbolic - if (std::any_of( - expr->outputs().begin(), - expr->outputs().end(), - [](Val* output) { - return output->as()->getIterType() == - IterType::Symbolic; - })) { - TORCH_INTERNAL_ASSERT(std::all_of( + // Root IDs are altered. Need to propagate the changes to rfactor + // domain + + // At this point, there should be no expr beyond rfactor root + TORCH_INTERNAL_ASSERT( + tv->getLeafDomain() == tv->getMaybeRFactorDomain(), + "Invalid tensor: ", + tv->toString()); + + // If it has an rfactor root domain, the IterTypes of the rfactor + // IDs may need to be updated as well. 
Traverse the rfactor exprs + // and mutate the IterTypes of output IDs if symbolic. + if (tv->hasRFactor()) { + // Note that it is assumed that theres's no further expression + // beyond the rfactor domain as asserted above + auto all_id_exprs = StmtSort::getExprsBetween( + tv->fusion(), + {tv->getRootDomain().begin(), tv->getRootDomain().end()}, + {tv->getMaybeRFactorDomain().begin(), + tv->getMaybeRFactorDomain().end()}); + for (auto expr : all_id_exprs) { + // Assume outputs of IterDomain exprs are always IterDomains. If + // the assumption is invalidated, the logic here would need to + // be updated. Assert the assumption to immediately detect such + // a case if happened. + for (auto out_val : expr->outputs()) { + TORCH_INTERNAL_ASSERT( + out_val->isA(), + "Unexpected output: ", + out_val->toString(), + ". IterDomain was expected."); + } + + // If any output ID is symbolic, all output IDs should be symbolic + if (std::any_of( expr->outputs().begin(), expr->outputs().end(), [](Val* output) { return output->as()->getIterType() == IterType::Symbolic; - })); - } else if (std::all_of( - expr->inputs().begin(), - expr->inputs().end(), - [](Val* output) { - return output->as()->getIterType() != - IterType::Symbolic; - })) { - // If no inputs or outputs are symbolic, nothing to concretize - continue; - } - - // Determine the output IterType - IterType iter_type = IterType::Symbolic; - for (auto inp_id : ir_utils::filterByType(expr->inputs())) { - auto updated_id = maybeMutated(inp_id)->as(); - iter_type = - ops::promoteIterType(iter_type, updated_id->getIterType()); - } - TORCH_INTERNAL_ASSERT( - iter_type != IterType::Symbolic, - "Failed to concretize an output IterType for expression: ", - expr->toString()); - - // Update the IterType of each output - for (auto out_id : - ir_utils::filterByType(expr->outputs())) { - auto concreteized_out_id = - IterDomainBuilder(out_id).iter_type(iter_type).build(); - registerMutation(out_id, concreteized_out_id); - } - - // Outputs 
are mutated. The expr itself needs to be mutated as - // well, which can be done by the mutate method - OptOutMutator::mutate(expr); + })) { + TORCH_INTERNAL_ASSERT(std::all_of( + expr->outputs().begin(), expr->outputs().end(), [](Val* output) { + return output->as()->getIterType() == + IterType::Symbolic; + })); + } else if (std::all_of( + expr->inputs().begin(), + expr->inputs().end(), + [](Val* output) { + return output->as()->getIterType() != + IterType::Symbolic; + })) { + // If no inputs or outputs are symbolic, nothing to concretize + continue; + } + + // Determine the output IterType + IterType iter_type = IterType::Symbolic; + for (auto inp_id : ir_utils::filterByType(expr->inputs())) { + auto updated_id = maybeMutated(inp_id)->as(); + iter_type = ops::promoteIterType(iter_type, updated_id->getIterType()); } + TORCH_INTERNAL_ASSERT( + iter_type != IterType::Symbolic, + "Failed to concretize an output IterType for expression: ", + expr->toString()); + + // Update the IterType of each output + for (auto out_id : ir_utils::filterByType(expr->outputs())) { + auto concreteized_out_id = + IterDomainBuilder(out_id).iter_type(iter_type).build(); + registerMutation(out_id, concreteized_out_id); + } + + // Outputs are mutated. 
The expr itself needs to be mutated as + // well, which can be done by the mutate method + OptOutMutator::mutate(expr); } } From ab987b80b266ba220102b0d5d3b9b3b55e71ea74 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 12 May 2023 09:48:24 -0400 Subject: [PATCH 35/49] Do simple mutation if nothing propagates from producer --- csrc/dynamic_transform.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 139e2574c9e..ffc62c7c415 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -352,8 +352,11 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { // axes inherited from the producers auto propagated = propagateFromProducerToConsumer(tv); - // If no root domain is altered, nothing to do further + // If no root domain is altered by producer, we don't need to propagate back + // up to rfactor, so do a simple mutation. if (!propagated) { + mutate(tv->domain()); + OptOutMutator::mutate(tv); return; } @@ -547,7 +550,10 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( TORCH_INTERNAL_ASSERT( id_type.has_value(), - "Did not find id_type. Perhaps TensorView def has no inputs."); + "Did not find id_type for consumer root domain ", + root_id->toString(), + ". Perhaps consumer def has no inputs. 
Consumer definition = ", + def->toString()); TORCH_INTERNAL_ASSERT( id_type != IterType::Symbolic, From eb3fca66a9453c08279370fced8e660878a05acb Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 12 May 2023 13:50:41 -0400 Subject: [PATCH 36/49] Change checks to better explain cases, expand comments --- csrc/dynamic_transform.cpp | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index ffc62c7c415..47b3753c915 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -393,26 +393,42 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { ". IterDomain was expected."); } + // If all inputs are concrete, all outputs should be concrete, and there + // is nothing to concretize. + if (std::all_of( + expr->inputs().begin(), expr->inputs().end(), [](Val* output) { + return output->as()->getIterType() != + IterType::Symbolic; + })) { + TORCH_INTERNAL_ASSERT(std::all_of( + expr->outputs().begin(), expr->outputs().end(), [](Val* output) { + return output->as()->getIterType() != + IterType::Symbolic; + })); + continue; + } + // If any output ID is symbolic, all output IDs should be symbolic if (std::any_of( expr->outputs().begin(), expr->outputs().end(), [](Val* output) { - return output->as()->getIterType() == + return output->as()->getIterType() != IterType::Symbolic; })) { TORCH_INTERNAL_ASSERT(std::all_of( expr->outputs().begin(), expr->outputs().end(), [](Val* output) { - return output->as()->getIterType() == + return output->as()->getIterType() != IterType::Symbolic; })); - } else if (std::all_of( - expr->inputs().begin(), - expr->inputs().end(), - [](Val* output) { - return output->as()->getIterType() != - IterType::Symbolic; - })) { - // If no inputs or outputs are symbolic, nothing to concretize - continue; + // NOTE: We do not return early at this point. 
Even though all outputs + // are concrete, there may still be concrete inputs. For example, a + // Symbolic IterDomain might be padded with constant pad widths (1, 1), + // in which case although we do not know the exact extent of the output, + // we know it is at least as large as the sum of the pad widths, 2. In + // such cases, the output IterDomain is concrete at definition, since if + // the extent is >1 we know the IterType is Iteration. In these cases, + // we must continue to concretize intermediate expressions between the + // root and R-factor domain. See test DynamicTransform5_CUDA which + // demonstrates this behavior. } // Determine the output IterType From c9448caa8f934f0d948b071f5d37b369e2a5b757 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 15 May 2023 08:12:33 -0400 Subject: [PATCH 37/49] Disable all-or-nothing symbolic output check on exprs --- csrc/dynamic_transform.cpp | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index e55f4490cce..e0027069761 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -408,28 +408,21 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { continue; } - // If any output ID is symbolic, all output IDs should be symbolic - if (std::any_of( - expr->outputs().begin(), expr->outputs().end(), [](Val* output) { - return output->as()->getIterType() != - IterType::Symbolic; - })) { - TORCH_INTERNAL_ASSERT(std::all_of( - expr->outputs().begin(), expr->outputs().end(), [](Val* output) { - return output->as()->getIterType() != - IterType::Symbolic; - })); - // NOTE: We do not return early at this point. Even though all outputs - // are concrete, there may still be concrete inputs. 
For example, a - // Symbolic IterDomain might be padded with constant pad widths (1, 1), - // in which case although we do not know the exact extent of the output, - // we know it is at least as large as the sum of the pad widths, 2. In - // such cases, the output IterDomain is concrete at definition, since if - // the extent is >1 we know the IterType is Iteration. In these cases, - // we must continue to concretize intermediate expressions between the - // root and R-factor domain. See test DynamicTransform5_CUDA which - // demonstrates this behavior. - } + // NOTE: We do not return early if all outputs are concrete as there may + // still be concrete inputs. For example, a Symbolic IterDomain might be + // padded with constant pad widths (1, 1), in which case although we do + // not know the exact extent of the output, we know it is at least as + // large as the sum of the pad widths, 2. In such cases, the output + // IterDomain is concrete at definition, since if the extent is >1 we know + // the IterType is Iteration. In these cases, we must continue to + // concretize intermediate expressions between the root and R-factor + // domain. See test DynamicTransform5_CUDA which demonstrates this + // behavior. + // NOTE: We also do not assume that if one output ID is symbolic, that + // they all must be. See test FusionSliceForNanoGPT3_CUDA for an example + // that does a static split by a factor of 16 of a symbolic input domain. + // The static split in that case results in a concrete IterDomain with + // extent 16 along with a symbolic one (extent ceilDiv(n / 16)). // Determine the output IterType IterType iter_type = IterType::Symbolic; From e5aa9c85d16c29beeea364e6af8b6bbfff92755c Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 15 May 2023 11:56:17 -0400 Subject: [PATCH 38/49] Defer fusion->hasDynamicTransform() in FEC This fixes a subtle bug where all C++ tests were passing, but for example `test_cat` in `test_python_frontend.py` failed. 
The exact same test would fail in Python that succeeded in C++. The reason is that in the Python frontend, a `FusionExecutorCache` is initialized _before_ defining the fusion, i.e. with an empty `Fusion`. Doing the check for dynamic transforms in the constructor of `FusionExecutorCache` then led to always finding a static (empty) transform. Instead, this commit adds the `isDynamic()` method which will cache its results. It should only be used somewhere inside `runFusionWithInputs` which indicates that the definition of the `Fusion` is complete. --- csrc/kernel_cache.cpp | 11 +++++------ csrc/kernel_cache.h | 18 ++++++++++++++++-- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp index 49ec15b2175..6dfe870f3b8 100644 --- a/csrc/kernel_cache.cpp +++ b/csrc/kernel_cache.cpp @@ -112,8 +112,7 @@ InputsIdLookup::IdLookupReturn InputsIdLookup::lookupId( } FusionExecutorCache::FusionExecutorCache(std::unique_ptr<Fusion> fusion) - : fusion_(std::move(fusion)), - has_dynamic_reshape_(fusion_->hasDynamicTransform()) {} + : fusion_(std::move(fusion)) {} KernelArgumentHolder FusionExecutorCache::prepareInputs( const at::ArrayRef<c10::IValue>& inputs) { @@ -131,7 +130,7 @@ KernelArgumentHolder FusionExecutorCache::prepareInputs( // short-circuiting here, resulting in avoidable rebuilds of concretization // info. auto id_lookup_ret = - inputs_id_lookup_.lookupId(inputs, /*hash_scalars*/ has_dynamic_reshape_); + inputs_id_lookup_.lookupId(inputs, /*hash_scalars*/ isDynamic()); if (id_lookup_ret.eviction) { evictCache(id_lookup_ret.evict_id); } @@ -375,7 +374,7 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( // will be used only as a cache key. 
std::optional conc_info = std::nullopt; size_t conc_info_index = 0; - if (has_dynamic_reshape_) { + if (isDynamic()) { conc_info = DynamicTransform::getConcretizationInfo(fusion_.get(), &args); TORCH_CHECK( conc_info.has_value(), @@ -426,7 +425,7 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( // concretize fusion_ for use in this runtime auto fusion = std::make_unique(*fusion_); FusionGuard fg(fusion.get()); - if (has_dynamic_reshape_) { + if (isDynamic()) { const auto& cloned_conc_info = fusion->getManagedSafe( conc_info_index); @@ -451,7 +450,7 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( } } - if (has_dynamic_reshape_) { + if (isDynamic()) { // In the case of cache hits, we tend to accumulate managed data in // fusion_. Here we release the concretization info we created to avoid // cloning more and more entries. diff --git a/csrc/kernel_cache.h b/csrc/kernel_cache.h index 9d6b9d624e5..dee82055b98 100644 --- a/csrc/kernel_cache.h +++ b/csrc/kernel_cache.h @@ -516,6 +516,19 @@ class TORCH_CUDA_CU_API FusionExecutorCache { const KernelArgumentHolder& inputs, std::optional forced_index_type = std::nullopt); + //! Check whether the input `fusion_` has dynamic elements such as non-static + //! reshapes. Note that `fusion_` might be updated after initializing + //! `FusionExecutorCache` as is done by `FusionDefinition` in the Python + //! frontend. In that case care must be taken to delay this check until the + //! entire Fusion is defined. For that reason, this function is private, and + //! should only be called inside runFusionWithInputs. + bool isDynamic() { + if (!is_dynamic_.has_value()) { + is_dynamic_ = fusion_->hasDynamicTransform(); + } + return is_dynamic_.value(); + } + private: //! original un-scheduled `Fusion`. This may contain dynamic transforms and //! Symbolic IterDomains. @@ -551,8 +564,9 @@ class TORCH_CUDA_CU_API FusionExecutorCache { //! caching profiles. 
Currently it just makes it easier to test FusionKernelRuntime* most_recent_runtime_ = nullptr; - //! Whether fusion_ contains dynamic reshapes - bool has_dynamic_reshape_ = false; + //! Whether fusion_ contains dynamic reshapes. This is cached by + //! `fusionIsDynamic()` + std::optional is_dynamic_ = std::nullopt; }; class GraphCache { From 5cf6971b123f9e104a13c56ba6b572ca269b4768 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 15 May 2023 20:15:43 -0400 Subject: [PATCH 39/49] Remove zero-element changes that leaked into this PR --- csrc/scheduler/reduction.cpp | 41 ++------------------------- test/test_gpu3.cpp | 54 ------------------------------------ 2 files changed, 2 insertions(+), 93 deletions(-) diff --git a/csrc/scheduler/reduction.cpp b/csrc/scheduler/reduction.cpp index c6e624d33b9..eb7515173ee 100644 --- a/csrc/scheduler/reduction.cpp +++ b/csrc/scheduler/reduction.cpp @@ -77,25 +77,6 @@ std::shared_ptr innerReductionHeuristic( const int64_t n_elems = total_reduction_numel * total_iteration_numel; - if (n_elems == 0) { - // Number of elements in input is zero - auto rparams = std::make_unique(); - if (isDebugDumpEnabled(DebugDumpOption::SchedulerDebug)) { - std::cerr << "\nNumber of elements in reduction input buffer is zero\n" - << std::endl; - std::cerr << "\n===== Reduction Stats ========\n" - << "total_reduction_numel: " << total_reduction_numel << "\n" - << "total_iteration_numel: " << total_iteration_numel << "\n" - << "vectorize_factor: " << vectorize_factor << "\n" - << "n_tensor_inputs: " << n_tensor_inputs << "\n" - << "max_input_dtype_size: " << max_input_dtype_size << "\n" - << "block(" << rparams->lparams.bdimx() << ", " - << rparams->lparams.bdimy() << ", 1)" << std::endl; - std::cerr << rparams->toString() << std::endl; - } - return std::move(rparams); - } - // WARNING: At some point we may want to generate heuristics for another // device that is not the current device. 
const int64_t device_max_threads_per_multiprocessor = @@ -535,26 +516,6 @@ std::shared_ptr outerReductionHeuristic( const int64_t n_tensor_inputs, const int64_t max_input_dtype_size, const size_t vectorize_factor) { - const int64_t n_elems = total_reduction_numel * total_iteration_numel; - - if (n_elems == 0) { - // Number of elements in input is zero - auto rparams = std::make_unique(); - if (isDebugDumpEnabled(DebugDumpOption::SchedulerDebug)) { - std::cerr << "\nNumber of elements in reduction input buffer is zero\n" - << std::endl; - std::cerr << "\n===== Reduction Stats ========\n" - << "total_reduction_numel: " << total_reduction_numel << "\n" - << "total_iteration_numel: " << total_iteration_numel << "\n" - << "vectorize_factor: " << vectorize_factor << "\n" - << "n_tensor_inputs: " << n_tensor_inputs << "\n" - << "max_input_dtype_size: " << max_input_dtype_size << "\n" - << "block(" << rparams->lparams.bdimx() << ", " - << rparams->lparams.bdimy() << ", 1)" << std::endl; - std::cerr << rparams->toString() << std::endl; - } - return std::move(rparams); - } // WARNING: Current device for codegen may not be the target device const int64_t device_max_threads_per_multiprocessor = (int64_t)at::cuda::getCurrentDeviceProperties() @@ -569,6 +530,8 @@ std::shared_ptr outerReductionHeuristic( // Reduce unrolling if we have many inputs, start reduction at 4 inputs scheduler_utils::lastPow2( std::max((int64_t)n_tensor_inputs >> 2, (int64_t)1))); + + const int64_t n_elems = total_reduction_numel * total_iteration_numel; const int64_t n_waves = 8; // if data fits in l2 and we need more parallelization in the iter dim, diff --git a/test/test_gpu3.cpp b/test/test_gpu3.cpp index c60594042be..df9f4df4ef4 100644 --- a/test/test_gpu3.cpp +++ b/test/test_gpu3.cpp @@ -8297,60 +8297,6 @@ TEST_F(NVFuserTest, FusionClearGmemBetweenSegments_CUDA) { executor_cache.fusion(), outputs, {at_x}, {t4}, __LINE__, __FILE__); } -// Test that 0-dimensional tensors do not break reduction 
scheduler -TEST_F(NVFuserTest, FusionScheduleReduceZeroElementTensor_CUDA) { - for (auto input_shape : std::vector>{ - {3, 4, 0, 5}, // Warp-reduce in all dim pairs (ignoring zero) - {33, 40, 0, 50}, // Require block reduction (ignoring zero) - {300, 400, 0, 500}, // Require grid reduction (ignoring zero) - }) { - for (auto reduction_dims : std::vector>{ - {0}, // outermost only - {3}, // innermost only - {2}, // only zero-dim - {1, 2}, // zero-dim and non-zero - {2, 3}, // zero-dim and non-zero (innermost) - }) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(4); - fusion->addInput(tv0); - auto tv1 = sum(tv0, reduction_dims); - fusion->addOutput(tv1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn( - std::vector(input_shape.begin(), input_shape.end()), - options); - auto t2 = at_x.sum( - std::vector(reduction_dims.begin(), reduction_dims.end())); - - auto reduction_params = getReductionHeuristics(fusion.get(), {at_x}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(fusion.get(), *reduction_params); - - FusionExecutor fe; - fe.compileFusion(fusion.get(), {at_x}); - auto cg_outputs = fe.runFusion({at_x}); - - testValidate(fusion.get(), cg_outputs, {at_x}, {t2}, __LINE__, __FILE__); - - // verify that the scheduler does not parallelize any IterDomains - for (auto tv : ir_utils::allTvs(fusion.get())) { - for (auto id : tv->domain()->leaf()) { - TORCH_CHECK( - id->getParallelType() == ParallelType::Serial || - id->getParallelType() == ParallelType::Unswitch || - id->getParallelType() == ParallelType::Unroll, - "No IterDomains should be parallelized in zero-element reduction but found ", - id->toString()); - } - } - } - } -} - // Test nan propagation during min/max with floats and doubles TEST_F(NVFuserTest, FusionMinMaxNanPropagation_CUDA) { for (auto dtype : {DataType::Float, DataType::Double}) { From 
8f4cc09a6deafe940da487e7a3374a5d29c48e4a Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 15 May 2023 20:58:24 -0400 Subject: [PATCH 40/49] Remove early returns in concretize root->rfactor --- csrc/dynamic_transform.cpp | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index e0027069761..81f717ceaea 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -350,18 +350,12 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { // First, try to concretize the root domain as there may be symbolic // axes inherited from the producers - auto propagated = propagateFromProducerToConsumer(tv); + propagateFromProducerToConsumer(tv); // If no root domain is altered by producer, we don't need to propagate back - // up to rfactor, so do a simple mutation. - if (!propagated) { - mutate(tv->domain()); - OptOutMutator::mutate(tv); - return; - } - - // Root IDs are altered. Need to propagate the changes to rfactor - // domain + // up to rfactor. We could return early, but instead we go ahead and check the + // root to rfactor transforms to be sure we have concretized any intermediate + // IterDomains. // At this point, there should be no expr beyond rfactor root TORCH_INTERNAL_ASSERT( @@ -393,21 +387,6 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { ". IterDomain was expected."); } - // If all inputs are concrete, all outputs should be concrete, and there - // is nothing to concretize. - if (std::all_of( - expr->inputs().begin(), expr->inputs().end(), [](Val* output) { - return output->as()->getIterType() != - IterType::Symbolic; - })) { - TORCH_INTERNAL_ASSERT(std::all_of( - expr->outputs().begin(), expr->outputs().end(), [](Val* output) { - return output->as()->getIterType() != - IterType::Symbolic; - })); - continue; - } - // NOTE: We do not return early if all outputs are concrete as there may // still be concrete inputs. 
For example, a Symbolic IterDomain might be // padded with constant pad widths (1, 1), in which case although we do From 15f0cd8dfacfb5ae9a93dfb2ca7cdccb46431df4 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 15 May 2023 20:59:10 -0400 Subject: [PATCH 41/49] Concretize IterDomains in-place instead of building This change replaces uses of IterDomainBuilder which previously required registerMutation. This is because replacing an IterDomain in a TensorDomain requires creation of an entire new TensorDomain and swapping it out in the enclosing TensorView. This change simplifies that by introducing a protected method IterDomain::setIterType, and making DynamicTransformConcretizer a friend of IterDomain. Then we can simply change the iter_type and not require any changes to the TensorDomain or TensorView. This also means we do not need to modify the resize() command since we will not call it in order to create the concretized IterDomain for resize(). --- csrc/dynamic_transform.cpp | 26 +++++--------------------- csrc/ir_internal_base_nodes.h | 19 +++++++++++++++++-- csrc/ir_nodes.cpp | 12 ++++-------- 3 files changed, 26 insertions(+), 31 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 81f717ceaea..6619f592227 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -322,20 +322,9 @@ void DynamicTransformConcretizer::concretizeReshape() { } void DynamicTransformConcretizer::concretizeResize() { - // Concretize each resize op. + // Concretize each resize op's output IterType. 
for (const auto& [id, iter_type] : info_.getResizeTransforms()) { - TORCH_CHECK( - id->definition() && id->definition()->isA(), - "Resized IterDomain must have a Resize definition"); - auto def = id->definition()->as(); - auto new_id = IterDomain::resize( - def->in(), - def->leftExpand(), - def->rightExpand(), - id->isRFactorProduct(), - iter_type); - - registerMutation(id, new_id); + id->setIterType(iter_type); } } @@ -416,9 +405,7 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { // Update the IterType of each output for (auto out_id : ir_utils::filterByType(expr->outputs())) { - auto concreteized_out_id = - IterDomainBuilder(out_id).iter_type(iter_type).build(); - registerMutation(out_id, concreteized_out_id); + out_id->setIterType(iter_type); } // Outputs are mutated. The expr itself needs to be mutated as @@ -544,16 +531,13 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( def->toString()); TORCH_INTERNAL_ASSERT( - id_type != IterType::Symbolic, + id_type.value() != IterType::Symbolic, "Failed to concretize ", root_id->toString(), " of ", consumer->toString()); - auto concretized_id = - IterDomainBuilder(root_id).iter_type(*id_type).build(); - - registerMutation(root_id, concretized_id); + root_id->setIterType(id_type.value()); is_concretized = true; } diff --git a/csrc/ir_internal_base_nodes.h b/csrc/ir_internal_base_nodes.h index edfd5cf465c..2e7834c87bf 100644 --- a/csrc/ir_internal_base_nodes.h +++ b/csrc/ir_internal_base_nodes.h @@ -28,6 +28,9 @@ class Scope; class IrCloner; struct AnalyzeViewResult; +// Friends for modifying IterDomains in place +class DynamicTransformConcretizer; + // Convenience utility to initialize IterDomain's without having to sort through // all the default values. Intended to be used with // IterDomain::IterDomain(IrBuilderPasskey IterDomainBuildArgs) @@ -152,12 +155,18 @@ class TORCH_CUDA_CU_API IterDomain : public Val { //! is marked as an rfactor domain. For example, expressions such as //! 
PadOp and SliceOp resize IterDomains and generate rfactor //! resized domains. + //! + //! Note that this operation might result in outputs with IterDomains having + //! IterType::Symbolic iteration type. This is because unless the expansion + //! arguments are known at compile time, we cannot infer that the output + //! extent is not 1, in which case we would set the iter_type to Broadcast. In + //! such a case, the proper iter_type will be set during concretization + //! (inside FusionExecutorCache::runFusionWithInputs) using setIterType. static IterDomain* resize( IterDomain* in, Val* left_expansion, Val* right_expansion, - bool mark_as_rfactor = false, - std::optional iter_type = std::nullopt); + bool mark_as_rfactor = false); bool isReduction() const { return getIterType() == IterType::Reduction; @@ -362,6 +371,12 @@ class TORCH_CUDA_CU_API IterDomain : public Val { friend TensorDomain; friend ReplayTransformations; friend IndexReferenceReplay; + friend DynamicTransformConcretizer; + + //! Set the iter_type + void setIterType(IterType iter_type) { + iter_type_ = iter_type; + } private: //! Valid range is defined as [start:-stop_offset] diff --git a/csrc/ir_nodes.cpp b/csrc/ir_nodes.cpp index cbaaaa4d034..5914e96444f 100644 --- a/csrc/ir_nodes.cpp +++ b/csrc/ir_nodes.cpp @@ -2460,8 +2460,7 @@ IterDomain* IterDomain::resize( IterDomain* in, Val* left_expansion, Val* right_expansion, - bool mark_as_rfactor, - std::optional iter_type_opt) { + bool mark_as_rfactor) { TORCH_CHECK( left_expansion->isIntegralScalar(), "Expansion factor must be an integer scalar: ", @@ -2504,13 +2503,10 @@ IterDomain* IterDomain::resize( right_expansion); } - // If output IterType is provided, use it. Otherwise, if we can prove the - // resized extent is 1, set to Broadcast, if we can prove it is >1 set to - // Iteration, and otherwise fall back to Symbolic. + // If we can prove the resized extent is 1, set to Broadcast. If we can prove + // it is >1 set to Iteration. 
Otherwise fall back to Symbolic. IterType iter_type = IterType::Symbolic; - if (iter_type_opt.has_value()) { - iter_type = iter_type_opt.value(); - } else if (left_expansion->isConstInt() && right_expansion->isConstInt()) { + if (left_expansion->isConstInt() && right_expansion->isConstInt()) { if (resized_id_size->isConstInt()) { // Means input extent is also known auto out_extent = resized_id_size->evaluateInt(); From d40fcfeca9562a45c4346da029836a4f1457c2ed Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 15 May 2023 21:08:35 -0400 Subject: [PATCH 42/49] Remove erroneous TODO comment in DynamicPadShmoo_CUDA --- test/test_dynamic_transform.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 97c314c9ebf..2870457f91f 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -947,9 +947,7 @@ TEST_F(NVFuserTest, DynamicPadShmoo_CUDA) { //{{3, 5}, {0, -4}, true}, // Test full negative shifts, so output doesn't overlap input - {{3, 5}, - {-5, 2}, - false}, // TODO: why doesn't this miss due to concretize to broadcast? + {{3, 5}, {-5, 2}, false}, {{3, 5}, {2, -5}, false}, // full shift the other direction, re-use // The following reuses the schedule of {3, 5} inputs, and does not set From 65656a68cf0c82288ee41cbe014c66f438310aef Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 16 May 2023 08:00:46 -0400 Subject: [PATCH 43/49] Revert "Concretize IterDomains in-place instead of building" This reverts commit 15f0cd8dfacfb5ae9a93dfb2ca7cdccb46431df4. 
--- csrc/dynamic_transform.cpp | 26 +++++++++++++++++++++----- csrc/ir_internal_base_nodes.h | 19 ++----------------- csrc/ir_nodes.cpp | 12 ++++++++---- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 6619f592227..81f717ceaea 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -322,9 +322,20 @@ void DynamicTransformConcretizer::concretizeReshape() { } void DynamicTransformConcretizer::concretizeResize() { - // Concretize each resize op's output IterType. + // Concretize each resize op. for (const auto& [id, iter_type] : info_.getResizeTransforms()) { - id->setIterType(iter_type); + TORCH_CHECK( + id->definition() && id->definition()->isA(), + "Resized IterDomain must have a Resize definition"); + auto def = id->definition()->as(); + auto new_id = IterDomain::resize( + def->in(), + def->leftExpand(), + def->rightExpand(), + id->isRFactorProduct(), + iter_type); + + registerMutation(id, new_id); } } @@ -405,7 +416,9 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { // Update the IterType of each output for (auto out_id : ir_utils::filterByType(expr->outputs())) { - out_id->setIterType(iter_type); + auto concreteized_out_id = + IterDomainBuilder(out_id).iter_type(iter_type).build(); + registerMutation(out_id, concreteized_out_id); } // Outputs are mutated. 
The expr itself needs to be mutated as @@ -531,13 +544,16 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( def->toString()); TORCH_INTERNAL_ASSERT( - id_type.value() != IterType::Symbolic, + id_type != IterType::Symbolic, "Failed to concretize ", root_id->toString(), " of ", consumer->toString()); - root_id->setIterType(id_type.value()); + auto concretized_id = + IterDomainBuilder(root_id).iter_type(*id_type).build(); + + registerMutation(root_id, concretized_id); is_concretized = true; } diff --git a/csrc/ir_internal_base_nodes.h b/csrc/ir_internal_base_nodes.h index 2e7834c87bf..edfd5cf465c 100644 --- a/csrc/ir_internal_base_nodes.h +++ b/csrc/ir_internal_base_nodes.h @@ -28,9 +28,6 @@ class Scope; class IrCloner; struct AnalyzeViewResult; -// Friends for modifying IterDomains in place -class DynamicTransformConcretizer; - // Convenience utility to initialize IterDomain's without having to sort through // all the default values. Intended to be used with // IterDomain::IterDomain(IrBuilderPasskey IterDomainBuildArgs) @@ -155,18 +152,12 @@ class TORCH_CUDA_CU_API IterDomain : public Val { //! is marked as an rfactor domain. For example, expressions such as //! PadOp and SliceOp resize IterDomains and generate rfactor //! resized domains. - //! - //! Note that this operation might result in outputs with IterDomains having - //! IterType::Symbolic iteration type. This is because unless the expansion - //! arguments are known at compile time, we cannot infer that the output - //! extent is not 1, in which case we would set the iter_type to Broadcast. In - //! such a case, the proper iter_type will be set during concretization - //! (inside FusionExecutorCache::runFusionWithInputs) using setIterType. 
static IterDomain* resize( IterDomain* in, Val* left_expansion, Val* right_expansion, - bool mark_as_rfactor = false); + bool mark_as_rfactor = false, + std::optional iter_type = std::nullopt); bool isReduction() const { return getIterType() == IterType::Reduction; @@ -371,12 +362,6 @@ class TORCH_CUDA_CU_API IterDomain : public Val { friend TensorDomain; friend ReplayTransformations; friend IndexReferenceReplay; - friend DynamicTransformConcretizer; - - //! Set the iter_type - void setIterType(IterType iter_type) { - iter_type_ = iter_type; - } private: //! Valid range is defined as [start:-stop_offset] diff --git a/csrc/ir_nodes.cpp b/csrc/ir_nodes.cpp index 5914e96444f..cbaaaa4d034 100644 --- a/csrc/ir_nodes.cpp +++ b/csrc/ir_nodes.cpp @@ -2460,7 +2460,8 @@ IterDomain* IterDomain::resize( IterDomain* in, Val* left_expansion, Val* right_expansion, - bool mark_as_rfactor) { + bool mark_as_rfactor, + std::optional iter_type_opt) { TORCH_CHECK( left_expansion->isIntegralScalar(), "Expansion factor must be an integer scalar: ", @@ -2503,10 +2504,13 @@ IterDomain* IterDomain::resize( right_expansion); } - // If we can prove the resized extent is 1, set to Broadcast. If we can prove - // it is >1 set to Iteration. Otherwise fall back to Symbolic. + // If output IterType is provided, use it. Otherwise, if we can prove the + // resized extent is 1, set to Broadcast, if we can prove it is >1 set to + // Iteration, and otherwise fall back to Symbolic. 
IterType iter_type = IterType::Symbolic; - if (left_expansion->isConstInt() && right_expansion->isConstInt()) { + if (iter_type_opt.has_value()) { + iter_type = iter_type_opt.value(); + } else if (left_expansion->isConstInt() && right_expansion->isConstInt()) { if (resized_id_size->isConstInt()) { // Means input extent is also known auto out_extent = resized_id_size->evaluateInt(); From c86e048b8ac78d65c925974acdaffaeae30ef86c Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 16 May 2023 08:05:41 -0400 Subject: [PATCH 44/49] Remove resizeOutputIterType This was a hold-over from when I was doing more complicated replacements. Since we now just change the IterType to Broadcast or Iteration during concretization, there is no additional complexity to consider, so this function is not needed. --- csrc/dynamic_transform.cpp | 9 +++------ csrc/ir_utils.h | 23 ----------------------- 2 files changed, 3 insertions(+), 29 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 81f717ceaea..7aa664d50eb 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -156,12 +156,9 @@ void DynamicTransformInfoBuilder::handle(TensorView* tv) { "Cannot evaluate the right expansion of an IterDomain resize: ", right->toString()); - auto out_itertype = ir_utils::resizeOutputIterType( - in_extent_val->as(), - out_extent_val->as(), - left_val->as(), - right_val->as()); - + auto out_itertype = out_extent_val->as() == 1 + ? IterType::Broadcast + : IterType::Iteration; info_.resize_transforms_.emplace_back(id, out_itertype); } } diff --git a/csrc/ir_utils.h b/csrc/ir_utils.h index 695aa3828ab..0ef61991df2 100644 --- a/csrc/ir_utils.h +++ b/csrc/ir_utils.h @@ -429,28 +429,5 @@ void validateDomainEquivalence( const std::vector& initial_domain, const std::vector& derived_domain); -//! Compute the IterType of an IterDomain that has been resized. If the output -//! is size 1, or the output uses no input elements, this function returns -//! 
Broadcast. Otherwise, it returns Iteration. -inline IterType resizeOutputIterType( - int64_t in_extent, - int64_t out_extent, - int64_t left, - int64_t right) { - TORCH_CHECK(out_extent >= 0, "Resized extent must be non-negative."); - if ( - // negative padding sums to input extent. Output is zero-dimensional - out_extent == 0 || - // input overlaps output - left + in_extent > 0 || right + in_extent > 0) { - return IterType::Iteration; - } else { - // Result is size-1 or input doesn't overlap output. - // In these cases, the output is just a broadcast of either the used input - // value, or the pad value. - return IterType::Broadcast; - } -} - } // namespace ir_utils } // namespace nvfuser From 5885c80790f3f7743a0668a05e4f4b7ad143bb3b Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 16 May 2023 08:13:58 -0400 Subject: [PATCH 45/49] Expand comment on resize(), explain iter_type arg --- csrc/ir_internal_base_nodes.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/csrc/ir_internal_base_nodes.h b/csrc/ir_internal_base_nodes.h index edfd5cf465c..dbd29fc7c15 100644 --- a/csrc/ir_internal_base_nodes.h +++ b/csrc/ir_internal_base_nodes.h @@ -152,6 +152,19 @@ class TORCH_CUDA_CU_API IterDomain : public Val { //! is marked as an rfactor domain. For example, expressions such as //! PadOp and SliceOp resize IterDomains and generate rfactor //! resized domains. + //! + //! Usually, the IterType of the output IterDomain will be Symbolic. This is + //! because unless the left and right expansions are known at Fusion + //! definition we cannot be sure that the output will have an extent != 1. In + //! case the output extent is in fact 1, we will set the IterType to + //! Broadcast. If the left and right expansions are constant, and sum to at + //! least two, then even an empty input will result in an Iteration IterType. + //! In these cases, we will set the output IterType to Iteration at + //! definition. 
Otherwise, it will be set to Symbolic and will be resolved + //! when concretization is performed by FusionExecutorCache. + //! + //! The optional iter_type argument can be used to force the output IterType, + //! but for safety its use should typically be confined to concretization. static IterDomain* resize( IterDomain* in, Val* left_expansion, From dc2c28b12bf8e59ccf2263e9663922c71a313436 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 16 May 2023 08:33:08 -0400 Subject: [PATCH 46/49] Point to #264 and #346 in DynamicPadShmoo test --- test/test_dynamic_transform.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 2870457f91f..b9d83c1e5ba 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -937,13 +937,15 @@ TEST_F(NVFuserTest, DynamicPadShmoo_CUDA) { {{3, 5}, {2, 1}, false}, // simple pad of both sides {{3, 5}, {-1, 1}, false}, // shift by one - // TODO: The following fails with a SIGFPE in innerReductionHeuristic + // The following fails with a SIGFPE in innerReductionHeuristic + // See https://github.com/NVIDIA/Fuser/issues/264 //{{3, 5}, {-3, -2}, false}, // output is zero-dimensional // Output has size 1 so is set to broadcast // Currently fails since: IterDomain cannot be both a broadcast and // rfactor domain. 
Exception raised from IterDomain at // /opt/pytorch/nvfuser/csrc/ir_nodes.cpp:2080 + // See https://github.com/NVIDIA/Fuser/issues/346 //{{3, 5}, {0, -4}, true}, // Test full negative shifts, so output doesn't overlap input @@ -955,9 +957,9 @@ TEST_F(NVFuserTest, DynamicPadShmoo_CUDA) { {{3, 1}, {1, 1}, false}, // Test zero-dimensional input - //{{3, 0}, {0, 0}, false}, // TODO: SIGFPE (see above) + //{{3, 0}, {0, 0}, false}, // SIGFPE (see #264 above) {{3, 0}, {1, 1}, false}, - //{{3, 0}, {-1, 1}, false}, // TODO: SIGFPE (see above) + //{{3, 0}, {-1, 1}, false}, // SIGFPE (see #264 above) }; // NOLINTEND(bugprone-implicit-widening-of-multiplication-result) reductionDynamicPadAddFusion(invocations); From 25819582a0cb2c8ee88be75cf89d767bab8216b6 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 16 May 2023 08:39:15 -0400 Subject: [PATCH 47/49] Minor clean up --- csrc/dynamic_transform.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 7aa664d50eb..7e2d02565f1 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -413,13 +413,13 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { // Update the IterType of each output for (auto out_id : ir_utils::filterByType(expr->outputs())) { - auto concreteized_out_id = + auto concretized_out_id = IterDomainBuilder(out_id).iter_type(iter_type).build(); - registerMutation(out_id, concreteized_out_id); + registerMutation(out_id, concretized_out_id); } - // Outputs are mutated. 
The expr itself needs to be mutated as - // well, which can be done by the mutate method + // The expr itself needs to be mutated as well in case the outputs are + // mutated, which can be done by the mutate method OptOutMutator::mutate(expr); } } From 152a0a8fdbe789f80e615af00be520a15f294668 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 16 May 2023 15:40:25 -0400 Subject: [PATCH 48/49] Fix DynamicTransformConcretizationInfo::operator== Previously it did not compare resize_transforms_, leading to erroneous cache hits! --- csrc/dynamic_transform.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 7e2d02565f1..9c6d9c3b914 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -80,6 +80,14 @@ bool DynamicTransformConcretizationInfo::operator==( } } + for (const auto i : c10::irange(resize_transforms_.size())) { + const auto& transform = resize_transforms_.at(i); + const auto& other_transform = other.resize_transforms_.at(i); + if (transform != other_transform) { + return false; + } + } + return true; } From 4f9b4ba5dae51e0eeb05ba42764174af64e21a3a Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 16 May 2023 15:41:06 -0400 Subject: [PATCH 49/49] Remove rfactor&&broadcast check, uncomment failing test. --- csrc/ir_nodes.cpp | 9 ++++++--- test/test_dynamic_transform.cpp | 17 +++-------------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/csrc/ir_nodes.cpp b/csrc/ir_nodes.cpp index cbaaaa4d034..4c4634841a4 100644 --- a/csrc/ir_nodes.cpp +++ b/csrc/ir_nodes.cpp @@ -2102,9 +2102,12 @@ IterDomain::IterDomain( is_padded_dimension_(is_padded_dimension), padded_to_size_(padded_to_size), is_mma_swizzled_(is_mma_swizzled) { - TORCH_CHECK( - !(isRFactorProduct() && isBroadcast()), - "IterDomain cannot be both a broadcast and rfactor domain."); + // NOTE: We previously asserted !(isRFactorProduct() && isBroadcast()), i.e. 
+ // that an IterDomain could not be both a broadcast and an rfactor domain. + // However, since the introduction of the resize op, we now have a legitimate + // case where this may be true; namely, whenever we resize an IterDomain to + // size 1, we will mark it as Broadcast, but the resize must lie between root + // and rfactor. TORCH_INTERNAL_ASSERT( extent->isIntegralScalar(), diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index b9d83c1e5ba..50566371655 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -883,14 +883,7 @@ void reductionDynamicPadAddFusion( // Return pair of: number of concretizations & total number of kernel runtimes auto countConcretizations = [&fusion_executor_cache]() { - std::unordered_set>*> - concs; - for (auto& it : fusion_executor_cache.getKernelRuntimes()) { - concs.insert(&it.first); - } - return concs.size(); + return fusion_executor_cache.getKernelRuntimes().size(); }; size_t num_concretizations = countConcretizations(); // Check that concretizations and runtimes are cache misses only when they @@ -941,12 +934,8 @@ TEST_F(NVFuserTest, DynamicPadShmoo_CUDA) { // See https://github.com/NVIDIA/Fuser/issues/264 //{{3, 5}, {-3, -2}, false}, // output is zero-dimensional - // Output has size 1 so is set to broadcast - // Currently fails since: IterDomain cannot be both a broadcast and - // rfactor domain. Exception raised from IterDomain at - // /opt/pytorch/nvfuser/csrc/ir_nodes.cpp:2080 - // See https://github.com/NVIDIA/Fuser/issues/346 - //{{3, 5}, {0, -4}, true}, + // Output has size 1 so is set to broadcast. + {{3, 5}, {0, -4}, true}, // Test full negative shifts, so output doesn't overlap input {{3, 5}, {-5, 2}, false},