From 757f685494dcbf0d468e812fad0621f6b55d3481 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Tue, 30 May 2023 10:56:52 -0400
Subject: [PATCH 01/22] Add C++ repro for #418

---
 test/test_dynamic_transform.cpp | 47 +++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp
index dc0201d0833..1b9003f5c3b 100644
--- a/test/test_dynamic_transform.cpp
+++ b/test/test_dynamic_transform.cpp
@@ -1001,4 +1001,51 @@ TEST_F(NVFuserTest, DynamicPadShmoo_CUDA) {
   reductionDynamicPadAddFusion(invocations);
 }
 
+// Repro of https://github.com/NVIDIA/Fuser/issues/418
+TEST_F(NVFuserTest, DynamicTransformIssue418_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(4);
+  fusion->addInput(tv0);
+  auto s0 = IrBuilder::create<Int>();
+  fusion->addInput(s0);
+
+  auto v00 = tv0->axis(0)->extent();
+  auto v01 = tv0->axis(1)->extent();
+  auto v02 = tv0->axis(2)->extent();
+  auto v03 = tv0->axis(3)->extent();
+
+  auto tv1 = reshape(tv0, {v00, div(v01, s0), s0, v02, v03});
+  auto vm = variance_mean(tv1, {2, 3, 4}, 0, true);
+  fusion->addOutput(vm.mean);
+  fusion->addOutput(vm.var);
+
+  // tv1 has symbolic axes as reshape is dynamic
+  TORCH_CHECK(
+      tv1->domain()->hasSymbolicAxis(),
+      "Expected to have symbolic axes: ",
+      tv1->toString());
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  { // trivial reshape
+    auto t0 = at::randn({256, 128, 28, 28}, options);
+    std::vector<c10::IValue> inputs = {t0, 32};
+    auto cg_outputs = executor_cache.runFusionWithInputs(inputs);
+    auto t0_resh = t0.reshape({256, 4, 32, 28, 28});
+    auto mu = t0_resh.mean({2, 3, 4});
+    auto v = t0_resh.var({2, 3, 4});
+    testValidate(
+        executor_cache.fusion(),
+        cg_outputs,
+        inputs,
+        {mu, v},
+        __LINE__,
+        __FILE__);
+  }
+}
+
 } // namespace nvfuser

From 03ac754e40d711c781d6daa249165a87b9f8fafb Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Tue, 30 May 2023 10:58:22 -0400
Subject: [PATCH 02/22] Process all outputs of each Expr in concretization

---
 csrc/dynamic_transform.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp
index f838e07a1aa..a31ca3d9edd 100644
--- a/csrc/dynamic_transform.cpp
+++ b/csrc/dynamic_transform.cpp
@@ -404,8 +404,15 @@ void DynamicTransformConcretizer::concretize() {
   // Finally, propagate concretized domains
   auto all_stmts = StmtSort::getStmts(info_.fusion(), true);
   for (auto stmt : all_stmts) {
-    if (stmt->isA<Val>()) {
+    if (stmt->isA<Val>() && !stmt->asVal()->definition()) {
+      // vals with definitions will be processed as exprs along with _all_
+      // outputs
       mutate(stmt);
+    } else if (stmt->isA<Expr>()) {
+      // concretize all outputs of each expression that will be evaluated
+      for (auto o : stmt->as<Expr>()->outputs()) {
+        mutate(o);
+      }
     }
   }
 }

From 36c70c5bd62f3388b51c5b8c4a13440a207a8a3f Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 31 May 2023 19:31:02 -0400
Subject: [PATCH 03/22] Add fusion_ir_dynamic debug dump option

---
 csrc/kernel_cache.cpp | 5 +++++
 csrc/utils.cpp        | 1 +
 csrc/utils.h          | 1 +
 3 files changed, 7 insertions(+)

diff --git a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp
index 6cc6dc37ed3..9abbd35ed7b 100644
--- a/csrc/kernel_cache.cpp
+++ b/csrc/kernel_cache.cpp
@@ -554,6 +554,11 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor(
     }
   }
 
+  if (isDebugDumpEnabled(DebugDumpOption::FusionIrDynamic)) {
+    std::cout << "Dynamic Fusion (before concretization):" << std::endl;
+    fusion()->printMath();
+  }
+
   // Compute or get cached initial concretization info
   auto& initial_info = initialInfo();
 
diff --git a/csrc/utils.cpp b/csrc/utils.cpp
index 644f9337196..4c35f247cd7 100644
--- a/csrc/utils.cpp
+++ b/csrc/utils.cpp
@@ -109,6 +109,7 @@ auto parseDebugDumpOptions() {
       {"fusion_ir", DebugDumpOption::FusionIr},
       {"fusion_ir_math", DebugDumpOption::FusionIrMath},
       {"fusion_ir_presched", DebugDumpOption::FusionIrPresched},
+      {"fusion_ir_dynamic", DebugDumpOption::FusionIrDynamic},
       {"fusion_ir_concretized", DebugDumpOption::FusionIrConcretized},
       {"kernel_ir", DebugDumpOption::KernelIr},
       {"ca_map", DebugDumpOption::ComputeAtMap},
diff --git a/csrc/utils.h b/csrc/utils.h
index e5a0171f9f0..83362f51acc 100644
--- a/csrc/utils.h
+++ b/csrc/utils.h
@@ -55,6 +55,7 @@ enum class DebugDumpOption {
   FusionIr, //!< Dump the Fusion IR before lowering
   FusionIrMath, //!< Dump just the compute (math) part of the Fusion IR
   FusionIrPresched, //!< Dump the Fusion IR before it is scheduled.
+  FusionIrDynamic, //!< Dump the Fusion IR before concretization
   FusionIrConcretized, //!< Dump the Fusion IR after concretization
   KernelIr, //!< Dump the compiler Kernel IR
   ComputeAtMap, //!< Dump the computeAt map

From 9727388594eea9371422faa0899a113156e4e89e Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 31 May 2023 20:09:52 -0400
Subject: [PATCH 04/22] Propagate extent from P2C during concretization

---
 csrc/dynamic_transform.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp
index a31ca3d9edd..b6d3dc14aad 100644
--- a/csrc/dynamic_transform.cpp
+++ b/csrc/dynamic_transform.cpp
@@ -645,6 +645,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
     // corresponding producer IDs
 
     std::optional<IterType> id_type;
+    Val* extent = nullptr;
 
     for (auto producer : ir_utils::filterByType<TensorView>(def->inputs())) {
       PairwiseRootDomainMap root_map(producer, consumer);
@@ -667,6 +668,11 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
       } else {
         id_type = input_id->getIterType();
       }
+
+      // Set extent expression based on producer, overwriting that of consumer
+      if (!extent) {
+        extent = input_id->extent();
+      }
     }
 
     TORCH_INTERNAL_ASSERT(
@@ -684,7 +690,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
         consumer->toString());
 
     auto concretized_id =
-        IterDomainBuilder(root_id).iter_type(*id_type).build();
+        IterDomainBuilder(root_id).extent(extent).iter_type(*id_type).build();
 
     registerConcretization(root_id, concretized_id);
     is_concretized = true;

From 031f327f90136160a8faf226d33092fb53685939 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Thu, 1 Jun 2023 07:51:34 -0400
Subject: [PATCH 05/22] Revert "Propagate extent from P2C during
 concretization"

This reverts commit 9727388594eea9371422faa0899a113156e4e89e.
---
 csrc/dynamic_transform.cpp | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp
index b6d3dc14aad..a31ca3d9edd 100644
--- a/csrc/dynamic_transform.cpp
+++ b/csrc/dynamic_transform.cpp
@@ -645,7 +645,6 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
     // corresponding producer IDs
 
     std::optional<IterType> id_type;
-    Val* extent = nullptr;
 
     for (auto producer : ir_utils::filterByType<TensorView>(def->inputs())) {
       PairwiseRootDomainMap root_map(producer, consumer);
@@ -668,11 +667,6 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
       } else {
         id_type = input_id->getIterType();
       }
-
-      // Set extent expression based on producer, overwriting that of consumer
-      if (!extent) {
-        extent = input_id->extent();
-      }
     }
 
     TORCH_INTERNAL_ASSERT(
@@ -690,7 +684,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
         consumer->toString());
 
     auto concretized_id =
-        IterDomainBuilder(root_id).extent(extent).iter_type(*id_type).build();
+        IterDomainBuilder(root_id).iter_type(*id_type).build();
 
     registerConcretization(root_id, concretized_id);
     is_concretized = true;

From 3724a004657183a657228614ea02156ba6fc4900 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Thu, 1 Jun 2023 11:28:57 -0400
Subject: [PATCH 06/22] Match concretized reshape extents to desired extents.

This is still failing due to segmentation not being able to infer output
shapes.
---
 csrc/dynamic_transform.cpp    | 198 +++++++++++++++++++++++++++++++---
 csrc/ir/internal_base_nodes.h |   3 +-
 csrc/ir/nodes.cpp             |  18 ++--
 csrc/kernel_cache.cpp         |   2 +
 4 files changed, 200 insertions(+), 21 deletions(-)

diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp
index a31ca3d9edd..1db9ab0ea82 100644
--- a/csrc/dynamic_transform.cpp
+++ b/csrc/dynamic_transform.cpp
@@ -11,6 +11,7 @@
 #include <expr_evaluator.h>
 #include <ir/cloner.h>
 #include <ir/utils.h>
+#include <ops/arith.h>
 #include <ops/utils.h>
 #include <transform_iter.h>
 #include <transform_view.h>
@@ -417,6 +418,166 @@ void DynamicTransformConcretizer::concretize() {
   }
 }
 
+//! Given a concrete rfactor domain and a symbolic rfactor domain, create a new
+//! rfactor domain with the same history as the concrete one, but with the
+//! extent expressions of the symbolic IterDomains.
+std::vector<IterDomain*> matchReshapeRFactorExtents(
+    const std::vector<IterDomain*>& old_rfactor,
+    const std::vector<IterDomain*>& desired_extents_rfactor) {
+  TORCH_INTERNAL_ASSERT(
+      old_rfactor.size() == desired_extents_rfactor.size(),
+      "New RFactor length does not match symbolic RFactor length: found ",
+      old_rfactor.size(),
+      " but expected ",
+      desired_extents_rfactor.size());
+
+  std::cout << "desired_extents_rfactor:" << std::endl;
+  for (auto id : desired_extents_rfactor) {
+    std::cout << id->toString() << " has extent " << id->extent()->toString()
+              << std::endl;
+  }
+  std::cout << std::endl;
+
+  std::vector<IterDomain*> new_rfactor(old_rfactor.size(), nullptr);
+
+  // Note: we do not use c10::irange here so that we can advance i if we see a
+  // Split
+  for (size_t i = 0; i < old_rfactor.size(); i++) {
+    auto old_id = old_rfactor.at(i);
+    // create new Split or Merge expression
+    auto def = old_id->definition();
+    if (def) {
+      if (def->isA<Split>()) {
+        auto spl = def->as<Split>();
+        // For Split, we will replace both this rfactor ID and the next at once
+        // Note: I think this is safe since we don't have any broadcasts at this
+        // point.
+        auto desired_outer_extent = desired_extents_rfactor.at(i)->extent();
+        auto new_outer =
+            IterDomainBuilder(old_id).extent(desired_outer_extent).build();
+        std::cout << "desired_extent = " << desired_outer_extent->toString()
+                  << std::endl;
+        new_rfactor.at(i) = new_outer;
+        i++;
+        TORCH_INTERNAL_ASSERT(
+            i < old_rfactor.size(), "Found outer split output without inner");
+        auto desired_inner_extent = desired_extents_rfactor.at(i)->extent();
+        std::cout << "desired_inner_extent = "
+                  << desired_inner_extent->toString() << std::endl;
+        auto old_inner = old_rfactor.at(i);
+        auto new_inner =
+            IterDomainBuilder(old_inner).extent(desired_inner_extent).build();
+        new_rfactor.at(i) = new_inner;
+        std::cout << "new_outer = " << new_outer->toString()
+                  << "  new_inner = " << new_inner->toString() << std::endl;
+        IrBuilder::create<Split>(
+            new_outer,
+            new_inner,
+            spl->in(),
+            spl->factor(),
+            spl->innerSplit(),
+            spl->startOffset(),
+            spl->stopOffset());
+        // If we do not remove the original expression, there will be multiple
+        // uses of the same root ID, which will cause an error in
+        // BestEffortReplay. So we remove the old expression here.
+        spl->fusion()->removeExpr(spl);
+      } else if (def->isA<Merge>()) {
+        auto m = def->as<Merge>();
+        auto desired_extent = desired_extents_rfactor.at(i)->extent();
+        auto new_id = IterDomainBuilder(old_id).extent(desired_extent).build();
+        new_rfactor.at(i) = new_id;
+        IrBuilder::create<Merge>(new_id, m->outer(), m->inner());
+        m->fusion()->removeExpr(m); // see above
+      } else {
+        TORCH_INTERNAL_ASSERT(
+            false,
+            "Unhandled definition for reshape rfactor ID ",
+            old_id->toString(),
+            ". Definition is ",
+            def->toString());
+      }
+    } else {
+      new_rfactor.at(i) = old_id;
+    }
+  }
+  return new_rfactor;
+}
+
+//! Given a concretized reshape (concrete_tv) and its symbolic stand-in
+//! (symbolic_tv), generate a new TensorView having the same concretization as
+//! concrete_tv with the same final rfactor extents as symbolic_tv.
+//!
+//! Currently, reshape works by applying squeeze, split/merge, then broadcasts
+//! (in that order). Each of these three phases produces a new TensorView. We
+//! re-use the squeezed TV, then replace the applyViewTransforms TV and replay
+//! the broadcast.
+TensorView* matchSymbolicReshapeExtents(
+    TensorView* concrete_tv,
+    TensorView* symbolic_tv) {
+  TORCH_INTERNAL_ASSERT(
+      concrete_tv->getMaybeRFactorDomain().size() ==
+          symbolic_tv->getMaybeRFactorDomain().size(),
+      "Mismatch between concrete and symbolic rfactor domain sizes");
+  // We will skip axes in the symbolic rfactor domain matching broadcast axes
+  // in the concrete rfactor for now.
+  std::vector<IterDomain*> symbolic_rfactor_nobcast;
+  for (auto i : c10::irange(concrete_tv->getMaybeRFactorDomain().size())) {
+    if (concrete_tv->getMaybeRFactorDomain().at(i)->isBroadcast()) {
+      continue;
+    }
+    symbolic_rfactor_nobcast.push_back(
+        symbolic_tv->getMaybeRFactorDomain().at(i));
+  }
+
+  // First determine if there is a broadcast
+  auto def = concrete_tv->definition();
+  TORCH_INTERNAL_ASSERT(
+      def, "matchSymbolicReshapeExtents input has no definition.");
+  BroadcastOp* bcast = nullptr;
+  if (def->isA<BroadcastOp>()) {
+    // There were broadcasts. Move upstream but hold onto the op so we can
+    // replay it later
+    bcast = def->as<BroadcastOp>();
+    concrete_tv = bcast->in()->as<TensorView>();
+    def = concrete_tv->definition();
+    TORCH_INTERNAL_ASSERT(
+        def, "Reshape's internal broadcast has no definition");
+  }
+  TORCH_INTERNAL_ASSERT(
+      def->isA<ViewOp>(),
+      "matchSymbolicReshapeExtents expects a ViewOp output");
+
+  // We can now ignore output broadcast axes and just replace the rfactor
+  // extents in the ViewOp output
+  auto new_rfactor = matchReshapeRFactorExtents(
+      concrete_tv->getMaybeRFactorDomain(), symbolic_rfactor_nobcast);
+  // NOTE: we avoid the validate_domain_equivalence check when creating this
+  // TensorDomain since we cannot easily prove that the new rfactor will match
+  // the root. However, this should be guaranteed by analyzeView in this case.
+  auto new_td = IrBuilder::create<TensorDomain>(
+      concrete_tv->getRootDomain(),
+      new_rfactor,
+      new_rfactor,
+      concrete_tv->domain()->contiguity(),
+      /*validate_domain_equivalence*/ false);
+  auto new_tv = IrBuilder::create<TensorView>(
+      concrete_tv->container(), new_td, concrete_tv->getDataType().value());
+
+  // Create a new ViewOp linking the input to this new output
+  IrBuilder::create<ViewOp>(
+      concrete_tv->container(),
+      new_tv,
+      concrete_tv->definition()->input(0)->as<TensorView>());
+
+  // Replay broadcast
+  if (bcast) {
+    new_tv = broadcast(new_tv, bcast->getBroadcastDimFlags());
+  }
+
+  return new_tv;
+}
+
 void DynamicTransformConcretizer::concretizeReshape() {
   // Concretize each reshape op.
   for (const auto& kv : info_.getReshapeTransforms()) {
@@ -427,19 +588,29 @@ void DynamicTransformConcretizer::concretizeReshape() {
 
     auto concrete_reshape_out_tv = reshape(inp_tv, view_analysis);
 
+    // The above call to reshape uses static sizes and recomputes the output
+    // extents based on the decomposed Split and Merge operations. This means
+    // that it will produce a TensorView with different extent expressions
+    // (though equal for the given inputs) than what the user requested in the
+    // original symbolic reshape invocation. This can cause subtle issues as
+    // it introduces a mismatch with downstream extents; e.g.
+    // https://github.com/NVIDIA/Fuser/issues/418
+    // Here, we take the output above and replace the rfactor extents with the
+    // original ones in order to prevent such mismatches.
+    auto new_tv =
+        matchSymbolicReshapeExtents(concrete_reshape_out_tv, incomplete_out_tv);
+
     // We do the replacement directly here, but we must still check that the
     // replacement is valid
-    checkConcretizedUses(incomplete_out_tv, concrete_reshape_out_tv);
+    checkConcretizedUses(incomplete_out_tv, new_tv);
 
     // Replace the old tensor with the new concretized tensor
     for (auto use_of_old_tv : incomplete_out_tv->uses()) {
-      ir_utils::replaceValInExpr(
-          use_of_old_tv, incomplete_out_tv, concrete_reshape_out_tv);
+      ir_utils::replaceValInExpr(use_of_old_tv, incomplete_out_tv, new_tv);
     }
 
     if (incomplete_out_tv->isFusionOutput()) {
-      incomplete_out_tv->fusion()->replaceOutput(
-          incomplete_out_tv, concrete_reshape_out_tv);
+      incomplete_out_tv->fusion()->replaceOutput(incomplete_out_tv, new_tv);
     }
 
     incomplete_out_tv->fusion()->removeVal(incomplete_out_tv);
@@ -486,9 +657,9 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) {
   propagateFromProducerToConsumer(tv);
 
   // If no root domain is altered by producer, we don't need to propagate back
-  // up to rfactor. We could return early, but instead we go ahead and check the
-  // root to rfactor transforms to be sure we have concretized any intermediate
-  // IterDomains.
+  // up to rfactor. We could return early, but instead we go ahead and check
+  // the root to rfactor transforms to be sure we have concretized any
+  // intermediate IterDomains.
 
   // At this point, there should be no expr beyond rfactor root
   TORCH_INTERNAL_ASSERT(
@@ -525,16 +696,17 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) {
       // padded with constant pad widths (1, 1), in which case although we do
       // not know the exact extent of the output, we know it is at least as
       // large as the sum of the pad widths, 2. In such cases, the output
-      // IterDomain is concrete at definition, since if the extent is >1 we know
-      // the IterType is Iteration. In these cases, we must continue to
+      // IterDomain is concrete at definition, since if the extent is >1 we
+      // know the IterType is Iteration. In these cases, we must continue to
       // concretize intermediate expressions between the root and R-factor
       // domain. See test DynamicTransform5_CUDA which demonstrates this
       // behavior.
       // NOTE: We also do not assume that if one output ID is symbolic, that
       // they all must be. See test FusionSliceForNanoGPT3_CUDA for an example
-      // that does a static split by a factor of 16 of a symbolic input domain.
-      // The static split in that case results in a concrete IterDomain with
-      // extent 16 along with a symbolic one (extent ceilDiv(n / 16)).
+      // that does a static split by a factor of 16 of a symbolic input
+      // domain. The static split in that case results in a concrete
+      // IterDomain with extent 16 along with a symbolic one (extent ceilDiv(n
+      // / 16)).
 
       // Determine the output IterType
       IterType iter_type = IterType::Symbolic;
diff --git a/csrc/ir/internal_base_nodes.h b/csrc/ir/internal_base_nodes.h
index b13caaa9dba..4f4b1e7da10 100644
--- a/csrc/ir/internal_base_nodes.h
+++ b/csrc/ir/internal_base_nodes.h
@@ -444,7 +444,8 @@ class TORCH_CUDA_CU_API TensorDomain : public Val {
       std::vector<IterDomain*> root_domain,
       std::vector<IterDomain*> rfactor_domain,
       std::vector<IterDomain*> leaf_domain,
-      std::vector<std::optional<bool>> contiguity = {});
+      std::vector<std::optional<bool>> contiguity = {},
+      bool validate_domain_equivalence = true);
 
   TensorDomain(
       IrBuilderPasskey,
diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp
index 3fb3ebaca3e..802757b374e 100644
--- a/csrc/ir/nodes.cpp
+++ b/csrc/ir/nodes.cpp
@@ -2726,7 +2726,8 @@ TensorDomain::TensorDomain(
     std::vector<IterDomain*> root_domain,
     std::vector<IterDomain*> rfactor_domain,
     std::vector<IterDomain*> leaf_domain,
-    std::vector<std::optional<bool>> contiguity)
+    std::vector<std::optional<bool>> contiguity,
+    bool validate_domain_equivalence)
     : Val(passkey, ValType::TensorDomain, DataType::Null),
       root_domain_(std::move(root_domain)),
       rfactor_domain_(std::move(rfactor_domain)),
@@ -2736,12 +2737,15 @@ TensorDomain::TensorDomain(
                              : std::move(contiguity)) {
   validateContiguity(maybeAllocation(), contiguity_);
 
-  if (!root_domain_.empty()) {
-    TORCH_CHECK(!leaf_domain_.empty(), "Root domain is not empty but leaf is");
-    ir_utils::validateDomainEquivalence(root_domain_, leaf_domain_);
-    if (!rfactor_domain_.empty()) {
-      ir_utils::validateDomainEquivalence(root_domain_, rfactor_domain_);
-      ir_utils::validateDomainEquivalence(rfactor_domain_, leaf_domain_);
+  if (validate_domain_equivalence) {
+    if (!root_domain_.empty()) {
+      TORCH_CHECK(
+          !leaf_domain_.empty(), "Root domain is not empty but leaf is");
+      ir_utils::validateDomainEquivalence(root_domain_, leaf_domain_);
+      if (!rfactor_domain_.empty()) {
+        ir_utils::validateDomainEquivalence(root_domain_, rfactor_domain_);
+        ir_utils::validateDomainEquivalence(rfactor_domain_, leaf_domain_);
+      }
     }
   }
 
diff --git a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp
index 9abbd35ed7b..a9c7670e2a5 100644
--- a/csrc/kernel_cache.cpp
+++ b/csrc/kernel_cache.cpp
@@ -557,6 +557,7 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor(
   if (isDebugDumpEnabled(DebugDumpOption::FusionIrDynamic)) {
     std::cout << "Dynamic Fusion (before concretization):" << std::endl;
     fusion()->printMath();
+    fusion()->printTransforms();
   }
 
   // Compute or get cached initial concretization info
@@ -640,6 +641,7 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor(
     if (isDebugDumpEnabled(DebugDumpOption::FusionIrConcretized)) {
       std::cout << "Concretized Fusion:" << std::endl;
       fusion->printMath();
+      fusion->printTransforms();
     }
     kernel_runtimes.emplace_back(std::make_unique<FusionKernelRuntime>(
         std::move(fusion), args, forced_index_type));

From 72b29eefd737eac4805091dfaed048a088282aa9 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Thu, 1 Jun 2023 11:40:11 -0400
Subject: [PATCH 07/22] Remove debugging print statements

---
 csrc/dynamic_transform.cpp | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp
index 1db9ab0ea82..258f82c46ce 100644
--- a/csrc/dynamic_transform.cpp
+++ b/csrc/dynamic_transform.cpp
@@ -431,13 +431,6 @@ std::vector<IterDomain*> matchReshapeRFactorExtents(
       " but expected ",
       desired_extents_rfactor.size());
 
-  std::cout << "desired_extents_rfactor:" << std::endl;
-  for (auto id : desired_extents_rfactor) {
-    std::cout << id->toString() << " has extent " << id->extent()->toString()
-              << std::endl;
-  }
-  std::cout << std::endl;
-
   std::vector<IterDomain*> new_rfactor(old_rfactor.size(), nullptr);
 
   // Note: we do not use c10::irange here so that we can advance i if we see a
@@ -455,21 +448,15 @@ std::vector<IterDomain*> matchReshapeRFactorExtents(
         auto desired_outer_extent = desired_extents_rfactor.at(i)->extent();
         auto new_outer =
             IterDomainBuilder(old_id).extent(desired_outer_extent).build();
-        std::cout << "desired_extent = " << desired_outer_extent->toString()
-                  << std::endl;
         new_rfactor.at(i) = new_outer;
         i++;
         TORCH_INTERNAL_ASSERT(
             i < old_rfactor.size(), "Found outer split output without inner");
         auto desired_inner_extent = desired_extents_rfactor.at(i)->extent();
-        std::cout << "desired_inner_extent = "
-                  << desired_inner_extent->toString() << std::endl;
         auto old_inner = old_rfactor.at(i);
         auto new_inner =
             IterDomainBuilder(old_inner).extent(desired_inner_extent).build();
         new_rfactor.at(i) = new_inner;
-        std::cout << "new_outer = " << new_outer->toString()
-                  << "  new_inner = " << new_inner->toString() << std::endl;
         IrBuilder::create<Split>(
             new_outer,
             new_inner,

From 78ad7e262f6855cbd4e5ffddf0f8236c8d77d47c Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Thu, 1 Jun 2023 11:42:01 -0400
Subject: [PATCH 08/22] Add output extents as ViewOp inputs

---
 csrc/ir/nodes.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp
index 802757b374e..6ac62e46220 100644
--- a/csrc/ir/nodes.cpp
+++ b/csrc/ir/nodes.cpp
@@ -1963,6 +1963,14 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(ViewAsScalar)
 ViewOp::ViewOp(IrBuilderPasskey passkey, Val* out, Val* in) : Expr(passkey) {
   addOutput(out);
   addInput(in);
+  // Note: we also add the output extents as INPUTS here. This helps to ensure
+  // that we include necessary scalars during segmentation to determine outputs.
+  // Without this, the user may pass expressions for the desired shape that are
+  // not available in a given segment. See
+  // https://github.com/NVIDIA/Fuser/issues/418
+  for (auto id : out->as<TensorView>()->getMaybeRFactorDomain()) {
+    addInput(id->extent());
+  }
 }
 
 std::string ViewOp::toString(int indent_size) const {

From 788f500ab894848d54fecaf9c67fac1aa7fb25e0 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Thu, 1 Jun 2023 12:57:39 -0400
Subject: [PATCH 09/22] Remove concrete_reshape_out_tv after replacing

---
 csrc/dynamic_transform.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp
index 258f82c46ce..53a27a7b3f2 100644
--- a/csrc/dynamic_transform.cpp
+++ b/csrc/dynamic_transform.cpp
@@ -584,8 +584,14 @@ void DynamicTransformConcretizer::concretizeReshape() {
     // https://github.com/NVIDIA/Fuser/issues/418
     // Here, we take the output above and replace the rfactor extents with the
     // original ones in order to prevent such mismatches.
-    auto new_tv =
-        matchSymbolicReshapeExtents(concrete_reshape_out_tv, incomplete_out_tv);
+    TensorView* new_tv = concrete_reshape_out_tv;
+    if (!new_tv->sameAs(inp_tv)) {
+      new_tv = matchSymbolicReshapeExtents(new_tv, incomplete_out_tv);
+      // Remove the concrete TV after we've replaced its extents and generated a
+      // new TV
+      new_tv->fusion()->removeVal(concrete_reshape_out_tv);
+      concrete_reshape_out_tv = nullptr;
+    }
 
     // We do the replacement directly here, but we must still check that the
     // replacement is valid

From bf9aa1e5f6ccfa825d784085beb7715b1b7983d8 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Thu, 1 Jun 2023 12:58:00 -0400
Subject: [PATCH 10/22] Find expr outputs before calling StmtSort again

---
 csrc/dynamic_transform.cpp | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp
index 53a27a7b3f2..2f44759cd51 100644
--- a/csrc/dynamic_transform.cpp
+++ b/csrc/dynamic_transform.cpp
@@ -403,17 +403,36 @@ void DynamicTransformConcretizer::concretize() {
   concretizeResize();
 
   // Finally, propagate concretized domains
-  auto all_stmts = StmtSort::getStmts(info_.fusion(), true);
+
+  // We need to concretize all immediate outputs of all intermediate
+  // expressions; even those leading to dead code branches. To do this, we
+  // insert all outputs from all intermediate expressions to "leaves". If we
+  // don't insert anything new, we are done. Otherwise, we traverse again using
+  // these as outputs as well, which ensures the output is sorted.
+  auto leaves = info_.fusion()->getTerminatingOutputs();
+  auto leaves_set =
+      std::unordered_set<Statement*>(leaves.begin(), leaves.end());
+  std::vector<Statement*> all_stmts;
+  bool inserted = true;
+  while (inserted) {
+    all_stmts = StmtSort::getStmts(info_.fusion(), leaves, true);
+    inserted = false;
+    for (auto stmt : all_stmts) {
+      if (stmt->isExpr()) {
+        for (auto o : stmt->as<Expr>()->outputs()) {
+          if (leaves_set.find(o) == leaves_set.end()) {
+            leaves.push_back(o);
+            leaves_set.insert(o);
+            inserted = true;
+          }
+        }
+      }
+    }
+  }
+  // Concretize all vals in the final vector
   for (auto stmt : all_stmts) {
-    if (stmt->isA<Val>() && !stmt->asVal()->definition()) {
-      // vals with definitions will be processed as exprs along with _all_
-      // outputs
+    if (stmt->isVal()) {
       mutate(stmt);
-    } else if (stmt->isA<Expr>()) {
-      // concretize all outputs of each expression that will be evaluated
-      for (auto o : stmt->as<Expr>()->outputs()) {
-        mutate(o);
-      }
     }
   }
 }

From 4f05ad50aa3f1e3cfc8c03680d167481dfc1c44f Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Thu, 1 Jun 2023 15:48:05 -0400
Subject: [PATCH 11/22] Revert to d675161 but keep bf9aa1e5

---
 csrc/dynamic_transform.cpp    | 199 ++++------------------------------
 csrc/ir/internal_base_nodes.h |   3 +-
 csrc/ir/nodes.cpp             |  26 ++---
 csrc/kernel_cache.cpp         |   2 -
 4 files changed, 28 insertions(+), 202 deletions(-)

diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp
index 2f44759cd51..a34fac679e3 100644
--- a/csrc/dynamic_transform.cpp
+++ b/csrc/dynamic_transform.cpp
@@ -11,7 +11,6 @@
 #include <expr_evaluator.h>
 #include <ir/cloner.h>
 #include <ir/utils.h>
-#include <ops/arith.h>
 #include <ops/utils.h>
 #include <transform_iter.h>
 #include <transform_view.h>
@@ -437,153 +436,6 @@ void DynamicTransformConcretizer::concretize() {
   }
 }
 
-//! Given a concrete rfactor domain and a symbolic rfactor domain, create a new
-//! rfactor domain with the same history as the concrete one, but with the
-//! extent expressions of the symbolic IterDomains.
-std::vector<IterDomain*> matchReshapeRFactorExtents(
-    const std::vector<IterDomain*>& old_rfactor,
-    const std::vector<IterDomain*>& desired_extents_rfactor) {
-  TORCH_INTERNAL_ASSERT(
-      old_rfactor.size() == desired_extents_rfactor.size(),
-      "New RFactor length does not match symbolic RFactor length: found ",
-      old_rfactor.size(),
-      " but expected ",
-      desired_extents_rfactor.size());
-
-  std::vector<IterDomain*> new_rfactor(old_rfactor.size(), nullptr);
-
-  // Note: we do not use c10::irange here so that we can advance i if we see a
-  // Split
-  for (size_t i = 0; i < old_rfactor.size(); i++) {
-    auto old_id = old_rfactor.at(i);
-    // create new Split or Merge expression
-    auto def = old_id->definition();
-    if (def) {
-      if (def->isA<Split>()) {
-        auto spl = def->as<Split>();
-        // For Split, we will replace both this rfactor ID and the next at once
-        // Note: I think this is safe since we don't have any broadcasts at this
-        // point.
-        auto desired_outer_extent = desired_extents_rfactor.at(i)->extent();
-        auto new_outer =
-            IterDomainBuilder(old_id).extent(desired_outer_extent).build();
-        new_rfactor.at(i) = new_outer;
-        i++;
-        TORCH_INTERNAL_ASSERT(
-            i < old_rfactor.size(), "Found outer split output without inner");
-        auto desired_inner_extent = desired_extents_rfactor.at(i)->extent();
-        auto old_inner = old_rfactor.at(i);
-        auto new_inner =
-            IterDomainBuilder(old_inner).extent(desired_inner_extent).build();
-        new_rfactor.at(i) = new_inner;
-        IrBuilder::create<Split>(
-            new_outer,
-            new_inner,
-            spl->in(),
-            spl->factor(),
-            spl->innerSplit(),
-            spl->startOffset(),
-            spl->stopOffset());
-        // If we do not remove the original expression, there will be multiple
-        // uses of the same root ID, which will cause an error in
-        // BestEffortReplay. So we remove the old expression here.
-        spl->fusion()->removeExpr(spl);
-      } else if (def->isA<Merge>()) {
-        auto m = def->as<Merge>();
-        auto desired_extent = desired_extents_rfactor.at(i)->extent();
-        auto new_id = IterDomainBuilder(old_id).extent(desired_extent).build();
-        new_rfactor.at(i) = new_id;
-        IrBuilder::create<Merge>(new_id, m->outer(), m->inner());
-        m->fusion()->removeExpr(m); // see above
-      } else {
-        TORCH_INTERNAL_ASSERT(
-            false,
-            "Unhandled definition for reshape rfactor ID ",
-            old_id->toString(),
-            ". Definition is ",
-            def->toString());
-      }
-    } else {
-      new_rfactor.at(i) = old_id;
-    }
-  }
-  return new_rfactor;
-}
-
-//! Given a concretized reshape (concrete_tv) and its symbolic stand-in
-//! (symbolic_tv), generate a new TensorView having the same concretization as
-//! concrete_tv with the same final rfactor extents as symbolic_tv.
-//!
-//! Currently, reshape works by applying squeeze, split/merge, then broadcasts
-//! (in that order). Each of these three phases produces a new TensorView. We
-//! re-use the squeezed TV, then replace the applyViewTransforms TV and replay
-//! the broadcast.
-TensorView* matchSymbolicReshapeExtents(
-    TensorView* concrete_tv,
-    TensorView* symbolic_tv) {
-  TORCH_INTERNAL_ASSERT(
-      concrete_tv->getMaybeRFactorDomain().size() ==
-          symbolic_tv->getMaybeRFactorDomain().size(),
-      "Mismatch between concrete and symbolic rfactor domain sizes");
-  // We will skip axes in the symbolic rfactor domain matching broadcast axes
-  // in the concrete rfactor for now.
-  std::vector<IterDomain*> symbolic_rfactor_nobcast;
-  for (auto i : c10::irange(concrete_tv->getMaybeRFactorDomain().size())) {
-    if (concrete_tv->getMaybeRFactorDomain().at(i)->isBroadcast()) {
-      continue;
-    }
-    symbolic_rfactor_nobcast.push_back(
-        symbolic_tv->getMaybeRFactorDomain().at(i));
-  }
-
-  // First determine if there is a broadcast
-  auto def = concrete_tv->definition();
-  TORCH_INTERNAL_ASSERT(
-      def, "matchSymbolicReshapeExtents input has no definition.");
-  BroadcastOp* bcast = nullptr;
-  if (def->isA<BroadcastOp>()) {
-    // There were broadcasts. Move upstream but hold onto the op so we can
-    // replay it later
-    bcast = def->as<BroadcastOp>();
-    concrete_tv = bcast->in()->as<TensorView>();
-    def = concrete_tv->definition();
-    TORCH_INTERNAL_ASSERT(
-        def, "Reshape's internal broadcast has no definition");
-  }
-  TORCH_INTERNAL_ASSERT(
-      def->isA<ViewOp>(),
-      "matchSymbolicReshapeExtents expects a ViewOp output");
-
-  // We can now ignore output broadcast axes and just replace the rfactor
-  // extents in the ViewOp output
-  auto new_rfactor = matchReshapeRFactorExtents(
-      concrete_tv->getMaybeRFactorDomain(), symbolic_rfactor_nobcast);
-  // NOTE: we avoid the validate_domain_equivalence check when creating this
-  // TensorDomain since we cannot easily prove that the new rfactor will match
-  // the root. However, this should be guaranteed by analyzeView in this case.
-  auto new_td = IrBuilder::create<TensorDomain>(
-      concrete_tv->getRootDomain(),
-      new_rfactor,
-      new_rfactor,
-      concrete_tv->domain()->contiguity(),
-      /*validate_domain_equivalence*/ false);
-  auto new_tv = IrBuilder::create<TensorView>(
-      concrete_tv->container(), new_td, concrete_tv->getDataType().value());
-
-  // Create a new ViewOp linking the input to this new output
-  IrBuilder::create<ViewOp>(
-      concrete_tv->container(),
-      new_tv,
-      concrete_tv->definition()->input(0)->as<TensorView>());
-
-  // Replay broadcast
-  if (bcast) {
-    new_tv = broadcast(new_tv, bcast->getBroadcastDimFlags());
-  }
-
-  return new_tv;
-}
-
 void DynamicTransformConcretizer::concretizeReshape() {
   // Concretize each reshape op.
   for (const auto& kv : info_.getReshapeTransforms()) {
@@ -594,35 +446,19 @@ void DynamicTransformConcretizer::concretizeReshape() {
 
     auto concrete_reshape_out_tv = reshape(inp_tv, view_analysis);
 
-    // The above call to reshape uses static sizes and recomputes the output
-    // extents based on the decomposed Split and Merge operations. This means
-    // that it will produce a TensorView with different extent expressions
-    // (though equal for the given inputs) than what the user requested in the
-    // original symbolic reshape invocation. This can cause subtle issues as
-    // it introduces a mismatch with downstream extents; e.g.
-    // https://github.com/NVIDIA/Fuser/issues/418
-    // Here, we take the output above and replace the rfactor extents with the
-    // original ones in order to prevent such mismatches.
-    TensorView* new_tv = concrete_reshape_out_tv;
-    if (!new_tv->sameAs(inp_tv)) {
-      new_tv = matchSymbolicReshapeExtents(new_tv, incomplete_out_tv);
-      // Remove the concrete TV after we've replaced its extents and generated a
-      // new TV
-      new_tv->fusion()->removeVal(concrete_reshape_out_tv);
-      concrete_reshape_out_tv = nullptr;
-    }
-
     // We do the replacement directly here, but we must still check that the
     // replacement is valid
-    checkConcretizedUses(incomplete_out_tv, new_tv);
+    checkConcretizedUses(incomplete_out_tv, concrete_reshape_out_tv);
 
     // Replace the old tensor with the new concretized tensor
     for (auto use_of_old_tv : incomplete_out_tv->uses()) {
-      ir_utils::replaceValInExpr(use_of_old_tv, incomplete_out_tv, new_tv);
+      ir_utils::replaceValInExpr(
+          use_of_old_tv, incomplete_out_tv, concrete_reshape_out_tv);
     }
 
     if (incomplete_out_tv->isFusionOutput()) {
-      incomplete_out_tv->fusion()->replaceOutput(incomplete_out_tv, new_tv);
+      incomplete_out_tv->fusion()->replaceOutput(
+          incomplete_out_tv, concrete_reshape_out_tv);
     }
 
     incomplete_out_tv->fusion()->removeVal(incomplete_out_tv);
@@ -669,9 +505,9 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) {
   propagateFromProducerToConsumer(tv);
 
   // If no root domain is altered by producer, we don't need to propagate back
-  // up to rfactor. We could return early, but instead we go ahead and check
-  // the root to rfactor transforms to be sure we have concretized any
-  // intermediate IterDomains.
+  // up to rfactor. We could return early, but instead we go ahead and check the
+  // root to rfactor transforms to be sure we have concretized any intermediate
+  // IterDomains.
 
   // At this point, there should be no expr beyond rfactor root
   TORCH_INTERNAL_ASSERT(
@@ -708,17 +544,16 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) {
       // padded with constant pad widths (1, 1), in which case although we do
       // not know the exact extent of the output, we know it is at least as
       // large as the sum of the pad widths, 2. In such cases, the output
-      // IterDomain is concrete at definition, since if the extent is >1 we
-      // know the IterType is Iteration. In these cases, we must continue to
+      // IterDomain is concrete at definition, since if the extent is >1 we know
+      // the IterType is Iteration. In these cases, we must continue to
       // concretize intermediate expressions between the root and R-factor
       // domain. See test DynamicTransform5_CUDA which demonstrates this
       // behavior.
       // NOTE: We also do not assume that if one output ID is symbolic, that
       // they all must be. See test FusionSliceForNanoGPT3_CUDA for an example
-      // that does a static split by a factor of 16 of a symbolic input
-      // domain. The static split in that case results in a concrete
-      // IterDomain with extent 16 along with a symbolic one (extent ceilDiv(n
-      // / 16)).
+      // that does a static split by a factor of 16 of a symbolic input domain.
+      // The static split in that case results in a concrete IterDomain with
+      // extent 16 along with a symbolic one (extent ceilDiv(n / 16)).
 
       // Determine the output IterType
       IterType iter_type = IterType::Symbolic;
@@ -829,6 +664,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
     // corresponding producer IDs
 
     std::optional<IterType> id_type;
+    Val* extent = nullptr;
 
     for (auto producer : ir_utils::filterByType<TensorView>(def->inputs())) {
       PairwiseRootDomainMap root_map(producer, consumer);
@@ -851,6 +687,11 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
       } else {
         id_type = input_id->getIterType();
       }
+
+      // Set extent expression based on producer, overwriting that of consumer
+      if (!extent) {
+        extent = input_id->extent();
+      }
     }
 
     TORCH_INTERNAL_ASSERT(
@@ -868,7 +709,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
         consumer->toString());
 
     auto concretized_id =
-        IterDomainBuilder(root_id).iter_type(*id_type).build();
+        IterDomainBuilder(root_id).extent(extent).iter_type(*id_type).build();
 
     registerConcretization(root_id, concretized_id);
     is_concretized = true;
diff --git a/csrc/ir/internal_base_nodes.h b/csrc/ir/internal_base_nodes.h
index 4f4b1e7da10..b13caaa9dba 100644
--- a/csrc/ir/internal_base_nodes.h
+++ b/csrc/ir/internal_base_nodes.h
@@ -444,8 +444,7 @@ class TORCH_CUDA_CU_API TensorDomain : public Val {
       std::vector<IterDomain*> root_domain,
       std::vector<IterDomain*> rfactor_domain,
       std::vector<IterDomain*> leaf_domain,
-      std::vector<std::optional<bool>> contiguity = {},
-      bool validate_domain_equivalence = true);
+      std::vector<std::optional<bool>> contiguity = {});
 
   TensorDomain(
       IrBuilderPasskey,
diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp
index 6ac62e46220..3fb3ebaca3e 100644
--- a/csrc/ir/nodes.cpp
+++ b/csrc/ir/nodes.cpp
@@ -1963,14 +1963,6 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(ViewAsScalar)
 ViewOp::ViewOp(IrBuilderPasskey passkey, Val* out, Val* in) : Expr(passkey) {
   addOutput(out);
   addInput(in);
-  // Note: we also add the output extents as INPUTS here. This helps to ensure
-  // that we include necessary scalars during segmentation to determine outputs.
-  // Without this, the user may pass expressions for the desired shape that are
-  // not available in a given segment. See
-  // https://github.com/NVIDIA/Fuser/issues/418
-  for (auto id : out->as<TensorView>()->getMaybeRFactorDomain()) {
-    addInput(id->extent());
-  }
 }
 
 std::string ViewOp::toString(int indent_size) const {
@@ -2734,8 +2726,7 @@ TensorDomain::TensorDomain(
     std::vector<IterDomain*> root_domain,
     std::vector<IterDomain*> rfactor_domain,
     std::vector<IterDomain*> leaf_domain,
-    std::vector<std::optional<bool>> contiguity,
-    bool validate_domain_equivalence)
+    std::vector<std::optional<bool>> contiguity)
     : Val(passkey, ValType::TensorDomain, DataType::Null),
       root_domain_(std::move(root_domain)),
       rfactor_domain_(std::move(rfactor_domain)),
@@ -2745,15 +2736,12 @@ TensorDomain::TensorDomain(
                              : std::move(contiguity)) {
   validateContiguity(maybeAllocation(), contiguity_);
 
-  if (validate_domain_equivalence) {
-    if (!root_domain_.empty()) {
-      TORCH_CHECK(
-          !leaf_domain_.empty(), "Root domain is not empty but leaf is");
-      ir_utils::validateDomainEquivalence(root_domain_, leaf_domain_);
-      if (!rfactor_domain_.empty()) {
-        ir_utils::validateDomainEquivalence(root_domain_, rfactor_domain_);
-        ir_utils::validateDomainEquivalence(rfactor_domain_, leaf_domain_);
-      }
+  if (!root_domain_.empty()) {
+    TORCH_CHECK(!leaf_domain_.empty(), "Root domain is not empty but leaf is");
+    ir_utils::validateDomainEquivalence(root_domain_, leaf_domain_);
+    if (!rfactor_domain_.empty()) {
+      ir_utils::validateDomainEquivalence(root_domain_, rfactor_domain_);
+      ir_utils::validateDomainEquivalence(rfactor_domain_, leaf_domain_);
     }
   }
 
diff --git a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp
index a9c7670e2a5..9abbd35ed7b 100644
--- a/csrc/kernel_cache.cpp
+++ b/csrc/kernel_cache.cpp
@@ -557,7 +557,6 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor(
   if (isDebugDumpEnabled(DebugDumpOption::FusionIrDynamic)) {
     std::cout << "Dynamic Fusion (before concretization):" << std::endl;
     fusion()->printMath();
-    fusion()->printTransforms();
   }
 
   // Compute or get cached initial concretization info
@@ -641,7 +640,6 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor(
     if (isDebugDumpEnabled(DebugDumpOption::FusionIrConcretized)) {
       std::cout << "Concretized Fusion:" << std::endl;
       fusion->printMath();
-      fusion->printTransforms();
     }
     kernel_runtimes.emplace_back(std::make_unique<FusionKernelRuntime>(
         std::move(fusion), args, forced_index_type));

From 8160c6fa8b2e802c749701921c103e188263166b Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Thu, 1 Jun 2023 15:49:23 -0400
Subject: [PATCH 12/22] Fix DynamicTransformIssue418_CUDA ATen keepdim args

---
 test/test_dynamic_transform.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp
index 1b9003f5c3b..ea257f81558 100644
--- a/test/test_dynamic_transform.cpp
+++ b/test/test_dynamic_transform.cpp
@@ -1036,8 +1036,8 @@ TEST_F(NVFuserTest, DynamicTransformIssue418_CUDA) {
     std::vector<c10::IValue> inputs = {t0, 32};
     auto cg_outputs = executor_cache.runFusionWithInputs(inputs);
     auto t0_resh = t0.reshape({256, 4, 32, 28, 28});
-    auto mu = t0_resh.mean({2, 3, 4});
-    auto v = t0_resh.var({2, 3, 4});
+    auto mu = t0_resh.mean({2, 3, 4}, true);
+    auto v = t0_resh.var({2, 3, 4}, true, true);
     testValidate(
         executor_cache.fusion(),
         cg_outputs,

From 83062b878571d3d696c5b79b0c9a46127442f755 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 2 Jun 2023 10:10:38 -0400
Subject: [PATCH 13/22] Add failing full groupnorm test

---
 test/test_dynamic_transform.cpp | 51 +++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp
index ea257f81558..17d70028771 100644
--- a/test/test_dynamic_transform.cpp
+++ b/test/test_dynamic_transform.cpp
@@ -1048,4 +1048,55 @@ TEST_F(NVFuserTest, DynamicTransformIssue418_CUDA) {
   }
 }
 
+// Repro of https://github.com/NVIDIA/Fuser/issues/418 (full GroupNorm example)
+TEST_F(NVFuserTest, DynamicTransformIssue418Full_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(4);
+  fusion->addInput(tv0);
+  auto weight = makeSymbolicTensor(4);
+  fusion->addInput(tv0);
+  auto bias = makeSymbolicTensor(4);
+  fusion->addInput(tv0);
+  auto s0 = IrBuilder::create<Int>();
+  fusion->addInput(s0);
+
+  auto v00 = tv0->axis(0)->extent();
+  auto v01 = tv0->axis(1)->extent();
+  auto v02 = tv0->axis(2)->extent();
+  auto v03 = tv0->axis(3)->extent();
+
+  auto tv1 = reshape(tv0, {v00, div(v01, s0), s0, v02, v03});
+  auto vm = variance_mean(tv1, {2, 3, 4}, 0, true);
+  auto eps = IrBuilder::create<Double>(1e-5);
+  auto tv2 = mul(sub(tv1, vm.mean), rsqrt(add(vm.var, eps)));
+  auto tv3 = reshape(tv2, {v00, v01, v02, v03});
+  auto tv4 = add(mul(tv3, weight), bias);
+  fusion->addOutput(tv4);
+
+  // tv1 has symbolic axes as reshape is dynamic
+  TORCH_CHECK(
+      tv1->domain()->hasSymbolicAxis(),
+      "Expected to have symbolic axes: ",
+      tv1->toString());
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({256, 128, 28, 28}, options);
+  auto w = at::randn({1, 128, 1, 1}, options);
+  auto b = at::randn({1, 128, 1, 1}, options);
+  std::vector<c10::IValue> inputs = {t0, w, b, 32};
+  auto cg_outputs = executor_cache.runFusionWithInputs(inputs);
+  auto t0_resh = t0.reshape({256, 4, 32, 28, 28});
+  auto mu = t0_resh.mean({2, 3, 4}, true);
+  auto v = t0_resh.var({2, 3, 4}, true, true);
+  auto gn =
+      ((t0_resh - mu) * (v + 1e-5).rsqrt()).reshape({256, 128, 28, 28}) * w + b;
+  testValidate(
+      executor_cache.fusion(), cg_outputs, inputs, {gn}, __LINE__, __FILE__);
+}
+
 } // namespace nvfuser

From 82e7ee8f7f78a9eefef3e328e2c4ee05eea8424c Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 2 Jun 2023 10:12:47 -0400
Subject: [PATCH 14/22] Fix bug in failing test

---
 test/test_dynamic_transform.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp
index 17d70028771..c54cd1b5543 100644
--- a/test/test_dynamic_transform.cpp
+++ b/test/test_dynamic_transform.cpp
@@ -1056,9 +1056,9 @@ TEST_F(NVFuserTest, DynamicTransformIssue418Full_CUDA) {
   auto tv0 = makeSymbolicTensor(4);
   fusion->addInput(tv0);
   auto weight = makeSymbolicTensor(4);
-  fusion->addInput(tv0);
+  fusion->addInput(weight);
   auto bias = makeSymbolicTensor(4);
-  fusion->addInput(tv0);
+  fusion->addInput(bias);
   auto s0 = IrBuilder::create<Int>();
   fusion->addInput(s0);
 

From 5c8a4c2ea60141112f0a885f4a246d12a042b6f3 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 2 Jun 2023 12:07:56 -0400
Subject: [PATCH 15/22] Properly set broadcast axes in inputs to 418Full test

---
 test/test_dynamic_transform.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp
index c54cd1b5543..e367e9c49b3 100644
--- a/test/test_dynamic_transform.cpp
+++ b/test/test_dynamic_transform.cpp
@@ -1055,9 +1055,9 @@ TEST_F(NVFuserTest, DynamicTransformIssue418Full_CUDA) {
 
   auto tv0 = makeSymbolicTensor(4);
   fusion->addInput(tv0);
-  auto weight = makeSymbolicTensor(4);
+  auto weight = makeSymbolicTensor({1, -1, 1, 1});
   fusion->addInput(weight);
-  auto bias = makeSymbolicTensor(4);
+  auto bias = makeSymbolicTensor({1, -1, 1, 1});
   fusion->addInput(bias);
   auto s0 = IrBuilder::create<Int>();
   fusion->addInput(s0);

From 9ffadb9b990f53254f0058ed907c4c71ce142c0a Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 2 Jun 2023 12:27:53 -0400
Subject: [PATCH 16/22] Add channels-last test

---
 test/test_dynamic_transform.cpp | 51 +++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp
index e367e9c49b3..4ba73961ddc 100644
--- a/test/test_dynamic_transform.cpp
+++ b/test/test_dynamic_transform.cpp
@@ -1099,4 +1099,55 @@ TEST_F(NVFuserTest, DynamicTransformIssue418Full_CUDA) {
       executor_cache.fusion(), cg_outputs, inputs, {gn}, __LINE__, __FILE__);
 }
 
+// Repro of https://github.com/NVIDIA/Fuser/issues/418 (channels-last)
+TEST_F(NVFuserTest, DynamicTransformIssue418FullNHWC_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(4);
+  fusion->addInput(tv0);
+  auto weight = makeSymbolicTensor({1, 1, 1, -1});
+  fusion->addInput(weight);
+  auto bias = makeSymbolicTensor({1, 1, 1, -1});
+  fusion->addInput(bias);
+  auto s0 = IrBuilder::create<Int>();
+  fusion->addInput(s0);
+
+  auto v00 = tv0->axis(0)->extent();
+  auto v01 = tv0->axis(1)->extent();
+  auto v02 = tv0->axis(2)->extent();
+  auto v03 = tv0->axis(3)->extent();
+
+  auto tv1 = reshape(tv0, {v00, v01, v02, div(v03, s0), s0});
+  auto vm = variance_mean(tv1, {1, 2, 4}, 0, true);
+  auto eps = IrBuilder::create<Double>(1e-5);
+  auto tv2 = mul(sub(tv1, vm.mean), rsqrt(add(vm.var, eps)));
+  auto tv3 = reshape(tv2, {v00, v01, v02, v03});
+  auto tv4 = add(mul(tv3, weight), bias);
+  fusion->addOutput(tv4);
+
+  // tv1 has symbolic axes as reshape is dynamic
+  TORCH_CHECK(
+      tv1->domain()->hasSymbolicAxis(),
+      "Expected to have symbolic axes: ",
+      tv1->toString());
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({256, 28, 28, 128}, options);
+  auto w = at::randn({1, 1, 1, 128}, options);
+  auto b = at::randn({1, 1, 1, 128}, options);
+  std::vector<c10::IValue> inputs = {t0, w, b, 32};
+  auto cg_outputs = executor_cache.runFusionWithInputs(inputs);
+  auto t0_resh = t0.reshape({256, 28, 28, 4, 32});
+  auto mu = t0_resh.mean({1, 2, 4}, true);
+  auto v = t0_resh.var({1, 2, 4}, true, true);
+  auto gn =
+      ((t0_resh - mu) * (v + 1e-5).rsqrt()).reshape({256, 28, 28, 128}) * w + b;
+  testValidate(
+      executor_cache.fusion(), cg_outputs, inputs, {gn}, __LINE__, __FILE__);
+}
+
 } // namespace nvfuser

From 596ceab387c8733188dbcf7770dc0a0d7dc1ee84 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 2 Jun 2023 13:12:31 -0400
Subject: [PATCH 17/22] Grab iter_type fix in newOutputDomain from #358

---
 csrc/ops/utils.cpp | 66 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 60 insertions(+), 6 deletions(-)

diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp
index 43b7ba18206..31b00f72af4 100644
--- a/csrc/ops/utils.cpp
+++ b/csrc/ops/utils.cpp
@@ -223,9 +223,10 @@ std::vector<IterDomain*> newOutputDomain(
   std::vector<int64_t> start_offsets(out_domain.size(), 0);
   std::vector<int64_t> stop_offsets(out_domain.size(), 0);
   std::vector<Val*> extent_vals(out_domain.size(), nullptr);
+  std::vector<bool> mismatched_symbolic_extents(out_domain.size(), false);
   std::vector<Val*> expanded_extent_vals(out_domain.size(), nullptr);
-  std::vector<c10::optional<IterType>> iter_types(
-      out_domain.size(), c10::nullopt);
+  std::vector<std::optional<IterType>> iter_types(
+      out_domain.size(), std::nullopt);
 
   for (auto tv : tvs) {
     auto dom = TensorDomain::noReductions(tv->getMaybeRFactorDomain());
@@ -236,6 +237,53 @@ std::vector<IterDomain*> newOutputDomain(
         " dimensions but expected ",
         out_domain.size());
     for (const auto i : c10::irange(dom.size())) {
+      auto iter_type = dom[i]->getIterType();
+      auto prev_iter_type = iter_types[i];
+      if (prev_iter_type.has_value()) {
+        // Clang-tidy complains about unchecked access to optional value here
+        if (iter_type == IterType::Iteration &&
+            prev_iter_type.value() == IterType::Symbolic) {
+          // Prefer the Iteration extent, since Symbolic could be broadcast
+          extent_vals[i] = nullptr;
+        } else if (iter_type == IterType::Symbolic) {
+          switch (prev_iter_type.value()) {
+            case IterType::Iteration:
+              // Previously found Iteration domain, so ignore all Symbolic
+              // domains
+              continue;
+            case IterType::Symbolic:
+              if (extent_vals[i]->sameAs(dom[i]->extent())) {
+                // matching symbolic extent
+                continue;
+              } else {
+                // Mismatched symbolic input extents. Any one of the symbolic
+                // inputs could be a Broadcast or Iteration domain. Until
+                // concretization, we will not know which one holds the true
+                // extent (or whether they all are Broadcast, so that the output
+                // is also Broadcast). We record that these symbolic extents
+                // mismatched so that we can introduce a new symbolic extent
+                // later.
+                mismatched_symbolic_extents[i] = true;
+              }
+              break;
+            case IterType::Broadcast:
+              // Previously found only broadcast, so this will either also
+              // broadcast or resolve those broadcasts. If the expanded
+              // extent of any of the broadcasts is not 1, then it will need to
+              // match that of the dom[i]. In either case, prefer dom[i]'s
+              // extent, so clear iter_types[i] and extent_vals[i] so that the
+              // rest of this iteration will mark output as Symbolic.
+              iter_types[i] = std::nullopt;
+              extent_vals[i] = nullptr;
+              break;
+            default:
+              TORCH_CHECK(
+                  false,
+                  "Encountered unexpected IterType when creating new output domain: ",
+                  prev_iter_type.value());
+          }
+        }
+      }
       if (dom[i]->isBroadcast()) {
         if (dom[i]->hasExpandedExtent()) {
           expanded_extent_vals[i] =
@@ -244,9 +292,9 @@ std::vector<IterDomain*> newOutputDomain(
         continue;
       }
       extent_vals[i] = promoteSize(extent_vals[i], dom[i]->extent());
-      if (iter_types[i].has_value()) {
+      if (prev_iter_type.has_value()) {
         iter_types[i] =
-            promoteIterType(iter_types[i].value(), dom[i]->getIterType());
+            promoteIterType(prev_iter_type.value(), dom[i]->getIterType());
       } else {
         iter_types[i] = dom[i]->getIterType();
       }
@@ -268,15 +316,21 @@ std::vector<IterDomain*> newOutputDomain(
     }
   }
   for (const auto dim_i : c10::irange(out_domain.size())) {
+    auto iter_type = iter_types[dim_i];
+    if (iter_type == IterType::Symbolic && mismatched_symbolic_extents[dim_i]) {
+      // if we have a symbolic output but the input symbolic extents did not
+      // match, create a new extent
+      extent_vals[dim_i] = IrBuilder::create<Int>();
+    }
     if (extent_vals[dim_i] != nullptr) {
       TORCH_INTERNAL_ASSERT(
-          iter_types[dim_i].has_value(),
+          iter_type.has_value(),
           "Could not deduce iter type for new tensor view.");
       out_domain[dim_i] =
           IterDomainBuilder(
               IrBuilder::create<Int>(start_offsets[dim_i]), extent_vals[dim_i])
               .stop_offset(IrBuilder::create<Int>(stop_offsets[dim_i]))
-              .iter_type(iter_types[dim_i].value())
+              .iter_type(iter_type.value())
               .build();
     } else {
       out_domain[dim_i] = IterDomainBuilder(

From 6b85cf48145d9d36c7db750c1d472f9ee9eb54fc Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 2 Jun 2023 15:48:11 -0400
Subject: [PATCH 18/22] Propagate replacements for placeholder extents

---
 csrc/dynamic_transform.cpp | 39 ++++++++++++++++++++++++--------------
 test/test_resize.cpp       | 10 ++++++++--
 2 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp
index dd379e6c293..779736f271f 100644
--- a/csrc/dynamic_transform.cpp
+++ b/csrc/dynamic_transform.cpp
@@ -392,6 +392,8 @@ class DynamicTransformConcretizer : public OptOutMutator {
 
  private:
   const DynamicTransformConcretizationInfo& info_;
+
+  std::unordered_map<Val*, Val*> replaced_extents_;
 };
 
 void DynamicTransformConcretizer::concretize() {
@@ -496,10 +498,6 @@ void DynamicTransformConcretizer::checkConcretizedUses(
 // concretized. Since symbolic IDs may be propagated down to
 // consumers, those domains need to be concretized accordingly.
 void DynamicTransformConcretizer::mutate(TensorView* tv) {
-  if (!tv->domain()->hasSymbolicAxis()) {
-    return;
-  }
-
   // First, try to concretize the root domain as there may be symbolic
   // axes inherited from the producers
   propagateFromProducerToConsumer(tv);
@@ -646,11 +644,6 @@ void DynamicTransformConcretizer::mutate(TensorDomain* td) {
 
 bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
     TensorView* consumer) {
-  if (consumer->definition() == nullptr ||
-      !consumer->domain()->hasSymbolicAxis()) {
-    return false;
-  }
-
   const auto& root_domain = consumer->getRootDomain();
 
   auto def = consumer->definition();
@@ -659,15 +652,25 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
 
   for (const auto i : c10::irange(root_domain.size())) {
     auto root_id = root_domain.at(i);
-    if (root_id->getIterType() != IterType::Symbolic) {
-      continue;
+    Val* extent = nullptr;
+
+    auto replaced_extent_it = replaced_extents_.find(root_id->extent());
+    if (replaced_extent_it == replaced_extents_.end()) {
+      if (root_id->getIterType() != IterType::Symbolic) {
+        continue;
+      }
+    } else {
+      extent = replaced_extent_it->second;
     }
 
     // Figure out the right IterType of this consumer root ID from its
     // corresponding producer IDs
 
-    std::optional<IterType> id_type;
-    Val* extent = nullptr;
+    std::optional<IterType> id_type = std::nullopt;
+
+    if (root_id->isReduction()) {
+      id_type = IterType::Reduction;
+    }
 
     for (auto producer : ir_utils::filterByType<TensorView>(def->inputs())) {
       PairwiseRootDomainMap root_map(producer, consumer);
@@ -686,7 +689,9 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
           input_id->toString());
 
       if (id_type.has_value()) {
-        id_type = ops::promoteIterType(*id_type, input_id->getIterType());
+        if (*id_type != IterType::Reduction) {
+          id_type = ops::promoteIterType(*id_type, input_id->getIterType());
+        }
       } else {
         id_type = input_id->getIterType();
       }
@@ -714,6 +719,12 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
     auto concretized_id =
         IterDomainBuilder(root_id).extent(extent).iter_type(*id_type).build();
 
+    if (!extent->sameAs(root_id->extent())) {
+      replaced_extents_[root_id->extent()] = extent;
+      // In case this value is used elsewhere, register for mutation
+      registerMutation(root_id->extent(), extent);
+    }
+
     registerConcretization(root_id, concretized_id);
     is_concretized = true;
   }
diff --git a/test/test_resize.cpp b/test/test_resize.cpp
index 29b16d6ed74..252fc10018f 100644
--- a/test/test_resize.cpp
+++ b/test/test_resize.cpp
@@ -1391,7 +1391,10 @@ TEST_F(NVFuserTest, FusionResizeCatReduceScheduler1_CUDA) {
   auto ref = at::cat({t0, t1}, 1).sum({1});
 
   testValidate(
-      executor_cache.fusion(),
+      // Reference the most recent concretized Fusion, not the dynamic Fusion
+      executor_cache.getMostRecentKernelRuntime()
+          ->fusionSegments()
+          ->completeFusion(),
       cg_outputs,
       aten_inputs,
       {ref},
@@ -1429,7 +1432,10 @@ TEST_F(NVFuserTest, FusionResizeCatSoftmaxScheduler1_CUDA) {
   auto ref = at::_softmax(t2.to(at::kDouble), -1, false);
 
   testValidate(
-      executor_cache.fusion(),
+      // Reference the most recent concretized Fusion, not the dynamic Fusion
+      executor_cache.getMostRecentKernelRuntime()
+          ->fusionSegments()
+          ->completeFusion(),
       cg_outputs,
       aten_inputs,
       {ref},

From f8a0610634ff8df83503084cf0baa47e6897d1a7 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 2 Jun 2023 15:59:06 -0400
Subject: [PATCH 19/22] Temporarily revert "Propagate replacements for
 placeholder extents"

This reverts commit 6b85cf48145d9d36c7db750c1d472f9ee9eb54fc.
---
 csrc/dynamic_transform.cpp | 39 ++++++++++++++------------------------
 test/test_resize.cpp       | 10 ++--------
 2 files changed, 16 insertions(+), 33 deletions(-)

diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp
index 779736f271f..dd379e6c293 100644
--- a/csrc/dynamic_transform.cpp
+++ b/csrc/dynamic_transform.cpp
@@ -392,8 +392,6 @@ class DynamicTransformConcretizer : public OptOutMutator {
 
  private:
   const DynamicTransformConcretizationInfo& info_;
-
-  std::unordered_map<Val*, Val*> replaced_extents_;
 };
 
 void DynamicTransformConcretizer::concretize() {
@@ -498,6 +496,10 @@ void DynamicTransformConcretizer::checkConcretizedUses(
 // concretized. Since symbolic IDs may be propagated down to
 // consumers, those domains need to be concretized accordingly.
 void DynamicTransformConcretizer::mutate(TensorView* tv) {
+  if (!tv->domain()->hasSymbolicAxis()) {
+    return;
+  }
+
   // First, try to concretize the root domain as there may be symbolic
   // axes inherited from the producers
   propagateFromProducerToConsumer(tv);
@@ -644,6 +646,11 @@ void DynamicTransformConcretizer::mutate(TensorDomain* td) {
 
 bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
     TensorView* consumer) {
+  if (consumer->definition() == nullptr ||
+      !consumer->domain()->hasSymbolicAxis()) {
+    return false;
+  }
+
   const auto& root_domain = consumer->getRootDomain();
 
   auto def = consumer->definition();
@@ -652,25 +659,15 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
 
   for (const auto i : c10::irange(root_domain.size())) {
     auto root_id = root_domain.at(i);
-    Val* extent = nullptr;
-
-    auto replaced_extent_it = replaced_extents_.find(root_id->extent());
-    if (replaced_extent_it == replaced_extents_.end()) {
-      if (root_id->getIterType() != IterType::Symbolic) {
-        continue;
-      }
-    } else {
-      extent = replaced_extent_it->second;
+    if (root_id->getIterType() != IterType::Symbolic) {
+      continue;
     }
 
     // Figure out the right IterType of this consumer root ID from its
     // corresponding producer IDs
 
-    std::optional<IterType> id_type = std::nullopt;
-
-    if (root_id->isReduction()) {
-      id_type = IterType::Reduction;
-    }
+    std::optional<IterType> id_type;
+    Val* extent = nullptr;
 
     for (auto producer : ir_utils::filterByType<TensorView>(def->inputs())) {
       PairwiseRootDomainMap root_map(producer, consumer);
@@ -689,9 +686,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
           input_id->toString());
 
       if (id_type.has_value()) {
-        if (*id_type != IterType::Reduction) {
-          id_type = ops::promoteIterType(*id_type, input_id->getIterType());
-        }
+        id_type = ops::promoteIterType(*id_type, input_id->getIterType());
       } else {
         id_type = input_id->getIterType();
       }
@@ -719,12 +714,6 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
     auto concretized_id =
         IterDomainBuilder(root_id).extent(extent).iter_type(*id_type).build();
 
-    if (!extent->sameAs(root_id->extent())) {
-      replaced_extents_[root_id->extent()] = extent;
-      // In case this value is used elsewhere, register for mutation
-      registerMutation(root_id->extent(), extent);
-    }
-
     registerConcretization(root_id, concretized_id);
     is_concretized = true;
   }
diff --git a/test/test_resize.cpp b/test/test_resize.cpp
index 252fc10018f..29b16d6ed74 100644
--- a/test/test_resize.cpp
+++ b/test/test_resize.cpp
@@ -1391,10 +1391,7 @@ TEST_F(NVFuserTest, FusionResizeCatReduceScheduler1_CUDA) {
   auto ref = at::cat({t0, t1}, 1).sum({1});
 
   testValidate(
-      // Reference the most recent concretized Fusion, not the dynamic Fusion
-      executor_cache.getMostRecentKernelRuntime()
-          ->fusionSegments()
-          ->completeFusion(),
+      executor_cache.fusion(),
       cg_outputs,
       aten_inputs,
       {ref},
@@ -1432,10 +1429,7 @@ TEST_F(NVFuserTest, FusionResizeCatSoftmaxScheduler1_CUDA) {
   auto ref = at::_softmax(t2.to(at::kDouble), -1, false);
 
   testValidate(
-      // Reference the most recent concretized Fusion, not the dynamic Fusion
-      executor_cache.getMostRecentKernelRuntime()
-          ->fusionSegments()
-          ->completeFusion(),
+      executor_cache.fusion(),
       cg_outputs,
       aten_inputs,
       {ref},

From bd17931ac81001cf9487d169a04309b6cfd039ae Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Tue, 6 Jun 2023 14:32:39 -0400
Subject: [PATCH 20/22] Print dynamic fusion if fusion_ir_concretized is given

---
 csrc/kernel_cache.cpp | 10 ++++------
 csrc/utils.cpp        |  1 -
 csrc/utils.h          |  1 -
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp
index 3dc989eafe7..dc4288a97c0 100644
--- a/csrc/kernel_cache.cpp
+++ b/csrc/kernel_cache.cpp
@@ -555,11 +555,6 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor(
     }
   }
 
-  if (isDebugDumpEnabled(DebugDumpOption::FusionIrDynamic)) {
-    std::cout << "Dynamic Fusion (before concretization):" << std::endl;
-    fusion()->printMath();
-  }
-
   // Compute or get cached initial concretization info
   auto& initial_info = initialInfo();
 
@@ -616,7 +611,10 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor(
     kernel_runtime->updateHeuristicsLaunchParams(new_heuristics.get());
   } else {
     // cache miss, need to re-build an optimized graph for this case
-
+    if (isDebugDumpEnabled(DebugDumpOption::FusionIrConcretized)) {
+      std::cout << "Fusion Before Concretization:" << std::endl;
+      fusion()->printMath();
+    }
     // concretize fusion_ for use in this runtime
     auto fusion = std::make_unique<Fusion>(*fusion_);
     FusionGuard fg(fusion.get());
diff --git a/csrc/utils.cpp b/csrc/utils.cpp
index 7fa703b1eea..8e2d030476f 100644
--- a/csrc/utils.cpp
+++ b/csrc/utils.cpp
@@ -109,7 +109,6 @@ auto parseDebugDumpOptions() {
       {"fusion_ir", DebugDumpOption::FusionIr},
       {"fusion_ir_math", DebugDumpOption::FusionIrMath},
       {"fusion_ir_presched", DebugDumpOption::FusionIrPresched},
-      {"fusion_ir_dynamic", DebugDumpOption::FusionIrDynamic},
       {"fusion_ir_concretized", DebugDumpOption::FusionIrConcretized},
       {"kernel_ir", DebugDumpOption::KernelIr},
       {"ca_map", DebugDumpOption::ComputeAtMap},
diff --git a/csrc/utils.h b/csrc/utils.h
index e5885ecd965..a21e94a986c 100644
--- a/csrc/utils.h
+++ b/csrc/utils.h
@@ -55,7 +55,6 @@ enum class DebugDumpOption {
   FusionIr, //!< Dump the Fusion IR before lowering
   FusionIrMath, //!< Dump just the compute (math) part of the Fusion IR
   FusionIrPresched, //!< Dump the Fusion IR before it is scheduled.
-  FusionIrDynamic, //!< Dump the Fusion IR before concretization
   FusionIrConcretized, //!< Dump the Fusion IR after concretization
   KernelIr, //!< Dump the compiler Kernel IR
   ComputeAtMap, //!< Dump the computeAt map

From 9aaedec806b4378b2f49833f60b7809a7f6b2894 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Tue, 6 Jun 2023 14:33:00 -0400
Subject: [PATCH 21/22] Set proper output extent in dynamic cat

---
 csrc/ops/alias.cpp | 17 +++++++++++++++--
 csrc/ops/utils.cpp |  6 ++----
 csrc/ops/utils.h   |  4 +---
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/csrc/ops/alias.cpp b/csrc/ops/alias.cpp
index 49ed3b9912d..99f3071a4da 100644
--- a/csrc/ops/alias.cpp
+++ b/csrc/ops/alias.cpp
@@ -116,7 +116,7 @@ TensorView* reshape(TensorView* inp_tv, const std::vector<Val*>& new_sizes) {
     return static_reshape_output;
   }
 
-  auto root_domain = ops::newOutputDomain({inp_tv}, inp_tv->dtype());
+  auto root_domain = ops::newOutputDomain({inp_tv});
 
   // Create placeholder rfactor domain. Note it's not connected with the root
   // domain.
@@ -632,7 +632,20 @@ TensorView* cat(const std::vector<TensorView*>& inputs, int64_t cat_dim) {
   }
 
   // Now all of resized_inputs have the same shape as the out tensor
-  auto out = ops::newOutputTV(resized_inputs, dtype);
+  // NOTE: ops::newOutputTV would not necessarily be able to infer that the
+  // padded dimensions are all of the same size. However, we know that they are
+  // constructed such that that is the case, so we can use
+  auto out_domain = ops::newOutputDomain(resized_inputs);
+  // Override the concatenated dimension and insert an IterDomain with the true
+  // extent, if needed
+  if (!out_domain.at(cat_dim)->extent()->sameAs(concat_ext)) {
+    out_domain[cat_dim] =
+        IterDomainBuilder(out_domain.at(cat_dim)).extent(concat_ext).build();
+  }
+  auto out = IrBuilder::create<TensorView>(
+      IrBuilder::create<TensorDomain>(
+          out_domain, TensorDomain::getContiguityFilledWith(out_domain, true)),
+      dtype);
 
   IrBuilder::create<CatOp>(out, resized_inputs, cat_dim);
 
diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp
index 31b00f72af4..effa5d617a7 100644
--- a/csrc/ops/utils.cpp
+++ b/csrc/ops/utils.cpp
@@ -198,9 +198,7 @@ IterType promoteIterType(IterType type1, IterType type2) {
   }
 }
 
-std::vector<IterDomain*> newOutputDomain(
-    const std::vector<Val*>& vals,
-    DataType dtype) {
+std::vector<IterDomain*> newOutputDomain(const std::vector<Val*>& vals) {
   std::vector<TensorView*> tvs;
   for (auto val : vals) {
     if (val->getValType() == ValType::TensorView) {
@@ -346,7 +344,7 @@ std::vector<IterDomain*> newOutputDomain(
 }
 
 TensorView* newOutputTV(const std::vector<Val*>& vals, DataType dtype) {
-  auto out_domain = newOutputDomain(vals, dtype);
+  auto out_domain = newOutputDomain(vals);
   return IrBuilder::create<TensorView>(
       IrBuilder::create<TensorDomain>(
           out_domain, TensorDomain::getContiguityFilledWith(out_domain, true)),
diff --git a/csrc/ops/utils.h b/csrc/ops/utils.h
index e9dc563d4fe..7af3a606986 100644
--- a/csrc/ops/utils.h
+++ b/csrc/ops/utils.h
@@ -31,9 +31,7 @@ Val* newScalar(ValType vtype, DataType dtype);
 
 IterType promoteIterType(IterType type1, IterType type2);
 
-std::vector<IterDomain*> newOutputDomain(
-    const std::vector<Val*>& vals,
-    DataType dtype);
+std::vector<IterDomain*> newOutputDomain(const std::vector<Val*>& vals);
 
 TensorView* newOutputTV(const std::vector<Val*>& vals, DataType dtype);
 

From 140298fe00f5aaacf53c3b3472f7fbfa27a8a261 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Sat, 10 Jun 2023 11:19:23 -0400
Subject: [PATCH 22/22] Add failing PadBroadcast test

---
 test/test_dynamic_transform.cpp | 61 +++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp
index 55bd7c5c1fa..0d5096bf156 100644
--- a/test/test_dynamic_transform.cpp
+++ b/test/test_dynamic_transform.cpp
@@ -1181,4 +1181,65 @@ TEST_F(NVFuserTest, FusionDynamicSliceToBroadcast_CUDA) {
   testValidate(&fusion, outputs, aten_inputs, {at2}, __LINE__, __FILE__);
 }
 
+// Test dynamic pad followed by broadcast resolution
+TEST_F(NVFuserTest, DynamicPadBroadcast_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  TensorView* tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+
+  // 2d axis order here is YX
+  auto ypad = IrBuilder::create<Int>();
+  fusion.addInput(ypad);
+  auto xpad = IrBuilder::create<Int>();
+  fusion.addInput(xpad);
+
+  // two-way resizes to cut square tv down to broadcastable size in each axis
+  auto tv0_pad = pad(tv0, {fusion.zeroVal(), xpad, fusion.zeroVal(), ypad});
+
+  // This will potentially resolve the y or x broadcast
+  auto p = mul(tv0_pad, tv1);
+  fusion.addOutput(p);
+  fusion.addOutput(tv0_pad);
+
+  fusion.printMath();
+
+  FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor at_x = at::randn({5, 5}, options);
+  at::Tensor at_y = at::randn({5, 5}, options);
+
+  // trivial resize
+  std::vector<c10::IValue> aten_inputs({at_x, at_y, 0, 0});
+  std::vector<at::Tensor> outputs;
+
+  /*
+  aten_inputs[2] = 0;
+  aten_inputs[3] = 0;
+  outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs);
+  testValidate(fusion_executor_cache.fusion(), outputs, aten_inputs, {at_x *
+  at_y}, __LINE__, __FILE__);
+  */
+
+  // shrink first axis
+  aten_inputs[2] = -4;
+  aten_inputs[3] = 0;
+  outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs);
+  std::cout << outputs << std::endl;
+  std::cout << at_x.slice(0, 0, 1) * at_y << std::endl;
+  std::cout << at_x.slice(0, 0, 1) << std::endl;
+  testValidate(
+      fusion_executor_cache.fusion(),
+      outputs,
+      aten_inputs,
+      {at_x.slice(0, 0, 1) * at_y, at_x.slice(0, 0, 1)},
+      __LINE__,
+      __FILE__);
+}
+
 } // namespace nvfuser