diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp
index df0ad505649..dd379e6c293 100644
--- a/csrc/dynamic_transform.cpp
+++ b/csrc/dynamic_transform.cpp
@@ -402,9 +402,35 @@ void DynamicTransformConcretizer::concretize() {
   concretizeResize();
 
   // Finally, propagate concretized domains
-  auto all_stmts = StmtSort::getStmts(info_.fusion(), true);
+
+  // We need to concretize all immediate outputs of all intermediate
+  // expressions, even those leading to dead code branches. To do this, we
+  // add every output of every traversed expression to "leaves". If nothing new
+  // was added, we are done. Otherwise, we traverse again with the enlarged
+  // "leaves", so the final all_stmts comes from one sorted traversal.
+  auto leaves = info_.fusion()->getTerminatingOutputs();
+  auto leaves_set =
+      std::unordered_set<Statement*>(leaves.begin(), leaves.end());
+  std::vector<Statement*> all_stmts;
+  bool inserted = true;
+  while (inserted) {
+    all_stmts = StmtSort::getStmts(info_.fusion(), leaves, true);
+    inserted = false;
+    for (auto stmt : all_stmts) {
+      if (stmt->isExpr()) {
+        for (auto o : stmt->as<Expr>()->outputs()) {
+          if (leaves_set.find(o) == leaves_set.end()) {
+            leaves.push_back(o);
+            leaves_set.insert(o);
+            inserted = true;
+          }
+        }
+      }
+    }
+  }
+  // Concretize all vals in the final vector
   for (auto stmt : all_stmts) {
-    if (stmt->isA<Val>()) {
+    if (stmt->isVal()) {
       mutate(stmt);
     }
   }
@@ -641,6 +667,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
     // corresponding producer IDs
 
     std::optional<IterType> id_type;
+    Val* extent = nullptr;
 
     for (auto producer : ir_utils::filterByType<TensorView>(def->inputs())) {
       PairwiseRootDomainMap root_map(producer, consumer);
@@ -663,6 +690,11 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
       } else {
         id_type = input_id->getIterType();
       }
+
+      // Take the extent from the first producer ID, overriding the consumer's
+      if (!extent) {
+        extent = input_id->extent();
+      }
     }
 
     TORCH_INTERNAL_ASSERT(
@@ -680,7 +712,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
         consumer->toString());
 
     auto concretized_id =
-        IterDomainBuilder(root_id).iter_type(*id_type).build();
+        IterDomainBuilder(root_id).extent(extent).iter_type(*id_type).build();
 
     registerConcretization(root_id, concretized_id);
     is_concretized = true;
diff --git a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp
index 5f054da2715..d6cbe3c90e9 100644
--- a/csrc/kernel_cache.cpp
+++ b/csrc/kernel_cache.cpp
@@ -618,7 +618,10 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor(
     kernel_runtime->updateHeuristicsLaunchParams(new_heuristics.get());
   } else {
     // cache miss, need to re-build an optimized graph for this case
-
+    if (isDebugDumpEnabled(DebugDumpOption::FusionIrConcretized)) {
+      std::cout << "Fusion Before Concretization:" << std::endl;
+      fusion()->printMath();
+    }
     // concretize fusion_ for use in this runtime
     auto fusion = std::make_unique<Fusion>(*fusion_);
     FusionGuard fg(fusion.get());
diff --git a/csrc/ops/alias.cpp b/csrc/ops/alias.cpp
index 49ed3b9912d..99f3071a4da 100644
--- a/csrc/ops/alias.cpp
+++ b/csrc/ops/alias.cpp
@@ -116,7 +116,7 @@ TensorView* reshape(TensorView* inp_tv, const std::vector<Val*>& new_sizes) {
     return static_reshape_output;
   }
 
-  auto root_domain = ops::newOutputDomain({inp_tv}, inp_tv->dtype());
+  auto root_domain = ops::newOutputDomain({inp_tv});
 
   // Create placeholder rfactor domain. Note it's not connected with the root
   // domain.
@@ -632,7 +632,20 @@ TensorView* cat(const std::vector<TensorView*>& inputs, int64_t cat_dim) {
   }
 
   // Now all of resized_inputs have the same shape as the out tensor
-  auto out = ops::newOutputTV(resized_inputs, dtype);
+  // NOTE: ops::newOutputTV would not necessarily be able to infer that the
+  // padded dimensions are all of the same size. However, we know they are
+  // constructed that way, so we build the output domain ourselves.
+  auto out_domain = ops::newOutputDomain(resized_inputs);
+  // Override the concatenated dimension and insert an IterDomain with the true
+  // extent, if needed
+  if (!out_domain.at(cat_dim)->extent()->sameAs(concat_ext)) {
+    out_domain[cat_dim] =
+        IterDomainBuilder(out_domain.at(cat_dim)).extent(concat_ext).build();
+  }
+  auto out = IrBuilder::create<TensorView>(
+      IrBuilder::create<TensorDomain>(
+          out_domain, TensorDomain::getContiguityFilledWith(out_domain, true)),
+      dtype);
 
   IrBuilder::create<CatOp>(out, resized_inputs, cat_dim);
 
diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp
index 43b7ba18206..effa5d617a7 100644
--- a/csrc/ops/utils.cpp
+++ b/csrc/ops/utils.cpp
@@ -198,9 +198,7 @@ IterType promoteIterType(IterType type1, IterType type2) {
   }
 }
 
-std::vector<IterDomain*> newOutputDomain(
-    const std::vector<Val*>& vals,
-    DataType dtype) {
+std::vector<IterDomain*> newOutputDomain(const std::vector<Val*>& vals) {
   std::vector<TensorView*> tvs;
   for (auto val : vals) {
     if (val->getValType() == ValType::TensorView) {
@@ -223,9 +221,10 @@ std::vector<IterDomain*> newOutputDomain(
   std::vector<int64_t> start_offsets(out_domain.size(), 0);
   std::vector<int64_t> stop_offsets(out_domain.size(), 0);
   std::vector<Val*> extent_vals(out_domain.size(), nullptr);
+  std::vector<bool> mismatched_symbolic_extents(out_domain.size(), false);
   std::vector<Val*> expanded_extent_vals(out_domain.size(), nullptr);
-  std::vector<c10::optional<IterType>> iter_types(
-      out_domain.size(), c10::nullopt);
+  std::vector<std::optional<IterType>> iter_types(
+      out_domain.size(), std::nullopt);
 
   for (auto tv : tvs) {
     auto dom = TensorDomain::noReductions(tv->getMaybeRFactorDomain());
@@ -236,6 +235,53 @@ std::vector<IterDomain*> newOutputDomain(
         " dimensions but expected ",
         out_domain.size());
     for (const auto i : c10::irange(dom.size())) {
+      auto iter_type = dom[i]->getIterType();
+      auto prev_iter_type = iter_types[i];
+      if (prev_iter_type.has_value()) {
+        // Clang-tidy complains about unchecked access to optional value here
+        if (iter_type == IterType::Iteration &&
+            prev_iter_type.value() == IterType::Symbolic) {
+          // Prefer the Iteration extent, since Symbolic could be broadcast
+          extent_vals[i] = nullptr;
+        } else if (iter_type == IterType::Symbolic) {
+          switch (prev_iter_type.value()) {
+            case IterType::Iteration:
+              // Previously found Iteration domain, so ignore all Symbolic
+              // domains
+              continue;
+            case IterType::Symbolic:
+              if (extent_vals[i]->sameAs(dom[i]->extent())) {
+                // matching symbolic extent
+                continue;
+              } else {
+                // Mismatched symbolic input extents. Any one of the symbolic
+                // inputs could be a Broadcast or Iteration domain. Until
+                // concretization, we will not know which one holds the true
+                // extent (or whether they all are Broadcast, so that the output
+                // is also Broadcast). We record that these symbolic extents
+                // mismatched so that we can introduce a new symbolic extent
+                // later.
+                mismatched_symbolic_extents[i] = true;
+              }
+              break;
+            case IterType::Broadcast:
+              // Previously found only broadcast, so this will either also
+              // broadcast or resolve those broadcasts. If the expanded extent
+              // of any broadcast is not 1, it must match that of dom[i]. Either
+              // way, prefer dom[i]'s extent: clear iter_types[i] and
+              // extent_vals[i]. NOTE(review): prev_iter_type was copied before
+              // this reset, so the promotion below still sees Broadcast.
+              iter_types[i] = std::nullopt;
+              extent_vals[i] = nullptr;
+              break;
+            default:
+              TORCH_CHECK(
+                  false,
+                  "Encountered unexpected IterType when creating new output domain: ",
+                  prev_iter_type.value());
+          }
+        }
+      }
       if (dom[i]->isBroadcast()) {
         if (dom[i]->hasExpandedExtent()) {
           expanded_extent_vals[i] =
@@ -244,9 +290,9 @@ std::vector<IterDomain*> newOutputDomain(
         continue;
       }
       extent_vals[i] = promoteSize(extent_vals[i], dom[i]->extent());
-      if (iter_types[i].has_value()) {
+      if (prev_iter_type.has_value()) {
         iter_types[i] =
-            promoteIterType(iter_types[i].value(), dom[i]->getIterType());
+            promoteIterType(prev_iter_type.value(), dom[i]->getIterType());
       } else {
         iter_types[i] = dom[i]->getIterType();
       }
@@ -268,15 +314,21 @@ std::vector<IterDomain*> newOutputDomain(
     }
   }
   for (const auto dim_i : c10::irange(out_domain.size())) {
+    auto iter_type = iter_types[dim_i];
+    if (iter_type == IterType::Symbolic && mismatched_symbolic_extents[dim_i]) {
+      // if we have a symbolic output but the input symbolic extents did not
+      // match, create a new extent
+      extent_vals[dim_i] = IrBuilder::create<Int>();
+    }
     if (extent_vals[dim_i] != nullptr) {
       TORCH_INTERNAL_ASSERT(
-          iter_types[dim_i].has_value(),
+          iter_type.has_value(),
           "Could not deduce iter type for new tensor view.");
       out_domain[dim_i] =
           IterDomainBuilder(
               IrBuilder::create<Int>(start_offsets[dim_i]), extent_vals[dim_i])
               .stop_offset(IrBuilder::create<Int>(stop_offsets[dim_i]))
-              .iter_type(iter_types[dim_i].value())
+              .iter_type(iter_type.value())
               .build();
     } else {
       out_domain[dim_i] = IterDomainBuilder(
@@ -292,7 +344,7 @@ std::vector<IterDomain*> newOutputDomain(
 }
 
 TensorView* newOutputTV(const std::vector<Val*>& vals, DataType dtype) {
-  auto out_domain = newOutputDomain(vals, dtype);
+  auto out_domain = newOutputDomain(vals);
   return IrBuilder::create<TensorView>(
       IrBuilder::create<TensorDomain>(
           out_domain, TensorDomain::getContiguityFilledWith(out_domain, true)),
diff --git a/csrc/ops/utils.h b/csrc/ops/utils.h
index e9dc563d4fe..7af3a606986 100644
--- a/csrc/ops/utils.h
+++ b/csrc/ops/utils.h
@@ -31,9 +31,7 @@ Val* newScalar(ValType vtype, DataType dtype);
 
 IterType promoteIterType(IterType type1, IterType type2);
 
-std::vector<IterDomain*> newOutputDomain(
-    const std::vector<Val*>& vals,
-    DataType dtype);
+std::vector<IterDomain*> newOutputDomain(const std::vector<Val*>& vals);
 
 TensorView* newOutputTV(const std::vector<Val*>& vals, DataType dtype);
 
diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp
index 78e23e41040..0d5096bf156 100644
--- a/test/test_dynamic_transform.cpp
+++ b/test/test_dynamic_transform.cpp
@@ -1001,6 +1001,155 @@ TEST_F(NVFuserTest, DynamicPadShmoo_CUDA) {
   reductionDynamicPadAddFusion(invocations);
 }
 
+// Repro of https://github.com/NVIDIA/Fuser/issues/418 (reshape + variance_mean)
+TEST_F(NVFuserTest, DynamicTransformIssue418_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(4);
+  fusion->addInput(tv0);
+  auto s0 = IrBuilder::create<Int>();
+  fusion->addInput(s0);
+
+  auto v00 = tv0->axis(0)->extent();
+  auto v01 = tv0->axis(1)->extent();
+  auto v02 = tv0->axis(2)->extent();
+  auto v03 = tv0->axis(3)->extent();
+
+  auto tv1 = reshape(tv0, {v00, div(v01, s0), s0, v02, v03});
+  auto vm = variance_mean(tv1, {2, 3, 4}, 0, true);
+  fusion->addOutput(vm.mean);
+  fusion->addOutput(vm.var);
+
+  // Reshape sizes are runtime values, so tv1 must have Symbolic axes here
+  TORCH_CHECK(
+      tv1->domain()->hasSymbolicAxis(),
+      "Expected to have symbolic axes: ",
+      tv1->toString());
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  { // split axis 1: 128 -> (4, 32) since s0 = 32
+    auto t0 = at::randn({256, 128, 28, 28}, options);
+    std::vector<c10::IValue> inputs = {t0, 32};
+    auto cg_outputs = executor_cache.runFusionWithInputs(inputs);
+    auto t0_resh = t0.reshape({256, 4, 32, 28, 28});
+    auto mu = t0_resh.mean({2, 3, 4}, true);
+    auto v = t0_resh.var({2, 3, 4}, true, true);
+    testValidate(
+        executor_cache.fusion(),
+        cg_outputs,
+        inputs,
+        {mu, v},
+        __LINE__,
+        __FILE__);
+  }
+}
+
+// Repro of https://github.com/NVIDIA/Fuser/issues/418 (full GroupNorm example)
+TEST_F(NVFuserTest, DynamicTransformIssue418Full_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(4);
+  fusion->addInput(tv0);
+  auto weight = makeSymbolicTensor({1, -1, 1, 1});
+  fusion->addInput(weight);
+  auto bias = makeSymbolicTensor({1, -1, 1, 1});
+  fusion->addInput(bias);
+  auto s0 = IrBuilder::create<Int>();
+  fusion->addInput(s0);
+
+  auto v00 = tv0->axis(0)->extent();
+  auto v01 = tv0->axis(1)->extent();
+  auto v02 = tv0->axis(2)->extent();
+  auto v03 = tv0->axis(3)->extent();
+
+  auto tv1 = reshape(tv0, {v00, div(v01, s0), s0, v02, v03});
+  auto vm = variance_mean(tv1, {2, 3, 4}, 0, true);
+  auto eps = IrBuilder::create<Double>(1e-5);
+  auto tv2 = mul(sub(tv1, vm.mean), rsqrt(add(vm.var, eps)));
+  auto tv3 = reshape(tv2, {v00, v01, v02, v03}); // back to tv0's extents
+  auto tv4 = add(mul(tv3, weight), bias);
+  fusion->addOutput(tv4);
+
+  // Reshape sizes are runtime values, so tv1 must have Symbolic axes here
+  TORCH_CHECK(
+      tv1->domain()->hasSymbolicAxis(),
+      "Expected to have symbolic axes: ",
+      tv1->toString());
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({256, 128, 28, 28}, options);
+  auto w = at::randn({1, 128, 1, 1}, options);
+  auto b = at::randn({1, 128, 1, 1}, options);
+  std::vector<c10::IValue> inputs = {t0, w, b, 32};
+  auto cg_outputs = executor_cache.runFusionWithInputs(inputs);
+  auto t0_resh = t0.reshape({256, 4, 32, 28, 28});
+  auto mu = t0_resh.mean({2, 3, 4}, true);
+  auto v = t0_resh.var({2, 3, 4}, true, true);
+  auto gn =
+      ((t0_resh - mu) * (v + 1e-5).rsqrt()).reshape({256, 128, 28, 28}) * w + b;
+  testValidate(
+      executor_cache.fusion(), cg_outputs, inputs, {gn}, __LINE__, __FILE__);
+}
+
+// Repro of https://github.com/NVIDIA/Fuser/issues/418 (channels-last)
+TEST_F(NVFuserTest, DynamicTransformIssue418FullNHWC_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(4);
+  fusion->addInput(tv0);
+  auto weight = makeSymbolicTensor({1, 1, 1, -1});
+  fusion->addInput(weight);
+  auto bias = makeSymbolicTensor({1, 1, 1, -1});
+  fusion->addInput(bias);
+  auto s0 = IrBuilder::create<Int>();
+  fusion->addInput(s0);
+
+  auto v00 = tv0->axis(0)->extent();
+  auto v01 = tv0->axis(1)->extent();
+  auto v02 = tv0->axis(2)->extent();
+  auto v03 = tv0->axis(3)->extent();
+
+  auto tv1 = reshape(tv0, {v00, v01, v02, div(v03, s0), s0});
+  auto vm = variance_mean(tv1, {1, 2, 4}, 0, true);
+  auto eps = IrBuilder::create<Double>(1e-5);
+  auto tv2 = mul(sub(tv1, vm.mean), rsqrt(add(vm.var, eps)));
+  auto tv3 = reshape(tv2, {v00, v01, v02, v03}); // back to tv0's extents
+  auto tv4 = add(mul(tv3, weight), bias);
+  fusion->addOutput(tv4);
+
+  // Reshape sizes are runtime values, so tv1 must have Symbolic axes here
+  TORCH_CHECK(
+      tv1->domain()->hasSymbolicAxis(),
+      "Expected to have symbolic axes: ",
+      tv1->toString());
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({256, 28, 28, 128}, options);
+  auto w = at::randn({1, 1, 1, 128}, options);
+  auto b = at::randn({1, 1, 1, 128}, options);
+  std::vector<c10::IValue> inputs = {t0, w, b, 32};
+  auto cg_outputs = executor_cache.runFusionWithInputs(inputs);
+  auto t0_resh = t0.reshape({256, 28, 28, 4, 32});
+  auto mu = t0_resh.mean({1, 2, 4}, true);
+  auto v = t0_resh.var({1, 2, 4}, true, true);
+  auto gn =
+      ((t0_resh - mu) * (v + 1e-5).rsqrt()).reshape({256, 28, 28, 128}) * w + b;
+  testValidate(
+      executor_cache.fusion(), cg_outputs, inputs, {gn}, __LINE__, __FILE__);
+}
+
 // Test that a Symbolic root/Broadcast rfactor is not  concretized to
 // Iteration/Iteration
 TEST_F(NVFuserTest, FusionDynamicSliceToBroadcast_CUDA) {
@@ -1032,4 +1181,65 @@ TEST_F(NVFuserTest, FusionDynamicSliceToBroadcast_CUDA) {
   testValidate(&fusion, outputs, aten_inputs, {at2}, __LINE__, __FILE__);
 }
 
+// Test dynamic pad followed by broadcast resolution
+TEST_F(NVFuserTest, DynamicPadBroadcast_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  TensorView* tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+
+  // 2d axis order here is YX
+  auto ypad = IrBuilder::create<Int>();
+  fusion.addInput(ypad);
+  auto xpad = IrBuilder::create<Int>();
+  fusion.addInput(xpad);
+
+  // Pad only the end of each axis; a negative width shrinks that axis
+  auto tv0_pad = pad(tv0, {fusion.zeroVal(), xpad, fusion.zeroVal(), ypad});
+
+  // This will potentially resolve the y or x broadcast
+  auto p = mul(tv0_pad, tv1);
+  fusion.addOutput(p);
+  fusion.addOutput(tv0_pad);
+
+  fusion.printMath(); // NOTE(review): debug print
+
+  FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor at_x = at::randn({5, 5}, options);
+  at::Tensor at_y = at::randn({5, 5}, options);
+
+  // Start from zero pad widths; entries 2 and 3 are overwritten below
+  std::vector<c10::IValue> aten_inputs({at_x, at_y, 0, 0});
+  std::vector<at::Tensor> outputs;
+
+  /* NOTE(review): trivial (zero-pad) case is disabled; reason not recorded.
+  aten_inputs[2] = 0;
+  aten_inputs[3] = 0;
+  outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs);
+  testValidate(fusion_executor_cache.fusion(), outputs, aten_inputs, {at_x *
+  at_y}, __LINE__, __FILE__);
+  */
+
+  // shrink first (Y) axis from 5 to 1: ypad = -4
+  aten_inputs[2] = -4;
+  aten_inputs[3] = 0;
+  outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs);
+  std::cout << outputs << std::endl; // NOTE(review): debug print
+  std::cout << at_x.slice(0, 0, 1) * at_y << std::endl;
+  std::cout << at_x.slice(0, 0, 1) << std::endl;
+  testValidate(
+      fusion_executor_cache.fusion(),
+      outputs,
+      aten_inputs,
+      {at_x.slice(0, 0, 1) * at_y, at_x.slice(0, 0, 1)},
+      __LINE__,
+      __FILE__);
+}
+
 } // namespace nvfuser