Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
757f685
Add C++ repro for #418
jacobhinkle May 30, 2023
03ac754
Process all outputs of each Expr in concretization
jacobhinkle May 30, 2023
36c70c5
Add fusion_ir_dynamic debug dump option
jacobhinkle May 31, 2023
9727388
Propagate extent from P2C during concretization
jacobhinkle Jun 1, 2023
d675161
Merge remote-tracking branch 'origin/main' into fix_issue418
jacobhinkle Jun 1, 2023
031f327
Revert "Propagate extent from P2C during concretization"
jacobhinkle Jun 1, 2023
3724a00
Match concretized reshape extents to desired extents.
jacobhinkle Jun 1, 2023
72b29ee
Remove debugging print statements
jacobhinkle Jun 1, 2023
78ad7e2
Add output extents as ViewOp inputs
jacobhinkle Jun 1, 2023
788f500
Remove concrete_reshape_out_tv after replacing
jacobhinkle Jun 1, 2023
bf9aa1e
Find expr outputs before calling StmtSort again
jacobhinkle Jun 1, 2023
4f05ad5
Revert to d675161 but keep bf9aa1e5
jacobhinkle Jun 1, 2023
8160c6f
Fix DynamicTransformIssue418_CUDA ATen keepdim args
jacobhinkle Jun 1, 2023
3c173e0
Merge branch 'main' into fix_issue418
jacobhinkle Jun 1, 2023
83062b8
Add failing full groupnorm test
jacobhinkle Jun 2, 2023
82e7ee8
Fix bug in failing test
jacobhinkle Jun 2, 2023
5c8a4c2
Properly set broadcast axes in inputs to 418Full test
jacobhinkle Jun 2, 2023
9ffadb9
Add channels-last test
jacobhinkle Jun 2, 2023
596ceab
Grab iter_type fix in newOutputDomain from #358
jacobhinkle Jun 2, 2023
fd6eca8
Merge remote-tracking branch 'origin/main' into fix_issue418
jacobhinkle Jun 2, 2023
6b85cf4
Propagate replacements for placeholder extents
jacobhinkle Jun 2, 2023
c8a1176
Merge branch 'main' into fix_issue418
jacobhinkle Jun 2, 2023
f8a0610
Temporarily revert "Propagate replacements for placeholder extents"
jacobhinkle Jun 2, 2023
8008ece
Merge branch 'main' into fix_issue418
jacobhinkle Jun 2, 2023
bd17931
Print dynamic fusion if fusion_ir_concretized is given
jacobhinkle Jun 6, 2023
9aaedec
Set proper output extent in dynamic cat
jacobhinkle Jun 6, 2023
735278c
Merge branch 'main' into fix_issue418
jacobhinkle Jun 6, 2023
140298f
Add failing PadBroadcast test
jacobhinkle Jun 10, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 35 additions & 3 deletions csrc/dynamic_transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -402,9 +402,35 @@ void DynamicTransformConcretizer::concretize() {
concretizeResize();

// Finally, propagate concretized domains
auto all_stmts = StmtSort::getStmts(info_.fusion(), true);

// We need to concretize all immediate outputs of all intermediate
// expressions, even those leading to dead-code branches. To do this, we
// insert all outputs of all intermediate expressions into "leaves". If we
// don't insert anything new, we are done. Otherwise, we traverse again
// treating these as outputs as well, which ensures the result is
// topologically sorted.
auto leaves = info_.fusion()->getTerminatingOutputs();
auto leaves_set =
std::unordered_set<Statement*>(leaves.begin(), leaves.end());
std::vector<Statement*> all_stmts;
bool inserted = true;
while (inserted) {
all_stmts = StmtSort::getStmts(info_.fusion(), leaves, true);
inserted = false;
for (auto stmt : all_stmts) {
if (stmt->isExpr()) {
for (auto o : stmt->as<Expr>()->outputs()) {
if (leaves_set.find(o) == leaves_set.end()) {
leaves.push_back(o);
leaves_set.insert(o);
inserted = true;
}
}
}
}
}
// Concretize all vals in the final vector
for (auto stmt : all_stmts) {
if (stmt->isA<Val>()) {
if (stmt->isVal()) {
mutate(stmt);
}
}
Expand Down Expand Up @@ -641,6 +667,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
// corresponding producer IDs

std::optional<IterType> id_type;
Val* extent = nullptr;

for (auto producer : ir_utils::filterByType<TensorView>(def->inputs())) {
PairwiseRootDomainMap root_map(producer, consumer);
Expand All @@ -663,6 +690,11 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
} else {
id_type = input_id->getIterType();
}

// Set extent expression based on producer, overwriting that of consumer
if (!extent) {
extent = input_id->extent();
}
}

TORCH_INTERNAL_ASSERT(
Expand All @@ -680,7 +712,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer(
consumer->toString());

auto concretized_id =
IterDomainBuilder(root_id).iter_type(*id_type).build();
IterDomainBuilder(root_id).extent(extent).iter_type(*id_type).build();

registerConcretization(root_id, concretized_id);
is_concretized = true;
Expand Down
5 changes: 4 additions & 1 deletion csrc/kernel_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,10 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor(
kernel_runtime->updateHeuristicsLaunchParams(new_heuristics.get());
} else {
// cache miss, need to re-build an optimized graph for this case

if (isDebugDumpEnabled(DebugDumpOption::FusionIrConcretized)) {
std::cout << "Fusion Before Concretization:" << std::endl;
fusion()->printMath();
}
// concretize fusion_ for use in this runtime
auto fusion = std::make_unique<Fusion>(*fusion_);
FusionGuard fg(fusion.get());
Expand Down
17 changes: 15 additions & 2 deletions csrc/ops/alias.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ TensorView* reshape(TensorView* inp_tv, const std::vector<Val*>& new_sizes) {
return static_reshape_output;
}

auto root_domain = ops::newOutputDomain({inp_tv}, inp_tv->dtype());
auto root_domain = ops::newOutputDomain({inp_tv});

// Create placeholder rfactor domain. Note it's not connected with the root
// domain.
Expand Down Expand Up @@ -632,7 +632,20 @@ TensorView* cat(const std::vector<TensorView*>& inputs, int64_t cat_dim) {
}

// Now all of resized_inputs have the same shape as the out tensor
auto out = ops::newOutputTV(resized_inputs, dtype);
// NOTE: ops::newOutputTV would not necessarily be able to infer that the
// padded dimensions are all of the same size. However, we know that they
// are constructed such that that is the case, so we can use
// ops::newOutputDomain and override the concatenated extent below.
auto out_domain = ops::newOutputDomain(resized_inputs);
// Override the concatenated dimension and insert an IterDomain with the true
// extent, if needed
if (!out_domain.at(cat_dim)->extent()->sameAs(concat_ext)) {
out_domain[cat_dim] =
IterDomainBuilder(out_domain.at(cat_dim)).extent(concat_ext).build();
}
auto out = IrBuilder::create<TensorView>(
IrBuilder::create<TensorDomain>(
out_domain, TensorDomain::getContiguityFilledWith(out_domain, true)),
dtype);
Comment on lines +635 to +648
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This fixes a breakage that occurred when I made the other changes. It makes the output extent of cat look as it should (e.g. (i0 + i2) + i4) instead of creating a new symbolic extent as was done previously, which complicated concretization.


IrBuilder::create<CatOp>(out, resized_inputs, cat_dim);

Expand Down
72 changes: 62 additions & 10 deletions csrc/ops/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,9 +198,7 @@ IterType promoteIterType(IterType type1, IterType type2) {
}
}

std::vector<IterDomain*> newOutputDomain(
const std::vector<Val*>& vals,
DataType dtype) {
std::vector<IterDomain*> newOutputDomain(const std::vector<Val*>& vals) {
std::vector<TensorView*> tvs;
for (auto val : vals) {
if (val->getValType() == ValType::TensorView) {
Expand All @@ -223,9 +221,10 @@ std::vector<IterDomain*> newOutputDomain(
std::vector<int64_t> start_offsets(out_domain.size(), 0);
std::vector<int64_t> stop_offsets(out_domain.size(), 0);
std::vector<Val*> extent_vals(out_domain.size(), nullptr);
std::vector<bool> mismatched_symbolic_extents(out_domain.size(), false);
std::vector<Val*> expanded_extent_vals(out_domain.size(), nullptr);
std::vector<c10::optional<IterType>> iter_types(
out_domain.size(), c10::nullopt);
std::vector<std::optional<IterType>> iter_types(
out_domain.size(), std::nullopt);

for (auto tv : tvs) {
auto dom = TensorDomain::noReductions(tv->getMaybeRFactorDomain());
Expand All @@ -236,6 +235,53 @@ std::vector<IterDomain*> newOutputDomain(
" dimensions but expected ",
out_domain.size());
for (const auto i : c10::irange(dom.size())) {
auto iter_type = dom[i]->getIterType();
auto prev_iter_type = iter_types[i];
if (prev_iter_type.has_value()) {
// Clang-tidy complains about unchecked access to optional value here
if (iter_type == IterType::Iteration &&
prev_iter_type.value() == IterType::Symbolic) {
// Prefer the Iteration extent, since Symbolic could be broadcast
extent_vals[i] = nullptr;
} else if (iter_type == IterType::Symbolic) {
switch (prev_iter_type.value()) {
case IterType::Iteration:
// Previously found Iteration domain, so ignore all Symbolic
// domains
continue;
case IterType::Symbolic:
if (extent_vals[i]->sameAs(dom[i]->extent())) {
// matching symbolic extent
continue;
} else {
// Mismatched symbolic input extents. Any one of the symbolic
// inputs could be a Broadcast or Iteration domain. Until
// concretization, we will not know which one holds the true
// extent (or whether they all are Broadcast, so that the output
// is also Broadcast). We record that these symbolic extents
// mismatched so that we can introduce a new symbolic extent
// later.
mismatched_symbolic_extents[i] = true;
}
break;
case IterType::Broadcast:
// Previously found only broadcast, so this will either also
// broadcast or resolve those broadcasts. If the expanded
// extent of any of the broadcasts is not 1, then it will need to
// match that of the dom[i]. In either case, prefer dom[i]'s
// extent, so clear iter_types[i] and extent_vals[i] so that the
// rest of this iteration will mark output as Symbolic.
iter_types[i] = std::nullopt;
extent_vals[i] = nullptr;
break;
default:
TORCH_CHECK(
false,
"Encountered unexpected IterType when creating new output domain: ",
prev_iter_type.value());
}
}
}
if (dom[i]->isBroadcast()) {
if (dom[i]->hasExpandedExtent()) {
expanded_extent_vals[i] =
Expand All @@ -244,9 +290,9 @@ std::vector<IterDomain*> newOutputDomain(
continue;
}
extent_vals[i] = promoteSize(extent_vals[i], dom[i]->extent());
if (iter_types[i].has_value()) {
if (prev_iter_type.has_value()) {
iter_types[i] =
promoteIterType(iter_types[i].value(), dom[i]->getIterType());
promoteIterType(prev_iter_type.value(), dom[i]->getIterType());
} else {
iter_types[i] = dom[i]->getIterType();
}
Expand All @@ -268,15 +314,21 @@ std::vector<IterDomain*> newOutputDomain(
}
}
for (const auto dim_i : c10::irange(out_domain.size())) {
auto iter_type = iter_types[dim_i];
if (iter_type == IterType::Symbolic && mismatched_symbolic_extents[dim_i]) {
// if we have a symbolic output but the input symbolic extents did not
// match, create a new extent
extent_vals[dim_i] = IrBuilder::create<Int>();
}
if (extent_vals[dim_i] != nullptr) {
TORCH_INTERNAL_ASSERT(
iter_types[dim_i].has_value(),
iter_type.has_value(),
"Could not deduce iter type for new tensor view.");
out_domain[dim_i] =
IterDomainBuilder(
IrBuilder::create<Int>(start_offsets[dim_i]), extent_vals[dim_i])
.stop_offset(IrBuilder::create<Int>(stop_offsets[dim_i]))
.iter_type(iter_types[dim_i].value())
.iter_type(iter_type.value())
.build();
} else {
out_domain[dim_i] = IterDomainBuilder(
Expand All @@ -292,7 +344,7 @@ std::vector<IterDomain*> newOutputDomain(
}

TensorView* newOutputTV(const std::vector<Val*>& vals, DataType dtype) {
auto out_domain = newOutputDomain(vals, dtype);
auto out_domain = newOutputDomain(vals);
return IrBuilder::create<TensorView>(
IrBuilder::create<TensorDomain>(
out_domain, TensorDomain::getContiguityFilledWith(out_domain, true)),
Expand Down
4 changes: 1 addition & 3 deletions csrc/ops/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,7 @@ Val* newScalar(ValType vtype, DataType dtype);

IterType promoteIterType(IterType type1, IterType type2);

std::vector<IterDomain*> newOutputDomain(
const std::vector<Val*>& vals,
DataType dtype);
std::vector<IterDomain*> newOutputDomain(const std::vector<Val*>& vals);

TensorView* newOutputTV(const std::vector<Val*>& vals, DataType dtype);

Expand Down
Loading