-
Notifications
You must be signed in to change notification settings - Fork 79
Use GetMetaData for stride computation
#649
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
42 commits
Select commit
Hold shift + click to select a range
faed8fb
Initial kernel input support
zasdfgbnm 55f5c37
Merge branch 'main' into kernel_inputs
zasdfgbnm ea58b44
format
zasdfgbnm 9a6786a
Merge branch 'kernel_inputs' of github.com:NVIDIA/Fuser into kernel_i…
zasdfgbnm 06645a3
cleanup
zasdfgbnm 48b2538
cleanup
zasdfgbnm 2080387
save
zasdfgbnm e09b45c
minimum set of inputs
zasdfgbnm 1cfcbc3
Merge branch 'main' of github.com:NVIDIA/Fuser into kernel_inputs
zasdfgbnm 8b9e980
cleanup
zasdfgbnm 72937d7
save
zasdfgbnm 246603a
Merge branch 'main' of github.com:NVIDIA/Fuser into kernel_inputs_exe…
zasdfgbnm d0cb6d5
fix
zasdfgbnm 5d9220d
save
zasdfgbnm 6060fb9
save
zasdfgbnm 1d6301d
save
zasdfgbnm b0ea760
outputs and global buffers as kernel inputs
zasdfgbnm 7eedc74
real migration
zasdfgbnm 5848305
tidy
zasdfgbnm 14d50b7
renamings
zasdfgbnm 86cde8d
cleanups
zasdfgbnm 38d6179
commenting
zasdfgbnm 3886732
more cleanup
zasdfgbnm a6e3524
save
zasdfgbnm c6ec315
revert kernel change
zasdfgbnm 18ca119
Merge branch 'main' into kernel_inputs_executor
zasdfgbnm 4183996
Merge branch 'kernel_inputs_executor' into metadata-for-stride-inference
zasdfgbnm a694f1a
comment
zasdfgbnm 50a70e4
Merge branch 'kernel_inputs_executor' into metadata-for-stride-inference
zasdfgbnm 9adafd3
Merge branch 'main' of github.com:NVIDIA/Fuser into metadata-for-stri…
zasdfgbnm 30292f7
fix
zasdfgbnm 27b5ad8
save both logical and alloc size and stride
zasdfgbnm 0ddb297
move code
zasdfgbnm 2d22305
without precomputed values
zasdfgbnm ca618be
fix ExprSimplifierTest
zasdfgbnm bc847b1
fix AllocationDomainTest
zasdfgbnm c815b87
fix MetadataAsTensor
zasdfgbnm 7e5dafe
fix LoopRotationTest
zasdfgbnm 1eb8338
fix other tests
zasdfgbnm b00d99b
Merge branch 'main' into metadata-for-stride-inference
zasdfgbnm cfc9b72
unchange
zasdfgbnm 2a21b8f
tidy
zasdfgbnm File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,303 +14,6 @@ | |
|
|
||
| namespace nvfuser { | ||
|
|
||
| namespace { | ||
|
|
||
| // Forward traverse from rFactor domain to allocation domain, compute frontier | ||
| // sizes and strides, validate that splits are divisible and merges are | ||
| // contiguous, and update active_ids_ correspondingly. | ||
| class ForwardTraverseFromRFactorToAlloc { | ||
| ExpressionEvaluator& ee_; | ||
| std::unordered_map<IterDomain*, std::pair<int64_t, int64_t>>& active_ids_; | ||
|
|
||
| void handle(Split* split) { | ||
| auto in = split->in(); | ||
| auto inner = split->inner(); | ||
| auto outer = split->outer(); | ||
| auto in_it = active_ids_.find(in); | ||
| // TORCH_INTERNAL_ASSERT(in_it != active_ids_.end()) | ||
| if (in_it == active_ids_.end()) { | ||
| // TODO: see [Allocation domain on both side of rFactor] | ||
| return; | ||
| } | ||
| auto [in_size, in_stride] = in_it->second; | ||
| auto factor = ee_.evaluate(split->factor()).as<int64_t>(); | ||
| TORCH_INTERNAL_ASSERT( | ||
| in_size % factor == 0, | ||
| "The rFactor domain and allocation domain of fusion input/output ", | ||
| "tensors must be a one-to-one map, therefore, ", | ||
| "non-divisible split is not allowed in allocation domain"); | ||
| TORCH_INTERNAL_ASSERT(active_ids_.erase(in) == 1); | ||
| TORCH_INTERNAL_ASSERT( | ||
| active_ids_ | ||
| .emplace(inner, std::pair<int64_t, int64_t>{factor, in_stride}) | ||
| .second); | ||
| TORCH_INTERNAL_ASSERT(active_ids_ | ||
| .emplace( | ||
| outer, | ||
| std::pair<int64_t, int64_t>{ | ||
| in_size / factor, in_stride * factor}) | ||
| .second); | ||
| } | ||
|
|
||
| void handle(Merge* merge) { | ||
| auto inner = merge->inner(); | ||
| auto outer = merge->outer(); | ||
| auto out = merge->out(); | ||
| auto inner_it = active_ids_.find(inner); | ||
| auto outer_it = active_ids_.find(outer); | ||
| // TORCH_INTERNAL_ASSERT(inner_it != active_ids_.end()) | ||
| // TORCH_INTERNAL_ASSERT(outer_it != active_ids_.end()) | ||
| if (inner_it == active_ids_.end() || outer_it == active_ids_.end()) { | ||
| // TODO: see [Allocation domain on both side of rFactor] | ||
| return; | ||
| } | ||
| auto [inner_size, inner_stride] = inner_it->second; | ||
| auto [outer_size, outer_stride] = outer_it->second; | ||
| TORCH_INTERNAL_ASSERT( | ||
| inner_stride * inner_size == outer_stride, | ||
| "The rFactor domain and allocation domain of fusion input/output ", | ||
| "tensors must be a one-to-one map, therefore, ", | ||
| "merging of discontiguous dimensions is not allowed in allocation domain"); | ||
| TORCH_INTERNAL_ASSERT(active_ids_.erase(inner) == 1); | ||
| TORCH_INTERNAL_ASSERT(active_ids_.erase(outer) == 1); | ||
| TORCH_INTERNAL_ASSERT(active_ids_ | ||
| .emplace( | ||
| out, | ||
| std::pair<int64_t, int64_t>{ | ||
| inner_size * outer_size, inner_stride}) | ||
| .second); | ||
| } | ||
|
|
||
| void handle(Expr* expr) { | ||
| if (auto split = dynamic_cast<Split*>(expr)) { | ||
| handle(split); | ||
| } else if (auto merge = dynamic_cast<Merge*>(expr)) { | ||
| handle(merge); | ||
| } else { | ||
| TORCH_INTERNAL_ASSERT( | ||
| false, "Unsupported transformation in allocation domain"); | ||
| } | ||
| } | ||
|
|
||
| public: | ||
| ForwardTraverseFromRFactorToAlloc( | ||
| ExpressionEvaluator& ee, | ||
| std::unordered_map<IterDomain*, std::pair<int64_t, int64_t>>& active_ids) | ||
| : ee_(ee), active_ids_(active_ids) {} | ||
|
|
||
| void run( | ||
| TensorView* tv, | ||
| const std::vector<IterDomain*>& rfactor, | ||
| const std::vector<IterDomain*>& alloc) { | ||
| auto forward_exprs = StmtSort::getExprsBetween( | ||
| tv->fusion(), | ||
| {rfactor.begin(), rfactor.end()}, | ||
| {alloc.begin(), alloc.end()}); | ||
| for (auto expr : forward_exprs) { | ||
| handle(expr); | ||
| } | ||
| } | ||
| }; | ||
|
|
||
| // Similar to ForwardTraverseFromRFactorToAlloc, but in the opposite direction. | ||
| class BackwardTraverseFromRFactorToAlloc { | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Moved to |
||
| at::Tensor tensor_; | ||
| ExpressionEvaluator& ee_; | ||
| std::unordered_map<IterDomain*, std::pair<int64_t, int64_t>>& active_ids_; | ||
|
|
||
| void handle(Split* split) { | ||
| auto in = split->in(); | ||
| auto inner = split->inner(); | ||
| auto outer = split->outer(); | ||
| auto inner_it = active_ids_.find(inner); | ||
| auto outer_it = active_ids_.find(outer); | ||
| // TORCH_INTERNAL_ASSERT(inner_it != active_ids_.end()) | ||
| // TORCH_INTERNAL_ASSERT(outer_it != active_ids_.end()) | ||
| if (inner_it == active_ids_.end() || outer_it == active_ids_.end()) { | ||
| // TODO: see [Allocation domain on both side of rFactor] | ||
| return; | ||
| } | ||
| auto [inner_size, inner_stride] = inner_it->second; | ||
| auto [outer_size, outer_stride] = outer_it->second; | ||
| TORCH_INTERNAL_ASSERT( | ||
| inner_stride * inner_size == outer_stride, | ||
| "The rFactor domain and allocation domain of fusion input/output ", | ||
| "tensors must be a one-to-one map, therefore, ", | ||
| "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"); | ||
| TORCH_INTERNAL_ASSERT(active_ids_.erase(inner) == 1); | ||
| TORCH_INTERNAL_ASSERT(active_ids_.erase(outer) == 1); | ||
| TORCH_INTERNAL_ASSERT(active_ids_ | ||
| .emplace( | ||
| in, | ||
| std::pair<int64_t, int64_t>{ | ||
| inner_size * outer_size, inner_stride}) | ||
| .second); | ||
| } | ||
|
|
||
| void handle(Merge* merge) { | ||
| auto inner = merge->inner(); | ||
| auto outer = merge->outer(); | ||
| auto out = merge->out(); | ||
| auto factor = ee_.evaluate(inner->extent()).as<int64_t>(); | ||
| auto out_it = active_ids_.find(out); | ||
| // TORCH_INTERNAL_ASSERT(out_it != active_ids_.end()) | ||
| if (out_it == active_ids_.end()) { | ||
| // TODO: see [Allocation domain on both side of rFactor] | ||
| return; | ||
| } | ||
| auto [out_size, out_stride] = out_it->second; | ||
| TORCH_INTERNAL_ASSERT( | ||
| out_size % factor == 0, | ||
| "The rFactor domain and allocation domain of fusion input/output ", | ||
| "tensors must be a one-to-one map, therefore, ", | ||
| "the size of the output must be divisible by the size of the inner dimension"); | ||
| TORCH_INTERNAL_ASSERT(active_ids_.erase(out) == 1); | ||
| TORCH_INTERNAL_ASSERT( | ||
| active_ids_ | ||
| .emplace(inner, std::pair<int64_t, int64_t>{factor, out_stride}) | ||
| .second); | ||
| TORCH_INTERNAL_ASSERT(active_ids_ | ||
| .emplace( | ||
| outer, | ||
| std::pair<int64_t, int64_t>{ | ||
| out_size / factor, out_stride * factor}) | ||
| .second); | ||
| } | ||
|
|
||
| void handle(Expr* expr) { | ||
| if (auto split = dynamic_cast<Split*>(expr)) { | ||
| handle(split); | ||
| } else if (auto merge = dynamic_cast<Merge*>(expr)) { | ||
| handle(merge); | ||
| } else { | ||
| TORCH_INTERNAL_ASSERT( | ||
| false, "Unsupported transformation in allocation domain"); | ||
| } | ||
| } | ||
|
|
||
| public: | ||
| BackwardTraverseFromRFactorToAlloc( | ||
| ExpressionEvaluator& ee, | ||
| std::unordered_map<IterDomain*, std::pair<int64_t, int64_t>>& active_ids) | ||
| : ee_(ee), active_ids_(active_ids) {} | ||
|
|
||
| void run( | ||
| TensorView* tv, | ||
| const std::vector<IterDomain*>& rfactor, | ||
| const std::vector<IterDomain*>& alloc) { | ||
| auto backward_exprs = StmtSort::getExprsBetween( | ||
| tv->fusion(), | ||
| {alloc.begin(), alloc.end()}, | ||
| {rfactor.begin(), rfactor.end()}); | ||
| std::reverse(backward_exprs.begin(), backward_exprs.end()); | ||
| for (auto expr : backward_exprs) { | ||
| handle(expr); | ||
| } | ||
| } | ||
| }; | ||
|
|
||
| } // namespace | ||
|
|
||
| // Given an ATen tensor, whose sizes and strides are w.r.t to the rFactor domain | ||
| // of its corresponding TensorView, compute the sizes and strides of the tensor | ||
| // with respect to its allocation domain. | ||
| // For example, if the rFactor domain is [I1, I2], and the allocation domain is | ||
| // [I2*I1], and the tensor's size is [5, 3] and stride is [2, 10], then the | ||
| // resulting size will be [15] and stride will be [2] | ||
| // Another example, if the rFactor domain is [I1*I2] and the allocation domain | ||
| // is [I1, I2], and the tensor's size is [15] and stride is [7], and the extent | ||
| // of I2 is 5, then the resulting size will be [3, 5] and stride will be [35, 7] | ||
| std::vector<std::pair<int64_t, int64_t>> | ||
| inferAndValidateAllocationSizesAndStrides( | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Moved to |
||
| const at::Tensor& tensor, | ||
| TensorView* tv, | ||
| ExpressionEvaluator& ee) { | ||
| if (tv == nullptr || !tv->hasAllocation()) { | ||
| // When tv is nullptr, or tv does not have allocation, the given sizes and | ||
| // strides should already be in the target format. So nothing to do here. | ||
| std::vector<std::pair<int64_t, int64_t>> result; | ||
| for (auto i : c10::irange(tensor.dim())) { | ||
| result.emplace_back(tensor.size(i), tensor.stride(i)); | ||
| } | ||
| return result; | ||
| } | ||
| const auto& alloc = | ||
| TensorDomain::noReductions(tv->getMaybeAllocationDomain()); | ||
| const auto& rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); | ||
|
|
||
| // active IDs and their shape and stride | ||
| std::unordered_map<IterDomain*, std::pair<int64_t, int64_t>> active_ids; | ||
| TORCH_INTERNAL_ASSERT((int64_t)rfactor.size() == tensor.dim()); | ||
| for (int64_t i : c10::irange((int64_t)rfactor.size())) { | ||
| auto rf_id = rfactor.at(i); | ||
| active_ids[rf_id] = {tensor.size(i), tensor.stride(i)}; | ||
| } | ||
|
|
||
| ForwardTraverseFromRFactorToAlloc(ee, active_ids).run(tv, rfactor, alloc); | ||
| BackwardTraverseFromRFactorToAlloc(ee, active_ids).run(tv, rfactor, alloc); | ||
|
|
||
| // Now active_ids should contain the final sizes and strides, unordered. We | ||
| // need to put them to the correct order. | ||
| std::vector<std::pair<int64_t, int64_t>> sizes_strides; | ||
| sizes_strides.reserve(alloc.size()); | ||
| for (auto i : c10::irange(alloc.size())) { | ||
| auto id = alloc.at(i); | ||
| sizes_strides.emplace_back(active_ids.at(id)); | ||
| } | ||
| // Validate final sizes and strides with contiguity | ||
| int64_t contiguous_stride = 1; | ||
| std::vector<std::optional<bool>> contiguity = tv->getContiguity(); | ||
| for (int64_t i = (int64_t)sizes_strides.size() - 1; i >= 0; i--) { | ||
| if (alloc.at(i)->isBroadcast()) { | ||
| continue; | ||
| } | ||
| while (!contiguity.back().has_value()) { | ||
| contiguity.pop_back(); | ||
| } | ||
| auto [size, stride] = sizes_strides.at(i); | ||
| TORCH_INTERNAL_ASSERT(!contiguity.empty()); | ||
| auto last_contiguity = contiguity.back(); | ||
| TORCH_INTERNAL_ASSERT( | ||
| last_contiguity.has_value(), | ||
| "I don't think this check makes sense, but unfortunately ", | ||
| "clang-tidy is not smart enough to infer from the context that this is always true."); | ||
| if (*last_contiguity) { | ||
| TORCH_CHECK( | ||
| stride == contiguous_stride, | ||
| "Stride mismatch with contiguity info. ", | ||
| "tv: ", | ||
| tv->toString(), | ||
| " allocation domain: ", | ||
| ir_utils::toString(tv->getMaybeAllocationDomain()), | ||
| " dim: ", | ||
| i, | ||
| " expected stride: ", | ||
| contiguous_stride, | ||
| " actual stride: ", | ||
| stride); | ||
| } | ||
| contiguous_stride = stride * size; | ||
| contiguity.pop_back(); | ||
| } | ||
| TORCH_INTERNAL_ASSERT( | ||
| contiguity.empty(), | ||
| "The size of contiguity mismatch with the dimensionality of allocation domain"); | ||
| // Validate that for expanded broadcast, the stride must be zero. | ||
| for (int64_t i : c10::irange((int64_t)sizes_strides.size())) { | ||
| if (auto alloc_id = alloc.at(i); alloc_id->hasExpandedExtent()) { | ||
| auto [_, stride] = sizes_strides.at(i); | ||
| TORCH_CHECK( | ||
| stride == 0, | ||
| "Expecting an expanded dimension on dimension ", | ||
| i, | ||
| " but found stride ", | ||
| stride); | ||
| } | ||
| } | ||
| return sizes_strides; | ||
| } | ||
|
|
||
| PrimDataType TensorArgAbstract::getSmallestIndexType() const { | ||
| KernelIndexTypeCompute index_type_helper; | ||
| for (const auto dim_i : c10::irange(tensor_.ndimension())) { | ||
|
|
@@ -637,8 +340,8 @@ std::vector<std::byte> getTensorArgBuffer( | |
| auto struct_ = metadata.as<Struct>(); | ||
| std::vector<std::byte> buffer; | ||
| void* ptr = (void*)struct_["data"]; | ||
| std::vector<int64_t> sizes = (std::vector<int64_t>)struct_["size"]; | ||
| std::vector<int64_t> strides = (std::vector<int64_t>)struct_["stride"]; | ||
| std::vector<int64_t> sizes = (std::vector<int64_t>)struct_["logical_size"]; | ||
| std::vector<int64_t> strides = (std::vector<int64_t>)struct_["alloc_stride"]; | ||
| if (index_type == PrimDataType::Int) { | ||
| buffer.reserve( | ||
| sizeof(ptr) + sizeof(int64_t) * (sizes.size() + strides.size())); | ||
|
|
@@ -686,10 +389,8 @@ std::vector<std::byte> getKernelArgument( | |
| (std::byte*)tensor.data_ptr(), | ||
| (std::byte*)tensor.data_ptr() + tensor.element_size()); | ||
| } else { | ||
| auto resolved_arg = getTensorArg(tensor, tv, ee, index_type); | ||
| return std::vector<std::byte>( | ||
| (std::byte*)resolved_arg->arg(), | ||
| (std::byte*)resolved_arg->arg() + resolved_arg->argSize()); | ||
| auto metadata = ee.evaluate(IrBuilder::metadataExpr(tv)); | ||
| return getTensorArgBuffer(metadata, index_type); | ||
| } | ||
| } else if (isIntegralType(parameter->dtype())) { | ||
| int64_t v = pv.as<int64_t>(); | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moved to
tensor_metadata.cpp unchanged.