From a05351165811f75d0c15074190b5378fc8be369d Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 24 Feb 2025 06:20:19 -0800 Subject: [PATCH 01/35] Store alloc and logical info in global buffer info. --- csrc/runtime/allocations.cpp | 57 ++++++++++++++++++++++++++-------- csrc/runtime/allocations.h | 15 +++++++-- csrc/runtime/executor.cpp | 60 +++++++++++++++++++++++++----------- csrc/serde/fusion_cache.fbs | 6 ++-- 4 files changed, 104 insertions(+), 34 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index b3bc66a1947..75f3e4fcabb 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -284,8 +284,8 @@ at::Tensor allocateTensor( switch (alias_info.type) { case AllocationType::New: { auto alloc_tensor = at::native::empty_strided_cuda( - out_info.sizes, - out_info.strides, + out_info.shape_info.logical_sizes, + out_info.shape_info.logical_strides, out_info.type, c10::nullopt, device, @@ -380,7 +380,7 @@ GlobalBufferInfo getBufferInfo( FUSER_PERF_SCOPE("fusion_executor::allocations::getBufferInfo"); GlobalBufferInfo info; info.tv = tv; - std::tie(info.sizes, info.strides) = inferShapeOfOutput(info.tv, expr_eval); + info.shape_info = inferTensorShapes(info.tv, expr_eval); auto dtype = (info.tv->dtype() == DataType::Index ? index_dtype : info.tv->dtype()); info.type = data_type_to_aten(dtype); @@ -711,16 +711,9 @@ at::Tensor transformFromAllocationToLogical( return tensor.permute(dims); } -} // namespace - -std::pair, std::vector> inferShapeOfOutput( +std::pair, std::vector> inferAllocationShape( TensorView* tv, const ExpressionEvaluator& expr_eval) { - FUSER_PERF_SCOPE("fusion_executor::allocations::inferShapeOfOutput"); - // Fusion outputs do not come with Allocate and - // need to be allocated while taking expanded broadcasts into - // account. 
- std::vector symbolic_sizes; std::vector expand_flags; @@ -745,8 +738,20 @@ std::pair, std::vector> inferShapeOfOutput( expand_flags.push_back(false); } } + return inferShape(tv, symbolic_sizes, expand_flags, expr_eval); +} + +} // namespace - auto size_stride = inferShape(tv, symbolic_sizes, expand_flags, expr_eval); +std::pair, std::vector> inferShapeOfOutput( + TensorView* tv, + const ExpressionEvaluator& expr_eval) { + FUSER_PERF_SCOPE("fusion_executor::allocations::inferShapeOfOutput"); + // Fusion outputs do not come with Allocate and + // need to be allocated while taking expanded broadcasts into + // account. + + auto size_stride = inferAllocationShape(tv, expr_eval); if (!tv->hasAllocation()) { return size_stride; } @@ -761,4 +766,32 @@ std::pair, std::vector> inferShapeOfOutput( return {meta_tensor.sizes().vec(), meta_tensor.strides().vec()}; } +TensorShapeInfo inferTensorShapes( + TensorView* tv, + const ExpressionEvaluator& expr_eval) { + auto allocation_size_stride = inferAllocationShape(tv, expr_eval); + if (!tv->hasAllocation()) { + return TensorShapeInfo{ + allocation_size_stride.first, + allocation_size_stride.second, + allocation_size_stride.first, + allocation_size_stride.second}; + } + + auto options = + c10::TensorOptions().device(c10::Device(c10::DeviceType::Meta)); + auto logical_meta_tensor = at::empty_strided( + allocation_size_stride.first, allocation_size_stride.second, options); + // TODO(jiej): we should refactor it here, there's no need to use + // logical_meta_tensor at all, size + stride should be used directly in the + // `transformFromAllocationToLogical` + logical_meta_tensor = + transformFromAllocationToLogical(logical_meta_tensor, tv, expr_eval); + return { + logical_meta_tensor.sizes().vec(), + logical_meta_tensor.strides().vec(), + allocation_size_stride.first, + allocation_size_stride.second}; +} + } // namespace nvfuser diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 01fb9a041c1..8b4ca523e9d 
100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -15,10 +15,16 @@ namespace nvfuser { +struct TensorShapeInfo { + std::vector logical_sizes; + std::vector logical_strides; + std::vector allocation_sizes; + std::vector allocation_strides; +}; + struct GlobalBufferInfo { TensorView* tv = nullptr; - std::vector sizes; - std::vector strides; + TensorShapeInfo shape_info; at::ScalarType type = at::ScalarType::Undefined; bool zero_init = false; bool resets_to_zero = false; @@ -61,6 +67,11 @@ std::pair, std::vector> inferShapeOfOutput( TensorView* tv, const ExpressionEvaluator& expr_eval); +// Infer the sizes and strides of an output tensor +TensorShapeInfo inferTensorShapes( + TensorView* tv, + const ExpressionEvaluator& expr_eval); + // Allocate an `at::Tensor` for `out_info` or compute it as an alias. at::Tensor allocateTensor( const GlobalBufferInfo& out_info, diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 54c3b8bf61c..88911015cc5 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -480,9 +480,13 @@ std::vector KernelExecutor::getIntermediateBufferInfo( tv->getMaybeAllocationDomain().begin(), tv->getMaybeAllocationDomain().end(), [](IterDomain* id) { return id->hasExpandedExtent(); }); - std::tie(info.sizes, info.strides) = has_expanded_domains + auto [sizes, strides] = has_expanded_domains ? inferShapeOfOutput(tv, expr_eval) : inferShapeOfIntermediate(tv, alloc, expr_eval); + info.shape_info.allocation_sizes = sizes; + info.shape_info.allocation_strides = strides; + info.shape_info.logical_sizes = sizes; + info.shape_info.logical_strides = strides; auto dtype = (tv->dtype() == DataType::Index ? 
index_type : tv->dtype()); info.type = data_type_to_aten(dtype); @@ -664,10 +668,13 @@ void KernelExecutor::initializeExecutorEntry( // future uses of this ExecutorEntry may not be provided with // allocated outputs for (const auto& output : outputs) { - output_info.emplace_back(GlobalBufferInfo{ - .sizes = output.sizes().vec(), - .strides = output.strides().vec(), - .type = output.scalar_type()}); + GlobalBufferInfo info; + info.type = output.scalar_type(); + info.shape_info.allocation_sizes = output.sizes().vec(); + info.shape_info.allocation_strides = output.strides().vec(); + info.shape_info.logical_sizes = output.sizes().vec(); + info.shape_info.logical_strides = output.strides().vec(); + output_info.emplace_back(info); } } @@ -998,14 +1005,17 @@ std::vector KernelExecutor::run( const auto& buf_info = executor_entry->intermediates.at(i); bool has_expansion = false; std::vector unexpanded_sizes; - unexpanded_sizes.reserve(buf_info.sizes.size()); - NVF_ERROR(buf_info.sizes.size() == buf_info.strides.size()) - for (const auto j : c10::irange(buf_info.sizes.size())) { - if (buf_info.strides[j] == 0) { + unexpanded_sizes.reserve(buf_info.shape_info.allocation_sizes.size()); + NVF_ERROR( + buf_info.shape_info.allocation_sizes.size() == + buf_info.shape_info.allocation_strides.size()) + for (const auto j : + c10::irange(buf_info.shape_info.allocation_sizes.size())) { + if (buf_info.shape_info.allocation_strides[j] == 0) { has_expansion = true; unexpanded_sizes.push_back(1L); } else { - unexpanded_sizes.push_back(buf_info.sizes[j]); + unexpanded_sizes.push_back(buf_info.shape_info.allocation_sizes[j]); } } at::Tensor intermediate_buffer; @@ -1036,8 +1046,8 @@ std::vector KernelExecutor::run( } } if (has_expansion) { - intermediate_buffer = - at::native::expand(intermediate_buffer, buf_info.sizes); + intermediate_buffer = at::native::expand( + intermediate_buffer, buf_info.shape_info.allocation_sizes); } args.push(intermediate_buffer); 
intermediates.push_back(intermediate_buffer); @@ -1309,8 +1319,10 @@ flatbuffers::Offset KernelExecutor::serialize( return serde::CreateGlobalBufferInfoDirect( builder, tv_position, - &data.sizes, - &data.strides, + &data.shape_info.logical_sizes, + &data.shape_info.logical_strides, + &data.shape_info.allocation_sizes, + &data.shape_info.allocation_strides, nvfuser::toUnderlying(data.type), data.zero_init, data.resets_to_zero, @@ -1430,14 +1442,26 @@ GlobalBufferInfo KernelExecutor::deserialize( info.tv = dynamic_cast(out_val->buffer()); } - for (auto dim_size : *buffer->sizes()) { - info.sizes.emplace_back(dim_size); + TensorShapeInfo shape_info; + + for (auto dim_size : *buffer->logical_sizes()) { + shape_info.logical_sizes.emplace_back(dim_size); + } + + for (auto dim_stride : *buffer->logical_strides()) { + shape_info.logical_strides.emplace_back(dim_stride); } - for (auto dim_stride : *buffer->strides()) { - info.strides.emplace_back(dim_stride); + for (auto dim_size : *buffer->alloc_sizes()) { + shape_info.allocation_sizes.emplace_back(dim_size); } + for (auto dim_stride : *buffer->alloc_strides()) { + shape_info.allocation_strides.emplace_back(dim_stride); + } + + info.shape_info = shape_info; + info.type = serde::mapToAtenDtype(buffer->dtype()); info.zero_init = buffer->zero_init(); info.resets_to_zero = buffer->resets_to_zero(); diff --git a/csrc/serde/fusion_cache.fbs b/csrc/serde/fusion_cache.fbs index 562f35d9e59..30a75729147 100644 --- a/csrc/serde/fusion_cache.fbs +++ b/csrc/serde/fusion_cache.fbs @@ -215,8 +215,10 @@ table LaunchParams { // For intermediate tensors, we use its position in the KernelSummary global_allocations. 
table GlobalBufferInfo { tv : long = -1; - sizes : [long]; - strides : [long]; + logical_sizes : [long]; + logical_strides : [long]; + alloc_sizes : [long]; + alloc_strides : [long]; dtype : long; zero_init : bool; resets_to_zero : bool; From c6eb652a0a50e65f1e3186eb875ce034dba9b3a8 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 24 Feb 2025 07:26:22 -0800 Subject: [PATCH 02/35] Draft removing expr_eval from KernelExecutor::run. --- csrc/runtime/allocations.cpp | 42 +++++++++++ csrc/runtime/allocations.h | 6 ++ csrc/runtime/executor.cpp | 105 ++++++++++++++++++++++----- csrc/runtime/executor.h | 7 ++ csrc/runtime/executor_kernel_arg.cpp | 32 ++++++++ csrc/runtime/executor_kernel_arg.h | 5 ++ csrc/tensor_metadata.cpp | 3 +- 7 files changed, 181 insertions(+), 19 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 75f3e4fcabb..53da6c08728 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -372,6 +372,48 @@ std::vector allocateOutputs( return out_tensors; } +std::vector allocateKernelOutputs( + const Fusion* fusion, + const std::vector& output_infos, + const c10::Device& device, + const KernelArgumentHolder& args) { + FUSER_PERF_SCOPE("fusion_executor::allocations::allocateOutputs"); + + NVF_ERROR( + std::any_of( + output_infos.output_aliased_to_output.begin(), + output_infos.output_aliased_to_output.end(), + [](int idx) { return idx != -1; }), + "Kernel's don't support output to output aliasing."); + + std::vector out_tensors; + out_tensors.reserve(output_infos.size()); + for (auto out_idx : c10::irange(output_infos.size())) { + auto out_info = output_infos.at(out_idx); + if (output_infos.output_aliased_to_input.at(out_info.tv->startIdx()) == + -1) { + auto alloc_tensor = at::native::empty_strided_cuda( + out_info.shape_info.logical_sizes, + out_info.shape_info.logical_strides, + out_info.type, + c10::nullopt, + device, + c10::nullopt); + if (shouldFillAllocationWithNan()) { + 
fillTensorWithNan(alloc_tensor); + } + out_tensors.emplace_back(alloc_tensor); + } else { + auto input_arg = args[output_infos.output_aliased_to_input.at(out_idx)]; + NVF_ERROR( + input_arg.is(), + "Aliased input argument is not a tensor."); + out_tensors.emplace_back(input_arg.as()); + } + } + return out_tensors; +} + namespace { GlobalBufferInfo getBufferInfo( ExpressionEvaluator& expr_eval, diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 8b4ca523e9d..06333a91caa 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -87,6 +87,12 @@ std::vector allocateOutputs( const c10::Device& device, ExpressionEvaluator& ee); +std::vector allocateKernelOutputs( + const Fusion* fusion, + const std::vector& output_infos, + const c10::Device& device, + const KernelArgumentHolder& args); + //! Return information necessary for allocating output tensors. Input //! and output tensors are allowed to alias each other, which is //! specified by the list of int pairs of input and output indices diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 88911015cc5..f8b2742b8ae 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -667,9 +667,16 @@ void KernelExecutor::initializeExecutorEntry( // Need to save the information necessary for allocations as // future uses of this ExecutorEntry may not be provided with // allocated outputs - for (const auto& output : outputs) { + for (auto output_idx : c10::irange(outputs.size())) { + const auto& output = outputs[output_idx]; GlobalBufferInfo info; info.type = output.scalar_type(); + info.tv = compiled_kernel_->kernel()->outputs()[output_idx]; + NVF_ERROR( + !info.tv->hasAllocation(), + "Accepting allocated outputs is not currently supported with allocation domain. 
", + "Allocation domain found for tv: ", + info.tv->toString()); info.shape_info.allocation_sizes = output.sizes().vec(); info.shape_info.allocation_strides = output.strides().vec(); info.shape_info.logical_sizes = output.sizes().vec(); @@ -678,11 +685,49 @@ void KernelExecutor::initializeExecutorEntry( } } + std::vector output_aliased_to_input(outputs.size(), -1); + std::vector output_aliased_to_output(outputs.size(), -1); + + for (auto output_idx : c10::irange(outputs.size())) { + auto out_info = output_info[output_idx]; + auto fusion = compiled_kernel_->kernel()->as(); + auto alias_info = fusion->getOutputAliasInfo(out_info.tv); + NVF_ERROR( + alias_info.type != AllocationType::Evaluate, + "Outputs should not be evaluate type for kernels."); + if (alias_info.type == AllocationType::New) { + continue; + } + auto aliased_to = alias_info.aliased_io->as(); + auto aliased_to_idx = + std::find( + fusion->inputs().begin(), fusion->inputs().end(), aliased_to) - + fusion->inputs().begin(); + if (aliased_to_idx < fusion->inputs().size()) { + output_aliased_to_input[output_idx] = aliased_to_idx; + } else { + output_aliased_to_output[output_idx] = + std::find( + fusion->outputs().begin(), fusion->outputs().end(), aliased_to) - + fusion->outputs().begin(); + NVF_ERROR( + output_aliased_to_output[output_idx] < fusion->outputs().size(), + "Alias found but is not an output or input of the fusion. ", + "Aliased to tv: ", + aliased_to->toString(), + "\nFusion Inputs:\n ", + fusion->inputs(), + "\nFusion Outputs:\n ", + fusion->outputs()); + } + } + auto intermediates = getIntermediateBufferInfo(expr_eval, index_type); // All information is gathered. 
Save it to ExecutorEntry executor_entry.launch_params = launch_params; executor_entry.outputs = output_info; + executor_entry.output_aliased_to = output_aliased_to; executor_entry.intermediates = intermediates; executor_entry.init = true; } @@ -782,6 +827,32 @@ void KernelExecutor::computeArgs( } } +// set the arguments that we'll pass to cuLaunchKernel +// TODO: Add to header +void KernelExecutor::computeArgs2( + ExecutorEntry& entry, + const std::vector& outputs, + const std::vector& intermediates) const { + FUSER_PERF_SCOPE("KernelExecutor::computeArgs2"); + + entry.args.resize(outputs.size() + intermediates.size()); + entry.arg_ptrs.resize(outputs.size() + intermediates.size()); + + NVF_ERROR(entry.outputs.size() == outputs.size(), "Outputs size mismatch"); + NVF_ERROR(entry.intermediates.size() == intermediates.size(), "Intermediates size mismatch"); + + const PrimDataType idx_type = compiled_kernel_->kernel()->indexType(); + for (size_t out_idx = 0; out_idx < outputs.size(); ++out_idx) { + entry.args[out_idx] = getKernelArgument(outputs[out_idx], entry.outputs[out_idx], idx_type); + entry.arg_ptrs[out_idx] = entry.args[out_idx].data(); + } + + for (size_t inter_idx = 0; inter_idx < intermediates.size(); ++inter_idx) { + entry.args[out_idx + inter_idx] = getKernelArgument(intermediates[inter_idx], entry.intermediates[inter_idx], idx_type); + entry.arg_ptrs[out_idx + inter_idx] = entry.args[out_idx + inter_idx].data(); + } +} + // Reset the arguments that we'll pass to cuLaunchKernel. This needs to be // invoked on every shape change. 
void KernelExecutor::recomputeArgs( @@ -972,15 +1043,15 @@ std::vector KernelExecutor::run( at::AutoDispatchBelowADInplaceOrView non_variable_type_mode; // Bind fusion inputs - auto expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); + // auto expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); // only allocate outputs when not given if (outputs.empty()) { - outputs = allocateOutputs( + outputs = allocateKernelOutputs( compiled_kernel_->kernel()->as(), executor_entry->outputs, compiled_kernel_->device(), - expr_eval); + args); } args.push(outputs); @@ -993,8 +1064,8 @@ std::vector KernelExecutor::run( // Skip trivially forwarded outputs because they are just placeholders continue; } - expr_eval.bind( - output, args[compiled_kernel_->kernel()->inputs().size() + i]); + // expr_eval.bind( + // output, args[compiled_kernel_->kernel()->inputs().size() + i]); } std::vector intermediates; @@ -1051,23 +1122,21 @@ std::vector KernelExecutor::run( } args.push(intermediate_buffer); intermediates.push_back(intermediate_buffer); - expr_eval.bind( - compiled_kernel_->kernel() - ->summary() - .global_allocations.at(i) - ->buffer(), - args - [compiled_kernel_->kernel()->inputs().size() + outputs.size() + - i]); + // expr_eval.bind( + // compiled_kernel_->kernel() + // ->summary() + // .global_allocations.at(i) + // ->buffer(), + // args + // [compiled_kernel_->kernel()->inputs().size() + outputs.size() + + // i]); if (buf_info.is_profile_buffer) { profile_buffer = intermediate_buffer; } } } - if (executor_entry->args.empty()) { - computeArgs(*executor_entry, expr_eval, compiled_kernel_->kernel()); - } + computeArgs2(*executor_entry, outputs, intermediates); if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { launch_params_.print(); @@ -1093,7 +1162,7 @@ std::vector KernelExecutor::run( FUSER_PERF_SCOPE("KernelExecutor::runFusion::execute_kernel"); ensureAvailableDynamicSmemSize(executor_entry->launch_params.smem()); - 
recomputeArgs(*executor_entry, expr_eval, compiled_kernel_->kernel()); + // recomputeArgs(*executor_entry, expr_eval, compiled_kernel_->kernel()); if (isDebugDumpEnabled(DebugDumpOption::Occupancy) || isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index bdce798369b..ece1c2987e3 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -122,6 +122,10 @@ class KernelExecutor : public ExecutorAbstract { bool init = false; LaunchParams launch_params; std::vector outputs; + // If an output is aliased to an input, this will hold the index of the + // input that it is aliased to. If not aliased, it will hold -1. + std::vector output_aliased_to_input; + std::vector output_aliased_to_output; // Temporary work buffers and intemediate global-memory tensors std::vector intermediates; // The arguments to the kernel. These are configured in computeArgs and @@ -228,6 +232,9 @@ class KernelExecutor : public ExecutorAbstract { // to we have now. void computeArgs(ExecutorEntry&, ExpressionEvaluator&, const kir::Kernel*) const; + + void computeArgs2(ExecutorEntry& entry, const std::vector& outputs) const; + // Updates an existing set of arguments based on the current arguments. It is // is an error to call this before `computeArgs` has been invoked. 
// recomputeArgs will fail if the arity of the function changes, or the rank diff --git a/csrc/runtime/executor_kernel_arg.cpp b/csrc/runtime/executor_kernel_arg.cpp index 8a0fc4fe7ce..9197cd63771 100644 --- a/csrc/runtime/executor_kernel_arg.cpp +++ b/csrc/runtime/executor_kernel_arg.cpp @@ -377,6 +377,38 @@ std::vector getKernelArgument( return polymorphicValueToBytes(pv, parameter->dtype(), index_type); } +std::vector getKernelArgument( + at::Tensor tensor, + const GlobalBufferInfo& output_info, + PrimDataType index_type) { + FUSER_PERF_SCOPE("getKernelArgument"); + + NVF_ERROR( + tensor.is_cuda() || tensor.is_meta(), + "GetMetaData expects a CUDA/meta tensor as input, but got: ", + tensor); + + std::shared_ptr struct_ = std::make_shared(); + TensorMetaData* metadata = (TensorMetaData*)struct_.get(); + metadata->dtype = + std::get(aten_to_data_type(tensor.scalar_type()).type); + metadata->data = tensor.data_ptr(); + + metadata->logical_size_data = output_info.shape_info.logical_sizes; + metadata->logical_size = c10::makeArrayRef(metadata->logical_size_data); + metadata->logical_stride_data = output_info.shape_info.logical_strides; + metadata->logical_stride = c10::makeArrayRef(metadata->logical_stride_data); + metadata->alloc_size_data = output_info.shape_info.allocation_sizes; + metadata->alloc_size = c10::makeArrayRef(metadata->alloc_size_data); + metadata->alloc_stride_data = output_info.shape_info.allocation_strides; + metadata->alloc_stride = c10::makeArrayRef(metadata->alloc_stride_data); + + return polymorphicValueToBytes( + PolymorphicValue(std::move(struct_)), + output_info.tv->dtype(), + index_type); +} + int64_t computeBytes(const KernelArgumentHolder& args) { int64_t num_bytes = 0; // Figure how many bytes are inputs, outputs, and temporary buffers diff --git a/csrc/runtime/executor_kernel_arg.h b/csrc/runtime/executor_kernel_arg.h index e4a8c6b4ea9..a583104f66c 100644 --- a/csrc/runtime/executor_kernel_arg.h +++ b/csrc/runtime/executor_kernel_arg.h 
@@ -222,6 +222,11 @@ std::vector getKernelArgument( Val* parameter, PrimDataType index_type); +std::vector getKernelArgument( + at::Tensor tensor, + const GlobalBufferInfo& output_info, + PrimDataType index_type); + int64_t computeBytes(const KernelArgumentHolder& args); int64_t computeBytes(const std::vector& outputs); diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp index e82aeef08c0..fd7d4a60df2 100644 --- a/csrc/tensor_metadata.cpp +++ b/csrc/tensor_metadata.cpp @@ -356,7 +356,8 @@ std::vector GetMetaData::evaluate( } else { metadata->logical_size = input.sizes(); } - metadata->logical_stride = input.strides(); + metadata->logical_stride_data = input.strides(); + metadata->logical_stride = c10::makeArrayRef(metadata->logical_stride_data); auto [allocation_sizes, allocation_strides] = inferAndValidateAllocationSizesAndStrides(input, tv, ee); From 543f9107be6c4ec15250c8d9dd1d83fa89a2fcea Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 24 Feb 2025 07:27:42 -0800 Subject: [PATCH 03/35] Lint. 
--- csrc/runtime/executor.cpp | 16 +++++++++++----- csrc/runtime/executor.h | 6 ++++-- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index f8b2742b8ae..dbd231f6d69 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -839,17 +839,22 @@ void KernelExecutor::computeArgs2( entry.arg_ptrs.resize(outputs.size() + intermediates.size()); NVF_ERROR(entry.outputs.size() == outputs.size(), "Outputs size mismatch"); - NVF_ERROR(entry.intermediates.size() == intermediates.size(), "Intermediates size mismatch"); + NVF_ERROR( + entry.intermediates.size() == intermediates.size(), + "Intermediates size mismatch"); const PrimDataType idx_type = compiled_kernel_->kernel()->indexType(); for (size_t out_idx = 0; out_idx < outputs.size(); ++out_idx) { - entry.args[out_idx] = getKernelArgument(outputs[out_idx], entry.outputs[out_idx], idx_type); + entry.args[out_idx] = + getKernelArgument(outputs[out_idx], entry.outputs[out_idx], idx_type); entry.arg_ptrs[out_idx] = entry.args[out_idx].data(); } for (size_t inter_idx = 0; inter_idx < intermediates.size(); ++inter_idx) { - entry.args[out_idx + inter_idx] = getKernelArgument(intermediates[inter_idx], entry.intermediates[inter_idx], idx_type); - entry.arg_ptrs[out_idx + inter_idx] = entry.args[out_idx + inter_idx].data(); + entry.args[out_idx + inter_idx] = getKernelArgument( + intermediates[inter_idx], entry.intermediates[inter_idx], idx_type); + entry.arg_ptrs[out_idx + inter_idx] = + entry.args[out_idx + inter_idx].data(); } } @@ -1043,7 +1048,8 @@ std::vector KernelExecutor::run( at::AutoDispatchBelowADInplaceOrView non_variable_type_mode; // Bind fusion inputs - // auto expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); + // auto expr_eval = executor_utils::bindInputs(args, + // compiled_kernel_->kernel()); // only allocate outputs when not given if (outputs.empty()) { diff --git a/csrc/runtime/executor.h 
b/csrc/runtime/executor.h index ece1c2987e3..e0b016423f2 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -233,8 +233,10 @@ class KernelExecutor : public ExecutorAbstract { void computeArgs(ExecutorEntry&, ExpressionEvaluator&, const kir::Kernel*) const; - void computeArgs2(ExecutorEntry& entry, const std::vector& outputs) const; - + void computeArgs2( + ExecutorEntry& entry, + const std::vector& outputs) const; + // Updates an existing set of arguments based on the current arguments. It is // is an error to call this before `computeArgs` has been invoked. // recomputeArgs will fail if the arity of the function changes, or the rank From f3d4320c0bf24cd07ece855cb15792a3fa85faeb Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 24 Feb 2025 08:05:22 -0800 Subject: [PATCH 04/35] Build fixes. --- csrc/runtime/allocations.cpp | 18 +++--- csrc/runtime/allocations.h | 4 +- csrc/runtime/executor.cpp | 61 +++++++++--------- csrc/runtime/executor.h | 95 +++++++++++++++------------- csrc/runtime/executor_kernel_arg.cpp | 1 + csrc/runtime/executor_kernel_arg.h | 2 + csrc/serde/fusion_cache.fbs | 8 +-- csrc/tensor_metadata.cpp | 3 +- 8 files changed, 103 insertions(+), 89 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 53da6c08728..89aed454a4f 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -374,24 +375,23 @@ std::vector allocateOutputs( std::vector allocateKernelOutputs( const Fusion* fusion, - const std::vector& output_infos, + const KernelExecutorEntry& entry, const c10::Device& device, const KernelArgumentHolder& args) { FUSER_PERF_SCOPE("fusion_executor::allocations::allocateOutputs"); NVF_ERROR( std::any_of( - output_infos.output_aliased_to_output.begin(), - output_infos.output_aliased_to_output.end(), + entry.output_aliased_to_output.begin(), + entry.output_aliased_to_output.end(), 
[](int idx) { return idx != -1; }), "Kernel's don't support output to output aliasing."); std::vector out_tensors; - out_tensors.reserve(output_infos.size()); - for (auto out_idx : c10::irange(output_infos.size())) { - auto out_info = output_infos.at(out_idx); - if (output_infos.output_aliased_to_input.at(out_info.tv->startIdx()) == - -1) { + out_tensors.reserve(entry.outputs.size()); + for (auto out_idx : c10::irange(entry.outputs.size())) { + auto out_info = entry.outputs.at(out_idx); + if (entry.output_aliased_to_input.at(out_idx) == -1) { auto alloc_tensor = at::native::empty_strided_cuda( out_info.shape_info.logical_sizes, out_info.shape_info.logical_strides, @@ -404,7 +404,7 @@ std::vector allocateKernelOutputs( } out_tensors.emplace_back(alloc_tensor); } else { - auto input_arg = args[output_infos.output_aliased_to_input.at(out_idx)]; + auto input_arg = args[entry.output_aliased_to_input.at(out_idx)]; NVF_ERROR( input_arg.is(), "Aliased input argument is not a tensor."); diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 06333a91caa..999692f936e 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -15,6 +15,8 @@ namespace nvfuser { +struct KernelExecutorEntry; + struct TensorShapeInfo { std::vector logical_sizes; std::vector logical_strides; @@ -89,7 +91,7 @@ std::vector allocateOutputs( std::vector allocateKernelOutputs( const Fusion* fusion, - const std::vector& output_infos, + const KernelExecutorEntry& entry, const c10::Device& device, const KernelArgumentHolder& args); diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index dbd231f6d69..61ad8992bc2 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -616,7 +616,7 @@ void dumpKernelArgs( } // namespace void KernelExecutor::initializeExecutorEntry( - ExecutorEntry& executor_entry, + KernelExecutorEntry& executor_entry, const KernelArgumentHolder& args, const LaunchParams& launch_constraints, const CompileParams& 
compile_params, @@ -665,13 +665,15 @@ void KernelExecutor::initializeExecutorEntry( expr_eval, index_type, compiled_kernel_->kernel()->outputs()); } else { // Need to save the information necessary for allocations as - // future uses of this ExecutorEntry may not be provided with + // future uses of this KernelExecutorEntry may not be provided with // allocated outputs for (auto output_idx : c10::irange(outputs.size())) { const auto& output = outputs[output_idx]; GlobalBufferInfo info; info.type = output.scalar_type(); - info.tv = compiled_kernel_->kernel()->outputs()[output_idx]; + auto out_val = compiled_kernel_->kernel()->outputs()[output_idx]; + NVF_ERROR(out_val->isA(), "Output is not a TensorView"); + info.tv = out_val->as(); NVF_ERROR( !info.tv->hasAllocation(), "Accepting allocated outputs is not currently supported with allocation domain. ", @@ -691,7 +693,7 @@ void KernelExecutor::initializeExecutorEntry( for (auto output_idx : c10::irange(outputs.size())) { auto out_info = output_info[output_idx]; auto fusion = compiled_kernel_->kernel()->as(); - auto alias_info = fusion->getOutputAliasInfo(out_info.tv); + auto alias_info = fusion->getOutputAlias(out_info.tv); NVF_ERROR( alias_info.type != AllocationType::Evaluate, "Outputs should not be evaluate type for kernels."); @@ -703,7 +705,7 @@ void KernelExecutor::initializeExecutorEntry( std::find( fusion->inputs().begin(), fusion->inputs().end(), aliased_to) - fusion->inputs().begin(); - if (aliased_to_idx < fusion->inputs().size()) { + if (aliased_to_idx < (int64_t)fusion->inputs().size()) { output_aliased_to_input[output_idx] = aliased_to_idx; } else { output_aliased_to_output[output_idx] = @@ -711,7 +713,7 @@ void KernelExecutor::initializeExecutorEntry( fusion->outputs().begin(), fusion->outputs().end(), aliased_to) - fusion->outputs().begin(); NVF_ERROR( - output_aliased_to_output[output_idx] < fusion->outputs().size(), + output_aliased_to_output[output_idx] < (int)fusion->outputs().size(), "Alias found 
but is not an output or input of the fusion. ", "Aliased to tv: ", aliased_to->toString(), @@ -724,10 +726,11 @@ void KernelExecutor::initializeExecutorEntry( auto intermediates = getIntermediateBufferInfo(expr_eval, index_type); - // All information is gathered. Save it to ExecutorEntry + // All information is gathered. Save it to KernelExecutorEntry executor_entry.launch_params = launch_params; executor_entry.outputs = output_info; - executor_entry.output_aliased_to = output_aliased_to; + executor_entry.output_aliased_to_input = output_aliased_to_input; + executor_entry.output_aliased_to_output = output_aliased_to_output; executor_entry.intermediates = intermediates; executor_entry.init = true; } @@ -750,7 +753,7 @@ void KernelExecutor::initializeExecutorEntry( /// @param idx_type_size generally sizeof(int32_t) or sizeof(int64_t); used for /// computing how large the arrays to copy are. static void fillTensorArgMetadata( - KernelExecutor::ExecutorEntry& entry, + KernelExecutorEntry& entry, const PolymorphicValue& tensor_metadata, size_t idx, size_t idx_type_size) { @@ -812,7 +815,7 @@ static void fillTensorArgMetadata( // It does not need to happen when only shapes change---use recomputeArgs for // that. 
void KernelExecutor::computeArgs( - ExecutorEntry& entry, + KernelExecutorEntry& entry, ExpressionEvaluator& expr_eval, const kir::Kernel* kernel) const { FUSER_PERF_SCOPE("KernelExecutor::computeArgs"); @@ -830,7 +833,7 @@ void KernelExecutor::computeArgs( // set the arguments that we'll pass to cuLaunchKernel // TODO: Add to header void KernelExecutor::computeArgs2( - ExecutorEntry& entry, + KernelExecutorEntry& entry, const std::vector& outputs, const std::vector& intermediates) const { FUSER_PERF_SCOPE("KernelExecutor::computeArgs2"); @@ -851,17 +854,17 @@ void KernelExecutor::computeArgs2( } for (size_t inter_idx = 0; inter_idx < intermediates.size(); ++inter_idx) { - entry.args[out_idx + inter_idx] = getKernelArgument( + entry.args[outputs.size() + inter_idx] = getKernelArgument( intermediates[inter_idx], entry.intermediates[inter_idx], idx_type); - entry.arg_ptrs[out_idx + inter_idx] = - entry.args[out_idx + inter_idx].data(); + entry.arg_ptrs[outputs.size() + inter_idx] = + entry.args[outputs.size() + inter_idx].data(); } } // Reset the arguments that we'll pass to cuLaunchKernel. This needs to be // invoked on every shape change. void KernelExecutor::recomputeArgs( - ExecutorEntry& entry, + KernelExecutorEntry& entry, ExpressionEvaluator& expr_eval, const kir::Kernel* kernel) const { FUSER_PERF_SCOPE("KernelExecutor::recomputeArgs"); @@ -1015,9 +1018,9 @@ std::vector KernelExecutor::run( NVF_ERROR(compiled_kernel_->lowered()); // Placeholder for the case where parameter cache is not used - ExecutorEntry temporary_executor_entry; + KernelExecutorEntry temporary_executor_entry; - ExecutorEntry* executor_entry = args.getCacheId().has_value() && + KernelExecutorEntry* executor_entry = args.getCacheId().has_value() && !compiled_kernel_->disablePaarameterCache() ? 
&executor_entry_lookup_[*args.getCacheId()] : &temporary_executor_entry; @@ -1054,8 +1057,8 @@ std::vector KernelExecutor::run( // only allocate outputs when not given if (outputs.empty()) { outputs = allocateKernelOutputs( - compiled_kernel_->kernel()->as(), - executor_entry->outputs, + compiled_kernel_->kernel(), + *executor_entry, compiled_kernel_->device(), args); } @@ -1247,7 +1250,7 @@ std::vector KernelExecutor::run( flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder) const { // See table definition for KernelExecutor in serde/fusion_cache.fbs - using fb_executor_entry = flatbuffers::Offset; + using fb_executor_entry = flatbuffers::Offset; // Separate unordered_map for executor_entry_lookup into key and value // vectors. The key value is the cache_id value in the KernelArgumentHolder. @@ -1328,10 +1331,10 @@ flatbuffers::Offset KernelExecutor::serialize( return ckb.Finish(); } -flatbuffers::Offset KernelExecutor::serialize( +flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder, - const ExecutorEntry& data) const { - // See table definition for ExecutorEntry in serde/fusion_cache.fbs + const KernelExecutorEntry& data) const { + // See table definition for KernelExecutorEntry in serde/fusion_cache.fbs // Serialize GlobalBufferInfo for outputs. 
// We map the output TensorView pointer to its corresponding position in @@ -1377,7 +1380,7 @@ flatbuffers::Offset KernelExecutor::serialize( serialize(builder, buffer, tv_position, false /* is_fusion_output */)); } - return serde::CreateExecutorEntryDirect( + return serde::CreateKernelExecutorEntryDirect( builder, data.init, data.launch_params.serialize(builder), @@ -1469,13 +1472,13 @@ void KernelExecutor::deserialize( } } -KernelExecutor::ExecutorEntry KernelExecutor::deserialize( - const serde::ExecutorEntry* buffer) { - // See table definition for ExecutorEntry in serde/fusion_cache.fbs +KernelExecutorEntry KernelExecutor::deserialize( + const serde::KernelExecutorEntry* buffer) { + // See table definition for KernelExecutorEntry in serde/fusion_cache.fbs - NVF_ERROR(buffer != nullptr, "serde::ExecutorEntry is nullptr."); + NVF_ERROR(buffer != nullptr, "serde::KernelExecutorEntry is nullptr."); - ExecutorEntry entry; + KernelExecutorEntry entry; entry.init = buffer->init(); diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index e0b016423f2..1fecd1c32c8 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -58,6 +58,35 @@ class ExprEvalExecutor : public ExecutorAbstract { std::unique_ptr fusion_; }; +// struct used to hold necessary information to launch compiled kernel on a +// given input set. +// +// TODO: strides would also be important when we handle permutations in +// codegen. +// +struct KernelExecutorEntry { + bool init = false; + LaunchParams launch_params; + std::vector outputs; + // If an output is aliased to an input, this will hold the index of the + // input that it is aliased to. If not aliased, it will hold -1. + std::vector output_aliased_to_input; + std::vector output_aliased_to_output; + // Temporary work buffers and intemediate global-memory tensors + std::vector intermediates; + // The arguments to the kernel. These are configured in computeArgs and + // recomputeArgs. 
+ // For the common case of a tensor argument, these correspond to the + // `struct Tensor` data in runtime/tensor.cu. That means each tensor + // element in `args` would be a sizeof(void*) + len(shape)*sizeof(int) + + // len(shape)*sizeof(int) byte array (here "int" is used in place of the + // index type, which varies in practice). + std::vector> args; + // This is just the data() pointers to the above `args`; cuLaunchKernel + // requires an array of this form. + std::vector arg_ptrs; +}; + class KernelExecutor : public ExecutorAbstract { public: // NVF_API was added for nvfuser_extension. See examples/sinh_extension. @@ -112,35 +141,6 @@ class KernelExecutor : public ExecutorAbstract { executor_entry_lookup_.erase(cache_id); } - // struct used to hold necessary information to launch compiled kernel on a - // given input set. - // - // TODO: strides would also be important when we handle permutations in - // codegen. - // - struct ExecutorEntry { - bool init = false; - LaunchParams launch_params; - std::vector outputs; - // If an output is aliased to an input, this will hold the index of the - // input that it is aliased to. If not aliased, it will hold -1. - std::vector output_aliased_to_input; - std::vector output_aliased_to_output; - // Temporary work buffers and intemediate global-memory tensors - std::vector intermediates; - // The arguments to the kernel. These are configured in computeArgs and - // recomputeArgs. - // For the common case of a tensor argument, these correspond to the - // `struct Tensor` data in runtime/tensor.cu. That means each tensor - // element in `args` would be a sizeof(void*) + len(shape)*sizeof(int) + - // len(shape)*sizeof(int) byte array (here "int" is used in place of the - // index type, which varies in practice). - std::vector> args; - // This is just the data() pointers to the above `args`; cuLaunchKernel - // requires an array of this form. 
- std::vector arg_ptrs; - }; - using ExecutorCompileTimeInfoCache = executor_utils::caching::ExecutorCompileTimeInfoCache; @@ -217,9 +217,9 @@ class KernelExecutor : public ExecutorAbstract { return &compile_time_info_cache_; } - //! TODO: Consider changing this to a constructor of ExecutorEntry + //! TODO: Consider changing this to a constructor of KernelExecutorEntry void initializeExecutorEntry( - ExecutorEntry& executor_entry, + KernelExecutorEntry& executor_entry, const KernelArgumentHolder& args, const LaunchParams& launch_constraints, const CompileParams& compile_params, @@ -230,35 +230,40 @@ class KernelExecutor : public ExecutorAbstract { // Creates the initial set of arguments to a kernel, based on the arguments // to we have now. - void computeArgs(ExecutorEntry&, ExpressionEvaluator&, const kir::Kernel*) - const; + void computeArgs( + KernelExecutorEntry&, + ExpressionEvaluator&, + const kir::Kernel*) const; void computeArgs2( - ExecutorEntry& entry, - const std::vector& outputs) const; + KernelExecutorEntry& entry, + const std::vector& outputs, + const std::vector& intermediates) const; // Updates an existing set of arguments based on the current arguments. It is // is an error to call this before `computeArgs` has been invoked. // recomputeArgs will fail if the arity of the function changes, or the rank // of any tensor changes (as these are compiled-in to the generated kernel // and therefore would require us to do a larger recompilation). - void recomputeArgs(ExecutorEntry&, ExpressionEvaluator&, const kir::Kernel*) - const; + void recomputeArgs( + KernelExecutorEntry&, + ExpressionEvaluator&, + const kir::Kernel*) const; //! Serialize CompiledKernel using flatbuffers flatbuffers::Offset serialize( flatbuffers::FlatBufferBuilder& builder, const executor_utils::CudaExecutable* kernel) const; - // ExecutorEntry is an internal POD struct for the KernelExecutor class. 
- // We define ExecutorEntry's serialize and deserialize as private methods in - // KernelExecutor. - flatbuffers::Offset serialize( + // KernelExecutorEntry is an internal POD struct for the KernelExecutor class. + // We define KernelExecutorEntry's serialize and deserialize as private + // methods in KernelExecutor. + flatbuffers::Offset serialize( flatbuffers::FlatBufferBuilder& builder, - const ExecutorEntry& data) const; + const KernelExecutorEntry& data) const; - //! Deserialize ExecutorEntry using flatbuffers - ExecutorEntry deserialize(const serde::ExecutorEntry* buffer); + //! Deserialize KernelExecutorEntry using flatbuffers + KernelExecutorEntry deserialize(const serde::KernelExecutorEntry* buffer); // GlobalBufferInfo is an internal POD struct for the KernelExecutor class. // We define GlobalBufferInfo's serialize and deserialize as private methods @@ -307,7 +312,7 @@ class KernelExecutor : public ExecutorAbstract { // lookup table to take short cut to retrieve recorded information in order to // launch kernels without re-inference parameters. - std::unordered_map executor_entry_lookup_; + std::unordered_map executor_entry_lookup_; // Compile time information caching. This is used for shape inference // support. The cache stores graph information that are available diff --git a/csrc/runtime/executor_kernel_arg.cpp b/csrc/runtime/executor_kernel_arg.cpp index 9197cd63771..74819f0fea9 100644 --- a/csrc/runtime/executor_kernel_arg.cpp +++ b/csrc/runtime/executor_kernel_arg.cpp @@ -8,6 +8,7 @@ #include // Extract size and strides +#include #include #include diff --git a/csrc/runtime/executor_kernel_arg.h b/csrc/runtime/executor_kernel_arg.h index a583104f66c..f2026ebe2e6 100644 --- a/csrc/runtime/executor_kernel_arg.h +++ b/csrc/runtime/executor_kernel_arg.h @@ -22,6 +22,8 @@ namespace nvfuser { +class GlobalBufferInfo; + //! KernelArgumentHolder copies meta information from kernel inputs, including //! 
tensor sizes/shapes/dtype/memory_ptr and copies scalar inputs. It is used //! for both compilation as well as kernel execution. It takes ownership of diff --git a/csrc/serde/fusion_cache.fbs b/csrc/serde/fusion_cache.fbs index 30a75729147..0d7cdd97d99 100644 --- a/csrc/serde/fusion_cache.fbs +++ b/csrc/serde/fusion_cache.fbs @@ -189,7 +189,7 @@ table KernelArgumentHolder { // // ===================================================================================== -// Tables for LaunchParams, GlobalBufferInfo, ExecutorEntry, and TensorShape used in KernelExecutor +// Tables for LaunchParams, GlobalBufferInfo, KernelExecutorEntry, and TensorShape used in KernelExecutor // Data representing a tensor shape used in LaunchParam table TensorShape { @@ -226,8 +226,8 @@ table GlobalBufferInfo { is_fusion_output : bool; } -// This table describes the cached ExecutorEntry for a kernel. -table ExecutorEntry { +// This table describes the cached KernelExecutorEntry for a kernel. +table KernelExecutorEntry { init : bool; launch_params : LaunchParams; outputs : [GlobalBufferInfo]; @@ -370,7 +370,7 @@ table KernelExecutor { group_id: long; kernel_code: string; executor_entry_lookup_keys: [ulong]; - executor_entry_lookup_values: [ExecutorEntry]; + executor_entry_lookup_values: [KernelExecutorEntry]; // Is this kernel being compiled with int32 or int64 indexing? 
index_type : long; compiled_kernel: CudaKernel; diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp index fd7d4a60df2..e5dd17f4aca 100644 --- a/csrc/tensor_metadata.cpp +++ b/csrc/tensor_metadata.cpp @@ -356,7 +356,8 @@ std::vector GetMetaData::evaluate( } else { metadata->logical_size = input.sizes(); } - metadata->logical_stride_data = input.strides(); + metadata->logical_stride_data = + std::vector(input.strides().begin(), input.strides().end()); metadata->logical_stride = c10::makeArrayRef(metadata->logical_stride_data); auto [allocation_sizes, allocation_strides] = From 66777eb2d4e28c9a0bc0ce3403cdc3b6b6ace5e8 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 24 Feb 2025 09:11:44 -0800 Subject: [PATCH 05/35] Getting closer, just an IMA now. --- csrc/runtime/allocations.cpp | 10 ++++------ csrc/runtime/executor.cpp | 13 ++++++++++--- csrc/runtime/executor_kernel_arg.cpp | 9 +++++---- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 89aed454a4f..3f242bfb63d 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -380,12 +380,7 @@ std::vector allocateKernelOutputs( const KernelArgumentHolder& args) { FUSER_PERF_SCOPE("fusion_executor::allocations::allocateOutputs"); - NVF_ERROR( - std::any_of( - entry.output_aliased_to_output.begin(), - entry.output_aliased_to_output.end(), - [](int idx) { return idx != -1; }), - "Kernel's don't support output to output aliasing."); + // TODO: Figure out if output to output aliasing is needed std::vector out_tensors; out_tensors.reserve(entry.outputs.size()); @@ -404,6 +399,9 @@ std::vector allocateKernelOutputs( } out_tensors.emplace_back(alloc_tensor); } else { + NVF_ERROR( + entry.output_aliased_to_input.at(out_idx) <= (int64_t)args.size(), + "Tried to grab an out of range input argument."); auto input_arg = args[entry.output_aliased_to_input.at(out_idx)]; NVF_ERROR( input_arg.is(), diff --git 
a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 61ad8992bc2..541f0fa30ab 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -661,6 +661,7 @@ void KernelExecutor::initializeExecutorEntry( std::vector output_info; if (outputs.empty()) { + std::cout << compiled_kernel_->kernel()->outputs() << std::endl; output_info = getBufferInfos( expr_eval, index_type, compiled_kernel_->kernel()->outputs()); } else { @@ -687,10 +688,10 @@ void KernelExecutor::initializeExecutorEntry( } } - std::vector output_aliased_to_input(outputs.size(), -1); - std::vector output_aliased_to_output(outputs.size(), -1); + std::vector output_aliased_to_input(output_info.size(), -1); + std::vector output_aliased_to_output(output_info.size(), -1); - for (auto output_idx : c10::irange(outputs.size())) { + for (auto output_idx : c10::irange(output_info.size())) { auto out_info = output_info[output_idx]; auto fusion = compiled_kernel_->kernel()->as(); auto alias_info = fusion->getOutputAlias(out_info.tv); @@ -721,6 +722,12 @@ void KernelExecutor::initializeExecutorEntry( fusion->inputs(), "\nFusion Outputs:\n ", fusion->outputs()); + TORCH_WARN( + "Kernel found with output to output aliasing, this is unsupported in a kernel and will beignored.\n", + "Output: ", + out_info.tv->toString(), + "\nAliased to: ", + aliased_to->toString()); } } diff --git a/csrc/runtime/executor_kernel_arg.cpp b/csrc/runtime/executor_kernel_arg.cpp index 74819f0fea9..5d50eab4dbe 100644 --- a/csrc/runtime/executor_kernel_arg.cpp +++ b/csrc/runtime/executor_kernel_arg.cpp @@ -295,11 +295,11 @@ std::vector polymorphicValueToBytes( } else if (argument.is()) { // FUSER_PERF_SCOPE("polymorphicValueToBytes(StructHandle)"); std::vector buffer; - const auto& dtype_ = std::get(dtype.type); - auto& data = argument->*&TensorMetaData::data; - auto& logical_size = argument->*&TensorMetaData::logical_size; - auto& alloc_stride = argument->*&TensorMetaData::alloc_stride; if (argument.as().is()) { + auto& 
data = argument->*&TensorMetaData::data; + auto& logical_size = argument->*&TensorMetaData::logical_size; + auto& alloc_stride = argument->*&TensorMetaData::alloc_stride; + // special handle for TensorMetaData so that CPU overhead is minimal. if (index_type == PrimDataType::Int) { buffer.reserve( @@ -340,6 +340,7 @@ std::vector polymorphicValueToBytes( } return buffer; } else { + const auto& dtype_ = std::get(dtype.type); for (const auto& field : dtype_.fields) { if (!field.used_in_kernel) { continue; From 0a30236da9c021b212c47b875b76cc1816b536dc Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 24 Feb 2025 17:14:51 -0800 Subject: [PATCH 06/35] Fix basic tests. --- csrc/runtime/executor.cpp | 43 +++++----- csrc/runtime/executor.h | 3 +- csrc/runtime/executor_kernel_arg.cpp | 120 +++++++++++++++------------ 3 files changed, 92 insertions(+), 74 deletions(-) diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 541f0fa30ab..ef99a21319f 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -661,7 +661,6 @@ void KernelExecutor::initializeExecutorEntry( std::vector output_info; if (outputs.empty()) { - std::cout << compiled_kernel_->kernel()->outputs() << std::endl; output_info = getBufferInfos( expr_eval, index_type, compiled_kernel_->kernel()->outputs()); } else { @@ -841,30 +840,32 @@ void KernelExecutor::computeArgs( // TODO: Add to header void KernelExecutor::computeArgs2( KernelExecutorEntry& entry, - const std::vector& outputs, - const std::vector& intermediates) const { + const KernelArgumentHolder& args) const { FUSER_PERF_SCOPE("KernelExecutor::computeArgs2"); + if (entry.args.size() != args.size()) { + entry.args.resize(args.size()); + entry.arg_ptrs.resize(args.size()); + } - entry.args.resize(outputs.size() + intermediates.size()); - entry.arg_ptrs.resize(outputs.size() + intermediates.size()); - - NVF_ERROR(entry.outputs.size() == outputs.size(), "Outputs size mismatch"); NVF_ERROR( - 
entry.intermediates.size() == intermediates.size(), - "Intermediates size mismatch"); + args.size() == + compiled_kernel_->kernel()->inputs().size() + + entry.outputs.size() + entry.intermediates.size() && + args.size() == compiled_kernel_->kernel()->parameters().size(), + "Argument size mismatch, expected: ", + compiled_kernel_->kernel()->inputs().size() + entry.outputs.size() + + entry.intermediates.size(), + " got: ", + args.size()); const PrimDataType idx_type = compiled_kernel_->kernel()->indexType(); - for (size_t out_idx = 0; out_idx < outputs.size(); ++out_idx) { - entry.args[out_idx] = - getKernelArgument(outputs[out_idx], entry.outputs[out_idx], idx_type); - entry.arg_ptrs[out_idx] = entry.args[out_idx].data(); - } - - for (size_t inter_idx = 0; inter_idx < intermediates.size(); ++inter_idx) { - entry.args[outputs.size() + inter_idx] = getKernelArgument( - intermediates[inter_idx], entry.intermediates[inter_idx], idx_type); - entry.arg_ptrs[outputs.size() + inter_idx] = - entry.args[outputs.size() + inter_idx].data(); + for (size_t i = 0; i < args.size(); ++i) { + auto bytes = polymorphicValueToBytes( + args[i], + compiled_kernel_->kernel()->parameters()[i]->dtype(), + idx_type); + entry.args[i] = bytes; + entry.arg_ptrs[i] = entry.args[i].data(); } } @@ -1152,7 +1153,7 @@ std::vector KernelExecutor::run( } } - computeArgs2(*executor_entry, outputs, intermediates); + computeArgs2(*executor_entry, args); if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { launch_params_.print(); diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index 1fecd1c32c8..faa6d67fc53 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -237,8 +237,7 @@ class KernelExecutor : public ExecutorAbstract { void computeArgs2( KernelExecutorEntry& entry, - const std::vector& outputs, - const std::vector& intermediates) const; + const KernelArgumentHolder& args) const; // Updates an existing set of arguments based on the current arguments. 
It is // is an error to call this before `computeArgs` has been invoked. diff --git a/csrc/runtime/executor_kernel_arg.cpp b/csrc/runtime/executor_kernel_arg.cpp index 5d50eab4dbe..ee4ebf8091f 100644 --- a/csrc/runtime/executor_kernel_arg.cpp +++ b/csrc/runtime/executor_kernel_arg.cpp @@ -180,25 +180,75 @@ std::vector polymorphicValueToBytes( if (argument.is()) { // FUSER_PERF_SCOPE("polymorphicValueToBytes(at::Tensor)"); const auto& tensor = argument.as(); - NVF_ERROR( - tensor.is_cpu() && tensor.numel() == 1, - "Only CPU scalar tensors are supported here. ", - "For GPU tensors, please use their metadata."); - auto scalar_type = tensor.scalar_type(); - NVF_ERROR( - dtype == aten_to_data_type(scalar_type), - "Expected ", - dtype, - " but got ", - aten_to_data_type(scalar_type), - "."); - std::vector buffer; - buffer.reserve(tensor.element_size()); - buffer.insert( - buffer.end(), - (std::byte*)tensor.data_ptr(), - (std::byte*)tensor.data_ptr() + tensor.element_size()); - return buffer; + if (tensor.is_cpu()) { + NVF_ERROR( + tensor.numel() == 1, + "Only CPU scalar tensors are supported here. ", + "For GPU tensors, please use their metadata."); + auto scalar_type = tensor.scalar_type(); + NVF_ERROR( + dtype == aten_to_data_type(scalar_type), + "Expected ", + dtype, + " but got ", + aten_to_data_type(scalar_type), + "."); + std::vector buffer; + buffer.reserve(tensor.element_size()); + buffer.insert( + buffer.end(), + (std::byte*)tensor.data_ptr(), + (std::byte*)tensor.data_ptr() + tensor.element_size()); + return buffer; + } else { + NVF_ERROR( + tensor.is_cuda(), "Only accepts CUDA tensors or CPU scalar tensors."); + + std::vector buffer; + auto data = tensor.data_ptr(); + auto logical_size = tensor.sizes(); + auto alloc_stride = tensor.strides(); + + // special handle for TensorMetaData so that CPU overhead is minimal. 
+ if (index_type == PrimDataType::Int) { + buffer.reserve( + sizeof(void*) + sizeof(int64_t) * logical_size.size() + + sizeof(int64_t) * alloc_stride.size()); + buffer.insert( + buffer.end(), (std::byte*)data, (std::byte*)data + sizeof(void*)); + buffer.insert( + buffer.end(), + (std::byte*)logical_size.data(), + (std::byte*)logical_size.data() + + sizeof(int64_t) * logical_size.size()); + buffer.insert( + buffer.end(), + (std::byte*)alloc_stride.data(), + (std::byte*)alloc_stride.data() + + sizeof(int64_t) * alloc_stride.size()); + } else { + buffer.reserve( + sizeof(void*) + sizeof(int32_t) * logical_size.size() + + sizeof(int32_t) * alloc_stride.size()); + buffer.insert( + buffer.end(), (std::byte*)&data, (std::byte*)&data + sizeof(void*)); + std::vector logical_size32( + logical_size.begin(), logical_size.end()); + buffer.insert( + buffer.end(), + (std::byte*)logical_size32.data(), + (std::byte*)logical_size32.data() + + sizeof(int32_t) * logical_size32.size()); + std::vector alloc_stride32( + alloc_stride.begin(), alloc_stride.end()); + buffer.insert( + buffer.end(), + (std::byte*)alloc_stride32.data(), + (std::byte*)alloc_stride32.data() + + sizeof(int32_t) * alloc_stride32.size()); + } + return buffer; + } } else if (argument.is()) { // FUSER_PERF_SCOPE("polymorphicValueToBytes(Pointer)"); NVF_ERROR( @@ -379,38 +429,6 @@ std::vector getKernelArgument( return polymorphicValueToBytes(pv, parameter->dtype(), index_type); } -std::vector getKernelArgument( - at::Tensor tensor, - const GlobalBufferInfo& output_info, - PrimDataType index_type) { - FUSER_PERF_SCOPE("getKernelArgument"); - - NVF_ERROR( - tensor.is_cuda() || tensor.is_meta(), - "GetMetaData expects a CUDA/meta tensor as input, but got: ", - tensor); - - std::shared_ptr struct_ = std::make_shared(); - TensorMetaData* metadata = (TensorMetaData*)struct_.get(); - metadata->dtype = - std::get(aten_to_data_type(tensor.scalar_type()).type); - metadata->data = tensor.data_ptr(); - - 
metadata->logical_size_data = output_info.shape_info.logical_sizes; - metadata->logical_size = c10::makeArrayRef(metadata->logical_size_data); - metadata->logical_stride_data = output_info.shape_info.logical_strides; - metadata->logical_stride = c10::makeArrayRef(metadata->logical_stride_data); - metadata->alloc_size_data = output_info.shape_info.allocation_sizes; - metadata->alloc_size = c10::makeArrayRef(metadata->alloc_size_data); - metadata->alloc_stride_data = output_info.shape_info.allocation_strides; - metadata->alloc_stride = c10::makeArrayRef(metadata->alloc_stride_data); - - return polymorphicValueToBytes( - PolymorphicValue(std::move(struct_)), - output_info.tv->dtype(), - index_type); -} - int64_t computeBytes(const KernelArgumentHolder& args) { int64_t num_bytes = 0; // Figure how many bytes are inputs, outputs, and temporary buffers From 93f7ebd79c136d0589e34de2183eda456d4a9593 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 24 Feb 2025 18:00:33 -0800 Subject: [PATCH 07/35] Still debugging. 
--- csrc/runtime/executor.cpp | 146 +++++++++++++++++++++++++-- csrc/runtime/executor.h | 1 + csrc/runtime/executor_kernel_arg.cpp | 88 ++++------------ 3 files changed, 156 insertions(+), 79 deletions(-) diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index ef99a21319f..0d9a1815001 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -658,6 +658,46 @@ void KernelExecutor::initializeExecutorEntry( "Expected blockDim.x >= 32 but found ", launch_params.bdimx()); + std::vector input_info; + NVF_ERROR( + compiled_kernel_->kernel()->inputs().size() == args.size(), + "Input size mismatch, expected: ", + compiled_kernel_->kernel()->inputs().size(), + " got: ", + args.size()); + for (auto inp_idx : + c10::irange(compiled_kernel_->kernel()->inputs().size())) { + auto input = compiled_kernel_->kernel()->inputs()[inp_idx]; + if (input->isA()) { + if (input->as()->hasAllocation()) { + input_info.emplace_back(getBufferInfos( + expr_eval, index_type, {input->as()})[0]); + } else { + TensorShapeInfo shape_info; + shape_info.logical_sizes = args[inp_idx].as().sizes().vec(); + shape_info.logical_strides = + args[inp_idx].as().strides().vec(); + shape_info.allocation_sizes = + args[inp_idx].as().sizes().vec(); + shape_info.allocation_strides = + args[inp_idx].as().strides().vec(); + input_info.emplace_back(GlobalBufferInfo( + input->as(), + shape_info, + data_type_to_aten(input->dtype()), + false, + false, + false)); + } + } else { + input_info.emplace_back(GlobalBufferInfo()); + } + std::cout << "input_info.back().shape_info.logical_sizes: " + << input_info.back().shape_info.logical_sizes << std::endl; + std::cout << "input_info.back().shape_info.allocation_strides: " + << input_info.back().shape_info.allocation_strides << std::endl; + } + std::vector output_info; if (outputs.empty()) { @@ -738,6 +778,7 @@ void KernelExecutor::initializeExecutorEntry( executor_entry.output_aliased_to_input = output_aliased_to_input; 
executor_entry.output_aliased_to_output = output_aliased_to_output; executor_entry.intermediates = intermediates; + executor_entry.inputs = input_info; executor_entry.init = true; } @@ -841,6 +882,8 @@ void KernelExecutor::computeArgs( void KernelExecutor::computeArgs2( KernelExecutorEntry& entry, const KernelArgumentHolder& args) const { + std::cout << "Args: " << args.toString() << std::endl; + FUSER_PERF_SCOPE("KernelExecutor::computeArgs2"); if (entry.args.size() != args.size()) { entry.args.resize(args.size()); @@ -849,23 +892,106 @@ void KernelExecutor::computeArgs2( NVF_ERROR( args.size() == - compiled_kernel_->kernel()->inputs().size() + - entry.outputs.size() + entry.intermediates.size() && - args.size() == compiled_kernel_->kernel()->parameters().size(), + compiled_kernel_->kernel()->inputs().size() + entry.outputs.size() + + entry.intermediates.size(), "Argument size mismatch, expected: ", compiled_kernel_->kernel()->inputs().size() + entry.outputs.size() + entry.intermediates.size(), " got: ", args.size()); + NVF_ERROR( + args.size() == compiled_kernel_->kernel()->parameters().size(), + "Argument size mismatch, expected: ", + compiled_kernel_->kernel()->parameters().size(), + " got: ", + args.size()); + + auto buffer_info = [&](size_t idx) -> GlobalBufferInfo& { + if (idx < entry.inputs.size()) { + return entry.inputs[idx]; + } else if (idx < entry.inputs.size() + entry.outputs.size()) { + return entry.outputs[idx - entry.inputs.size()]; + } else if ( + idx < entry.inputs.size() + entry.outputs.size() + + entry.intermediates.size()) { + return entry + .intermediates[idx - entry.inputs.size() - entry.outputs.size()]; + } else { + NVF_CHECK( + 0, + "Invalid buffer index: ", + idx, + " input size: ", + entry.inputs.size(), + " output size: ", + entry.outputs.size(), + " intermediate size: ", + entry.intermediates.size()); + } + }; + const PrimDataType idx_type = compiled_kernel_->kernel()->indexType(); - for (size_t i = 0; i < args.size(); ++i) { - auto 
bytes = polymorphicValueToBytes( - args[i], - compiled_kernel_->kernel()->parameters()[i]->dtype(), - idx_type); - entry.args[i] = bytes; - entry.arg_ptrs[i] = entry.args[i].data(); + for (size_t arg_idx = 0; arg_idx < args.size(); ++arg_idx) { + std::vector bytes; + if (args[arg_idx].is()) { + auto tensor = args[arg_idx].as(); + NVF_ERROR( + tensor.is_cuda(), "Only accepts CUDA tensors or CPU scalar tensors."); + + auto data = tensor.data_ptr(); + const auto& logical_size = buffer_info(arg_idx).shape_info.logical_sizes; + const auto& alloc_stride = + buffer_info(arg_idx).shape_info.allocation_strides; + + // special handle for TensorMetaData so that CPU overhead is minimal. + if (idx_type == PrimDataType::Int) { + bytes.reserve( + sizeof(void*) + sizeof(int64_t) * logical_size.size() + + sizeof(int64_t) * alloc_stride.size()); + bytes.insert( + bytes.end(), (std::byte*)data, (std::byte*)data + sizeof(void*)); + bytes.insert( + bytes.end(), + (std::byte*)logical_size.data(), + (std::byte*)logical_size.data() + + sizeof(int64_t) * logical_size.size()); + bytes.insert( + bytes.end(), + (std::byte*)alloc_stride.data(), + (std::byte*)alloc_stride.data() + + sizeof(int64_t) * alloc_stride.size()); + } else { + bytes.reserve( + sizeof(void*) + sizeof(int32_t) * logical_size.size() + + sizeof(int32_t) * alloc_stride.size()); + bytes.insert( + bytes.end(), (std::byte*)&data, (std::byte*)&data + sizeof(void*)); + std::vector logical_size32( + logical_size.begin(), logical_size.end()); + bytes.insert( + bytes.end(), + (std::byte*)logical_size32.data(), + (std::byte*)logical_size32.data() + + sizeof(int32_t) * logical_size32.size()); + std::vector alloc_stride32( + alloc_stride.begin(), alloc_stride.end()); + bytes.insert( + bytes.end(), + (std::byte*)alloc_stride32.data(), + (std::byte*)alloc_stride32.data() + + sizeof(int32_t) * alloc_stride32.size()); + } + entry.args[arg_idx] = bytes; + entry.arg_ptrs[arg_idx] = entry.args[arg_idx].data(); + } else { + auto bytes = 
polymorphicValueToBytes( + args[arg_idx], + compiled_kernel_->kernel()->parameters()[arg_idx]->dtype(), + idx_type); + entry.args[arg_idx] = bytes; + entry.arg_ptrs[arg_idx] = entry.args[arg_idx].data(); + } } } diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index faa6d67fc53..70f372312d6 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -74,6 +74,7 @@ struct KernelExecutorEntry { std::vector output_aliased_to_output; // Temporary work buffers and intemediate global-memory tensors std::vector intermediates; + std::vector inputs; // The arguments to the kernel. These are configured in computeArgs and // recomputeArgs. // For the common case of a tensor argument, these correspond to the diff --git a/csrc/runtime/executor_kernel_arg.cpp b/csrc/runtime/executor_kernel_arg.cpp index ee4ebf8091f..0392ae65b9a 100644 --- a/csrc/runtime/executor_kernel_arg.cpp +++ b/csrc/runtime/executor_kernel_arg.cpp @@ -180,75 +180,25 @@ std::vector polymorphicValueToBytes( if (argument.is()) { // FUSER_PERF_SCOPE("polymorphicValueToBytes(at::Tensor)"); const auto& tensor = argument.as(); - if (tensor.is_cpu()) { - NVF_ERROR( - tensor.numel() == 1, - "Only CPU scalar tensors are supported here. ", - "For GPU tensors, please use their metadata."); - auto scalar_type = tensor.scalar_type(); - NVF_ERROR( - dtype == aten_to_data_type(scalar_type), - "Expected ", - dtype, - " but got ", - aten_to_data_type(scalar_type), - "."); - std::vector buffer; - buffer.reserve(tensor.element_size()); - buffer.insert( - buffer.end(), - (std::byte*)tensor.data_ptr(), - (std::byte*)tensor.data_ptr() + tensor.element_size()); - return buffer; - } else { - NVF_ERROR( - tensor.is_cuda(), "Only accepts CUDA tensors or CPU scalar tensors."); - - std::vector buffer; - auto data = tensor.data_ptr(); - auto logical_size = tensor.sizes(); - auto alloc_stride = tensor.strides(); - - // special handle for TensorMetaData so that CPU overhead is minimal. 
- if (index_type == PrimDataType::Int) { - buffer.reserve( - sizeof(void*) + sizeof(int64_t) * logical_size.size() + - sizeof(int64_t) * alloc_stride.size()); - buffer.insert( - buffer.end(), (std::byte*)data, (std::byte*)data + sizeof(void*)); - buffer.insert( - buffer.end(), - (std::byte*)logical_size.data(), - (std::byte*)logical_size.data() + - sizeof(int64_t) * logical_size.size()); - buffer.insert( - buffer.end(), - (std::byte*)alloc_stride.data(), - (std::byte*)alloc_stride.data() + - sizeof(int64_t) * alloc_stride.size()); - } else { - buffer.reserve( - sizeof(void*) + sizeof(int32_t) * logical_size.size() + - sizeof(int32_t) * alloc_stride.size()); - buffer.insert( - buffer.end(), (std::byte*)&data, (std::byte*)&data + sizeof(void*)); - std::vector logical_size32( - logical_size.begin(), logical_size.end()); - buffer.insert( - buffer.end(), - (std::byte*)logical_size32.data(), - (std::byte*)logical_size32.data() + - sizeof(int32_t) * logical_size32.size()); - std::vector alloc_stride32( - alloc_stride.begin(), alloc_stride.end()); - buffer.insert( - buffer.end(), - (std::byte*)alloc_stride32.data(), - (std::byte*)alloc_stride32.data() + - sizeof(int32_t) * alloc_stride32.size()); - } - return buffer; - } + NVF_ERROR( + tensor.is_cpu() && tensor.numel() == 1, + "Only CPU scalar tensors are supported here. 
", + "For GPU tensors, please use their metadata."); + auto scalar_type = tensor.scalar_type(); + NVF_ERROR( + dtype == aten_to_data_type(scalar_type), + "Expected ", + dtype, + " but got ", + aten_to_data_type(scalar_type), + "."); + std::vector buffer; + buffer.reserve(tensor.element_size()); + buffer.insert( + buffer.end(), + (std::byte*)tensor.data_ptr(), + (std::byte*)tensor.data_ptr() + tensor.element_size()); + return buffer; } else if (argument.is()) { // FUSER_PERF_SCOPE("polymorphicValueToBytes(Pointer)"); NVF_ERROR( From ddd62d3ed3f094f6977cd764f296a883a093f0fb Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Tue, 25 Feb 2025 11:50:05 -0800 Subject: [PATCH 08/35] Fix TMA Support. --- csrc/runtime/executor.cpp | 132 +++++++++++++++++++++++--------------- csrc/runtime/executor.h | 4 ++ 2 files changed, 84 insertions(+), 52 deletions(-) diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 0d9a1815001..99cf5ffbc73 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -689,13 +689,7 @@ void KernelExecutor::initializeExecutorEntry( false, false)); } - } else { - input_info.emplace_back(GlobalBufferInfo()); } - std::cout << "input_info.back().shape_info.logical_sizes: " - << input_info.back().shape_info.logical_sizes << std::endl; - std::cout << "input_info.back().shape_info.allocation_strides: " - << input_info.back().shape_info.allocation_strides << std::endl; } std::vector output_info; @@ -877,29 +871,43 @@ void KernelExecutor::computeArgs( } } -// set the arguments that we'll pass to cuLaunchKernel -// TODO: Add to header +namespace { +GlobalBufferInfo& linear_buffer_info_getter( + KernelExecutorEntry& entry, + size_t idx) { + if (idx < entry.inputs.size()) { + return entry.inputs[idx]; + } else if (idx < entry.inputs.size() + entry.outputs.size()) { + return entry.outputs[idx - entry.inputs.size()]; + } else if ( + idx < + entry.inputs.size() + entry.outputs.size() + entry.intermediates.size()) { + return 
entry + .intermediates[idx - entry.inputs.size() - entry.outputs.size()]; + } else { + NVF_CHECK( + 0, + "Invalid buffer index: ", + idx, + " input size: ", + entry.inputs.size(), + " output size: ", + entry.outputs.size(), + " intermediate size: ", + entry.intermediates.size()); + } +}; +} // namespace + void KernelExecutor::computeArgs2( KernelExecutorEntry& entry, const KernelArgumentHolder& args) const { - std::cout << "Args: " << args.toString() << std::endl; - FUSER_PERF_SCOPE("KernelExecutor::computeArgs2"); if (entry.args.size() != args.size()) { entry.args.resize(args.size()); entry.arg_ptrs.resize(args.size()); } - NVF_ERROR( - args.size() == - compiled_kernel_->kernel()->inputs().size() + entry.outputs.size() + - entry.intermediates.size(), - "Argument size mismatch, expected: ", - compiled_kernel_->kernel()->inputs().size() + entry.outputs.size() + - entry.intermediates.size(), - " got: ", - args.size()); - NVF_ERROR( args.size() == compiled_kernel_->kernel()->parameters().size(), "Argument size mismatch, expected: ", @@ -907,31 +915,8 @@ void KernelExecutor::computeArgs2( " got: ", args.size()); - auto buffer_info = [&](size_t idx) -> GlobalBufferInfo& { - if (idx < entry.inputs.size()) { - return entry.inputs[idx]; - } else if (idx < entry.inputs.size() + entry.outputs.size()) { - return entry.outputs[idx - entry.inputs.size()]; - } else if ( - idx < entry.inputs.size() + entry.outputs.size() + - entry.intermediates.size()) { - return entry - .intermediates[idx - entry.inputs.size() - entry.outputs.size()]; - } else { - NVF_CHECK( - 0, - "Invalid buffer index: ", - idx, - " input size: ", - entry.inputs.size(), - " output size: ", - entry.outputs.size(), - " intermediate size: ", - entry.intermediates.size()); - } - }; - const PrimDataType idx_type = compiled_kernel_->kernel()->indexType(); + int64_t buffer_info_idx = 0; for (size_t arg_idx = 0; arg_idx < args.size(); ++arg_idx) { std::vector bytes; if (args[arg_idx].is()) { @@ -940,17 +925,19 @@ void 
KernelExecutor::computeArgs2( tensor.is_cuda(), "Only accepts CUDA tensors or CPU scalar tensors."); auto data = tensor.data_ptr(); - const auto& logical_size = buffer_info(arg_idx).shape_info.logical_sizes; + const auto& logical_size = + linear_buffer_info_getter(entry, buffer_info_idx) + .shape_info.logical_sizes; const auto& alloc_stride = - buffer_info(arg_idx).shape_info.allocation_strides; - + linear_buffer_info_getter(entry, buffer_info_idx) + .shape_info.allocation_strides; + buffer_info_idx++; // special handle for TensorMetaData so that CPU overhead is minimal. if (idx_type == PrimDataType::Int) { bytes.reserve( sizeof(void*) + sizeof(int64_t) * logical_size.size() + sizeof(int64_t) * alloc_stride.size()); - bytes.insert( - bytes.end(), (std::byte*)data, (std::byte*)data + sizeof(void*)); + bytes.insert(bytes.end(), (std::byte*)&data, (std::byte*)(&data + 1)); bytes.insert( bytes.end(), (std::byte*)logical_size.data(), @@ -965,8 +952,7 @@ void KernelExecutor::computeArgs2( bytes.reserve( sizeof(void*) + sizeof(int32_t) * logical_size.size() + sizeof(int32_t) * alloc_stride.size()); - bytes.insert( - bytes.end(), (std::byte*)&data, (std::byte*)&data + sizeof(void*)); + bytes.insert(bytes.end(), (std::byte*)&data, (std::byte*)(&data + 1)); std::vector logical_size32( logical_size.begin(), logical_size.end()); bytes.insert( @@ -1002,7 +988,6 @@ void KernelExecutor::recomputeArgs( ExpressionEvaluator& expr_eval, const kir::Kernel* kernel) const { FUSER_PERF_SCOPE("KernelExecutor::recomputeArgs"); - // assert(entry.init && "entry was never initialized"); const std::vector& params = kernel->parameters(); const PrimDataType idx_type = kernel->indexType(); @@ -1107,6 +1092,38 @@ void KernelExecutor::resetCompiledKernelProperties() { static_smem_size_.reset(); } +KernelArgumentHolder KernelExecutor::resolveTMA( + KernelExecutorEntry& entry, + const KernelArgumentHolder& args) const { + ExpressionEvaluator expr_eval; + int64_t arg_idx = 0; + NVF_ERROR( + 
entry.inputs.size() == compiled_kernel_->kernel()->inputs().size(), + "Input size mismatch"); + for (auto inp_idx : c10::irange(entry.inputs.size())) { + expr_eval.bind( + compiled_kernel_->kernel()->inputs()[inp_idx], args[arg_idx++]); + } + + NVF_ERROR( + entry.outputs.size() == compiled_kernel_->kernel()->outputs().size(), + "Output size mismatch"); + for (auto out_idx : c10::irange(entry.outputs.size())) { + expr_eval.bind( + compiled_kernel_->kernel()->outputs()[out_idx], args[arg_idx++]); + } + + for (auto intermediate_entry : entry.intermediates) { + expr_eval.bind(intermediate_entry.tv, args[arg_idx++]); + } + + KernelArgumentHolder resolved_args; + for (auto param : compiled_kernel_->kernel()->parameters()) { + resolved_args.push(expr_eval.evaluate(param)); + } + return resolved_args; +} + std::vector KernelExecutor::run( KernelArgumentHolder args, std::vector outputs, @@ -1279,6 +1296,17 @@ std::vector KernelExecutor::run( } } + if (args.size() != compiled_kernel_->kernel()->parameters().size()) { + std::vector exprs = compiled_kernel_->kernel()->exprs(); + NVF_ERROR( + std::any_of( + exprs.begin(), + exprs.end(), + [](Expr* e) { return ir_utils::isCpAsyncBulk(e); }), + "Argument mismatch detected in run, but is not resolveable."); + args = resolveTMA(*executor_entry, args); + } + computeArgs2(*executor_entry, args); if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index 70f372312d6..50c97e2eece 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -240,6 +240,10 @@ class KernelExecutor : public ExecutorAbstract { KernelExecutorEntry& entry, const KernelArgumentHolder& args) const; + KernelArgumentHolder resolveTMA( + KernelExecutorEntry& entry, + const KernelArgumentHolder& args) const; + // Updates an existing set of arguments based on the current arguments. It is // is an error to call this before `computeArgs` has been invoked. 
// recomputeArgs will fail if the arity of the function changes, or the rank From f91c252185dee1ffef30f0d49141415c9698c21f Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Tue, 25 Feb 2025 12:36:25 -0800 Subject: [PATCH 09/35] Drop. --- csrc/runtime/allocations.cpp | 40 ++++++++++++++++++++++++++++ csrc/runtime/allocations.h | 6 +++++ csrc/runtime/executor.cpp | 33 +++++++++++++++++++++-- tests/cpp/test_allocation_domain.cpp | 4 +++ 4 files changed, 81 insertions(+), 2 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 3f242bfb63d..407b135004d 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -806,6 +806,46 @@ std::pair, std::vector> inferShapeOfOutput( return {meta_tensor.sizes().vec(), meta_tensor.strides().vec()}; } +std::vector getInputBufferInfos( + ExpressionEvaluator& expr_eval, + DataType index_dtype, + const std::vector& fusion_inputs, + const std::vector& inputs) { + NVF_ERROR( + fusion_inputs.size() == inputs.size(), + "Mismatch in inputs provided, expected ", + fusion_inputs.size(), + " but got ", + inputs.size()); + std::vector buffer_infos; + for (auto i : c10::irange(fusion_inputs.size())) { + GlobalBufferInfo buffer_info; + buffer_info.tv = fusion_inputs[i]->as(); + auto logical_sizes = inputs[i].sizes().vec(); + auto logical_strides = inputs[i].strides().vec(); + TensorShapeInfo shape_info; + shape_info.logical_sizes = logical_sizes; + shape_info.logical_strides = logical_strides; + buffer_info.shape_info = shape_info; + buffer_info.type = inputs[i].scalar_type(); + + // TODO: Handle input allocation domains that aren't permutes + // of the logical domain + if (buffer_info.tv->hasAllocation()) { + auto allocation_size_stride = inferAllocationShape( + buffer_info.tv, expr_eval); + buffer_info.shape_info.allocation_sizes = allocation_size_stride.first; + buffer_info.shape_info.allocation_strides = allocation_size_stride.second; + } else { + 
buffer_info.shape_info.allocation_sizes = logical_sizes; + buffer_info.shape_info.allocation_strides = logical_strides; + } + + buffer_infos.emplace_back(buffer_info); + } + return buffer_infos; + } + TensorShapeInfo inferTensorShapes( TensorView* tv, const ExpressionEvaluator& expr_eval) { diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 999692f936e..0f6e5b4a3f1 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -103,4 +103,10 @@ std::vector getBufferInfos( DataType index_dtype, const std::vector& fusion_outputs); +std::vector getInputBufferInfos( + ExpressionEvaluator& expr_eval, + DataType index_dtype, + const std::vector& fusion_outputs, + const std::vector& inputs); + } // namespace nvfuser diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 99cf5ffbc73..fe00df2f742 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -670,8 +670,21 @@ void KernelExecutor::initializeExecutorEntry( auto input = compiled_kernel_->kernel()->inputs()[inp_idx]; if (input->isA()) { if (input->as()->hasAllocation()) { - input_info.emplace_back(getBufferInfos( - expr_eval, index_type, {input->as()})[0]); + std::cout << "Get buffer info for: " << input->toString() << std::endl; + std::cout << "Arg: " + << PolymorphicValue_functions::toString(args[inp_idx]) + << std::endl; + auto buffer_info = + getInputBufferInfos(expr_eval, index_type, {input->as()})[0]; + std::cout << "Buffer info allocation sizes: " + << buffer_info.shape_info.allocation_sizes << std::endl; + std::cout << "Buffer info allocation strides: " + << buffer_info.shape_info.allocation_strides << std::endl; + std::cout << "Buffer info logical sizes: " + << buffer_info.shape_info.logical_sizes << std::endl; + std::cout << "Buffer info logical strides: " + << buffer_info.shape_info.logical_strides << std::endl; + input_info.emplace_back(buffer_info); } else { TensorShapeInfo shape_info; shape_info.logical_sizes = 
args[inp_idx].as().sizes().vec(); @@ -915,6 +928,19 @@ void KernelExecutor::computeArgs2( " got: ", args.size()); + for (auto inp : compiled_kernel_->kernel()->inputs()) { + if (!inp->isA()) { + continue; + } + std::cout << "Input root: " << inp->as()->getRootDomain() + << std::endl; + std::cout << "Input logical: " << inp->as()->getLogicalDomain() + << std::endl; + std::cout << "Input alloc: " + << inp->as()->getMaybeAllocationDomain() << std::endl; + } + std::cout << "Args: " << args.toString() << std::endl; + const PrimDataType idx_type = compiled_kernel_->kernel()->indexType(); int64_t buffer_info_idx = 0; for (size_t arg_idx = 0; arg_idx < args.size(); ++arg_idx) { @@ -931,6 +957,9 @@ void KernelExecutor::computeArgs2( const auto& alloc_stride = linear_buffer_info_getter(entry, buffer_info_idx) .shape_info.allocation_strides; + std::cout << "Binding Tensor: " << (int64_t)data + << " size: " << logical_size << " stride: " << alloc_stride + << std::endl; buffer_info_idx++; // special handle for TensorMetaData so that CPU overhead is minimal. 
if (idx_type == PrimDataType::Int) { diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp index 849cd140219..2972a883692 100644 --- a/tests/cpp/test_allocation_domain.cpp +++ b/tests/cpp/test_allocation_domain.cpp @@ -1098,7 +1098,11 @@ TEST_F(AllocationDomainTest, ContiguityIssue1021) { fusion->addOutput(tv1); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + // allocate [8,8] strided [8, 1] + // modify to [4,8] strided [1, 8] + // Tell nvFuser it's allocated as [4, 8] strided [8, 1] at::Tensor t0 = at::randn({8, 8}, options).as_strided({4, 8}, {1, 8}); + std::cout << debug_str(t0) << std::endl; FusionExecutorCache executor_cache(std::move(fusion)); auto outputs = executor_cache.runFusionWithInputs({t0}); From cf15f4385027ed0d5e9e439c4539dedcea57ece1 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Tue, 25 Feb 2025 14:25:09 -0800 Subject: [PATCH 10/35] All but one allocation domain test working. --- csrc/runtime/allocations.cpp | 30 ++++++++++++--- csrc/runtime/executor.cpp | 71 ++++++++++++++++++++---------------- 2 files changed, 64 insertions(+), 37 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 407b135004d..8f66ea6889e 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -832,10 +832,30 @@ std::vector getInputBufferInfos( // TODO: Handle input allocation domains that aren't permutes // of the logical domain if (buffer_info.tv->hasAllocation()) { - auto allocation_size_stride = inferAllocationShape( - buffer_info.tv, expr_eval); - buffer_info.shape_info.allocation_sizes = allocation_size_stride.first; - buffer_info.shape_info.allocation_strides = allocation_size_stride.second; + auto logical_domain = + TensorDomain::noReductions(buffer_info.tv->getLogicalDomain()); + auto allocation_domain = TensorDomain::noReductions( + buffer_info.tv->getMaybeAllocationDomain()); + std::unordered_map logical_to_allocation_map; + for 
(int64_t logical_idx : c10::irange(logical_domain.size())) { + auto allocation_id = std::find( + allocation_domain.begin(), + allocation_domain.end(), + logical_domain[logical_idx]); + NVF_ERROR( + allocation_id != allocation_domain.end(), + "Logical domain and allocation domain have different sets of IterDomains, this is not supported yet."); + logical_to_allocation_map[logical_idx] = + std::distance(allocation_domain.begin(), allocation_id); + } + std::vector allocation_sizes(allocation_domain.size()); + std::vector allocation_strides(allocation_domain.size()); + for (auto i : c10::irange(allocation_domain.size())) { + allocation_sizes[i] = logical_sizes[logical_to_allocation_map[i]]; + allocation_strides[i] = logical_strides[logical_to_allocation_map[i]]; + } + buffer_info.shape_info.allocation_sizes = allocation_sizes; + buffer_info.shape_info.allocation_strides = allocation_strides; } else { buffer_info.shape_info.allocation_sizes = logical_sizes; buffer_info.shape_info.allocation_strides = logical_strides; @@ -844,7 +864,7 @@ std::vector getInputBufferInfos( buffer_infos.emplace_back(buffer_info); } return buffer_infos; - } +} TensorShapeInfo inferTensorShapes( TensorView* tv, diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index fe00df2f742..e8f13620084 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -668,40 +668,47 @@ void KernelExecutor::initializeExecutorEntry( for (auto inp_idx : c10::irange(compiled_kernel_->kernel()->inputs().size())) { auto input = compiled_kernel_->kernel()->inputs()[inp_idx]; - if (input->isA()) { - if (input->as()->hasAllocation()) { - std::cout << "Get buffer info for: " << input->toString() << std::endl; - std::cout << "Arg: " - << PolymorphicValue_functions::toString(args[inp_idx]) - << std::endl; - auto buffer_info = - getInputBufferInfos(expr_eval, index_type, {input->as()})[0]; - std::cout << "Buffer info allocation sizes: " - << buffer_info.shape_info.allocation_sizes << 
std::endl; - std::cout << "Buffer info allocation strides: " - << buffer_info.shape_info.allocation_strides << std::endl; - std::cout << "Buffer info logical sizes: " - << buffer_info.shape_info.logical_sizes << std::endl; - std::cout << "Buffer info logical strides: " - << buffer_info.shape_info.logical_strides << std::endl; - input_info.emplace_back(buffer_info); + if (auto input_tv = dynamic_cast(input)) { + std::cout << "Get buffer info for: " << input_tv->toString() << std::endl; + std::cout << "Arg: " + << PolymorphicValue_functions::toString(args[inp_idx]) + << std::endl; + + auto at_tensor = args[inp_idx].as(); + std::vector alloc_sizes; + std::vector alloc_strides; + if (input_tv->hasAllocation()) { + std::tie(alloc_sizes, alloc_strides) = + inferAndValidateAllocationSizesAndStrides( + at_tensor, input_tv, expr_eval); } else { - TensorShapeInfo shape_info; - shape_info.logical_sizes = args[inp_idx].as().sizes().vec(); - shape_info.logical_strides = - args[inp_idx].as().strides().vec(); - shape_info.allocation_sizes = - args[inp_idx].as().sizes().vec(); - shape_info.allocation_strides = - args[inp_idx].as().strides().vec(); - input_info.emplace_back(GlobalBufferInfo( - input->as(), - shape_info, - data_type_to_aten(input->dtype()), - false, - false, - false)); + alloc_sizes = at_tensor.sizes().vec(); + alloc_strides = at_tensor.strides().vec(); } + + TensorShapeInfo shape_info; + shape_info.logical_sizes = args[inp_idx].as().sizes().vec(); + shape_info.logical_strides = + args[inp_idx].as().strides().vec(); + shape_info.allocation_sizes = alloc_sizes; + shape_info.allocation_strides = alloc_strides; + auto buffer_info = GlobalBufferInfo( + input->as(), + shape_info, + data_type_to_aten(input->dtype()), + false, + false, + false); + + std::cout << "Buffer info allocation sizes: " + << buffer_info.shape_info.allocation_sizes << std::endl; + std::cout << "Buffer info allocation strides: " + << buffer_info.shape_info.allocation_strides << std::endl; + std::cout 
<< "Buffer info logical sizes: " + << buffer_info.shape_info.logical_sizes << std::endl; + std::cout << "Buffer info logical strides: " + << buffer_info.shape_info.logical_strides << std::endl; + input_info.emplace_back(buffer_info); } } From eaca8a8f8a607a18535f8f606592ecf4331ff730 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Tue, 25 Feb 2025 14:32:53 -0800 Subject: [PATCH 11/35] All allocation tests working. --- csrc/runtime/executor.cpp | 39 ++++++---------------------- tests/cpp/test_allocation_domain.cpp | 2 +- 2 files changed, 9 insertions(+), 32 deletions(-) diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index e8f13620084..f65fda23d49 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -669,12 +669,8 @@ void KernelExecutor::initializeExecutorEntry( c10::irange(compiled_kernel_->kernel()->inputs().size())) { auto input = compiled_kernel_->kernel()->inputs()[inp_idx]; if (auto input_tv = dynamic_cast(input)) { - std::cout << "Get buffer info for: " << input_tv->toString() << std::endl; - std::cout << "Arg: " - << PolymorphicValue_functions::toString(args[inp_idx]) - << std::endl; - auto at_tensor = args[inp_idx].as(); + std::vector alloc_sizes; std::vector alloc_strides; if (input_tv->hasAllocation()) { @@ -692,23 +688,14 @@ void KernelExecutor::initializeExecutorEntry( args[inp_idx].as().strides().vec(); shape_info.allocation_sizes = alloc_sizes; shape_info.allocation_strides = alloc_strides; - auto buffer_info = GlobalBufferInfo( - input->as(), + + input_info.emplace_back(GlobalBufferInfo( + input_tv, shape_info, - data_type_to_aten(input->dtype()), + data_type_to_aten(input_tv->dtype()), false, false, - false); - - std::cout << "Buffer info allocation sizes: " - << buffer_info.shape_info.allocation_sizes << std::endl; - std::cout << "Buffer info allocation strides: " - << buffer_info.shape_info.allocation_strides << std::endl; - std::cout << "Buffer info logical sizes: " - << 
buffer_info.shape_info.logical_sizes << std::endl; - std::cout << "Buffer info logical strides: " - << buffer_info.shape_info.logical_strides << std::endl; - input_info.emplace_back(buffer_info); + false)); } } @@ -748,9 +735,9 @@ void KernelExecutor::initializeExecutorEntry( auto out_info = output_info[output_idx]; auto fusion = compiled_kernel_->kernel()->as(); auto alias_info = fusion->getOutputAlias(out_info.tv); - NVF_ERROR( + TORCH_WARN_ONCE( alias_info.type != AllocationType::Evaluate, - "Outputs should not be evaluate type for kernels."); + "Outputs should not be evaluate type for kernels, this will be ignored and a kernel will produce the output tensor."); if (alias_info.type == AllocationType::New) { continue; } @@ -939,14 +926,7 @@ void KernelExecutor::computeArgs2( if (!inp->isA()) { continue; } - std::cout << "Input root: " << inp->as()->getRootDomain() - << std::endl; - std::cout << "Input logical: " << inp->as()->getLogicalDomain() - << std::endl; - std::cout << "Input alloc: " - << inp->as()->getMaybeAllocationDomain() << std::endl; } - std::cout << "Args: " << args.toString() << std::endl; const PrimDataType idx_type = compiled_kernel_->kernel()->indexType(); int64_t buffer_info_idx = 0; @@ -964,9 +944,6 @@ void KernelExecutor::computeArgs2( const auto& alloc_stride = linear_buffer_info_getter(entry, buffer_info_idx) .shape_info.allocation_strides; - std::cout << "Binding Tensor: " << (int64_t)data - << " size: " << logical_size << " stride: " << alloc_stride - << std::endl; buffer_info_idx++; // special handle for TensorMetaData so that CPU overhead is minimal. 
if (idx_type == PrimDataType::Int) { diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp index 2972a883692..ead0014bdcc 100644 --- a/tests/cpp/test_allocation_domain.cpp +++ b/tests/cpp/test_allocation_domain.cpp @@ -1102,7 +1102,7 @@ TEST_F(AllocationDomainTest, ContiguityIssue1021) { // modify to [4,8] strided [1, 8] // Tell nvFuser it's allocated as [4, 8] strided [8, 1] at::Tensor t0 = at::randn({8, 8}, options).as_strided({4, 8}, {1, 8}); - std::cout << debug_str(t0) << std::endl; + FusionExecutorCache executor_cache(std::move(fusion)); auto outputs = executor_cache.runFusionWithInputs({t0}); From 651dd68f0f2e07d6f00f85683e3972e83bad795c Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Wed, 26 Feb 2025 21:56:13 +0000 Subject: [PATCH 12/35] Debugging in process. --- csrc/runtime/allocations.cpp | 66 ++++++++++++++++++++- csrc/runtime/executor.cpp | 95 +++++++++++++++++++++++------- csrc/runtime/executor.h | 9 +++ csrc/runtime/executor_kernel_arg.h | 4 ++ csrc/utils.cpp | 8 ++- tests/cpp/test_gpu3.cpp | 10 ++++ 6 files changed, 168 insertions(+), 24 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 8f66ea6889e..54fa2e08e46 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -394,6 +394,8 @@ std::vector allocateKernelOutputs( c10::nullopt, device, c10::nullopt); + std::cout << "Allocating output tensor: " << debug_str(alloc_tensor) + << std::endl; if (shouldFillAllocationWithNan()) { fillTensorWithNan(alloc_tensor); } @@ -403,10 +405,38 @@ std::vector allocateKernelOutputs( entry.output_aliased_to_input.at(out_idx) <= (int64_t)args.size(), "Tried to grab an out of range input argument."); auto input_arg = args[entry.output_aliased_to_input.at(out_idx)]; + ExpressionEvaluator ee; + ee.bind( + fusion->inputs()[entry.output_aliased_to_input.at(out_idx)], + input_arg); + std::cout << "Input arg: " << debug_str(input_arg.as()) + << std::endl; + std::cout 
<< "Aliased output tensor: " + << debug_str(ee.evaluate(out_info.tv).as()) + << std::endl; NVF_ERROR( input_arg.is(), "Aliased input argument is not a tensor."); - out_tensors.emplace_back(input_arg.as()); + if (input_arg.as().sizes() != + out_info.shape_info.logical_sizes || + input_arg.as().strides() != + out_info.shape_info.logical_strides) { + out_tensors.emplace_back(input_arg.as().as_strided( + out_info.shape_info.logical_sizes, + out_info.shape_info.logical_strides)); + } else { + out_tensors.emplace_back(input_arg.as()); + } + std::cout + << "Aliasing T" << out_info.tv->name() << " to T" + << fusion->inputs()[entry.output_aliased_to_input.at(out_idx)]->name() + << std::endl; + std::cout << "Aliased output tensor: " << debug_str(out_tensors.back()) + << std::endl; + std::cout << "Aliased output tensor logical sizes: " + << out_info.shape_info.logical_sizes << std::endl; + std::cout << "Aliased output tensor logical strides: " + << out_info.shape_info.logical_strides << std::endl; } } return out_tensors; @@ -424,10 +454,20 @@ GlobalBufferInfo getBufferInfo( auto dtype = (info.tv->dtype() == DataType::Index ? 
index_dtype : info.tv->dtype()); info.type = data_type_to_aten(dtype); + std::cout << " Getting global buffer info for T" << tv->name() << " " + << tv->getLogicalDomain() << std::endl; + std::cout << "Logical sizes: " << info.shape_info.logical_sizes << std::endl; + std::cout << "Allocation sizes: " << info.shape_info.allocation_sizes + << std::endl; + std::cout << "Logical strides: " << info.shape_info.logical_strides + << std::endl; + std::cout << "Allocation strides: " << info.shape_info.allocation_strides + << std::endl; return info; } } // namespace + std::vector getBufferInfos( ExpressionEvaluator& expr_eval, DataType index_dtype, @@ -869,6 +909,30 @@ std::vector getInputBufferInfos( TensorShapeInfo inferTensorShapes( TensorView* tv, const ExpressionEvaluator& expr_eval) { + + // Alias handling: + auto alias_info = tv->fusion()->getOutputAlias(tv); + if(alias_info.type != AllocationType::New){ + auto tensor = expr_eval.evaluate(tv); + std::pair, std::vector> logical_size_stride = {tensor.sizes().vec(), tensor.strides().vec()}; + if(!tv->hasAllocation()){ + return TensorShapeInfo{ + logical_size_stride.first, + logical_size_stride.second, + logical_size_stride.first, + logical_size_stride.second}; + } + + auto allocation_size_stride = inferAndValidateAllocationSizesAndStrides( + tensor, tv, expr_eval); + return TensorShapeInfo{ + logical_size_stride.first, + logical_size_stride.second, + allocation_size_stride.first, + allocation_size_stride.second}; + } + + // Non-alias handling: auto allocation_size_stride = inferAllocationShape(tv, expr_eval); if (!tv->hasAllocation()) { return TensorShapeInfo{ diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index f65fda23d49..252ff2c2f25 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -282,6 +282,36 @@ void KernelExecutor::compile( if (isProfilerEnabled()) { FusionProfiler::segment(group_id_).stopCompile(); } + + for(auto expr : exprs){ + if (ir_utils::isCpAsyncBulk(expr)) { + 
has_tma_ = true; + } + if (expr->isA()) { + has_rng_ = true; + } + } + + for(auto output : fusion->outputs()){ + if(output->isA()){ + auto out_tv = output->as(); + auto alias_info = fusion->getOutputAlias(out_tv); + if(alias_info.type == AllocationType::New){ + continue; + } + auto aliased_to = alias_info.aliased_io->as(); + auto inputs = InputsOf::output(out_tv); + for(auto input : inputs){ + if(input->isA() && input->sameAs(aliased_to)){ + continue; + } + } + if(val->isConst()){ + continue; + } + has_dynamic_alias_ = true; + } + } } LaunchParams KernelExecutor::computeLaunchParams( @@ -487,7 +517,7 @@ std::vector KernelExecutor::getIntermediateBufferInfo( info.shape_info.allocation_strides = strides; info.shape_info.logical_sizes = sizes; info.shape_info.logical_strides = strides; - auto dtype = (tv->dtype() == DataType::Index ? index_type : tv->dtype()); + auto dtype = tv->dtype() == DataType::Index ? index_type : tv->dtype(); info.type = data_type_to_aten(dtype); // Remember the tensor buffer used for storing kernel profile @@ -688,14 +718,14 @@ void KernelExecutor::initializeExecutorEntry( args[inp_idx].as().strides().vec(); shape_info.allocation_sizes = alloc_sizes; shape_info.allocation_strides = alloc_strides; - - input_info.emplace_back(GlobalBufferInfo( + GlobalBufferInfo info( input_tv, shape_info, data_type_to_aten(input_tv->dtype()), false, false, - false)); + false); + input_info.emplace_back(info); } } @@ -932,10 +962,9 @@ void KernelExecutor::computeArgs2( int64_t buffer_info_idx = 0; for (size_t arg_idx = 0; arg_idx < args.size(); ++arg_idx) { std::vector bytes; - if (args[arg_idx].is()) { + if (args[arg_idx].is() && + args[arg_idx].as().is_cuda()) { auto tensor = args[arg_idx].as(); - NVF_ERROR( - tensor.is_cuda(), "Only accepts CUDA tensors or CPU scalar tensors."); auto data = tensor.data_ptr(); const auto& logical_size = @@ -1105,6 +1134,26 @@ void KernelExecutor::resetCompiledKernelProperties() { static_smem_size_.reset(); } +namespace { 
+KernelArgumentHolder resolveRNGSeed( + const kir::Kernel* kernel, + KernelArgumentHolder& args) { + ExpressionEvaluator expr_eval; + KernelArgumentHolder resolved_args; + resolved_args.reserve(args.size()); + int64_t arg_idx = 0; + for (auto param : kernel->parameters()) { + if (param->definition() && + param->definition()->isA()) { + resolved_args.push(expr_eval.evaluate(param)); + } else { + resolved_args.push(args[arg_idx++]); + } + } + return resolved_args; +} +} // namespace + KernelArgumentHolder KernelExecutor::resolveTMA( KernelExecutorEntry& entry, const KernelArgumentHolder& args) const { @@ -1156,6 +1205,11 @@ std::vector KernelExecutor::run( sprof.startKernel(); } + ExpressionEvaluator expr_eval; + if(has_dynamic_alias_ || has_tma_){ + expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); + } + NVF_ERROR(isCompiled()); NVF_ERROR( outputs.empty() || @@ -1214,10 +1268,6 @@ std::vector KernelExecutor::run( // context manager to disable auto grad for `empty_cuda` calls later at::AutoDispatchBelowADInplaceOrView non_variable_type_mode; - // Bind fusion inputs - // auto expr_eval = executor_utils::bindInputs(args, - // compiled_kernel_->kernel()); - // only allocate outputs when not given if (outputs.empty()) { outputs = allocateKernelOutputs( @@ -1310,14 +1360,20 @@ std::vector KernelExecutor::run( } if (args.size() != compiled_kernel_->kernel()->parameters().size()) { - std::vector exprs = compiled_kernel_->kernel()->exprs(); - NVF_ERROR( - std::any_of( - exprs.begin(), - exprs.end(), - [](Expr* e) { return ir_utils::isCpAsyncBulk(e); }), - "Argument mismatch detected in run, but is not resolveable."); - args = resolveTMA(*executor_entry, args); + NVF_ERROR(has_tma_ || has_rng_, "No TMA or RNG found in the kernel, but detected an argument size mismatch."); + // If args don't match one of two things is happening. We need to add TMA + // related args or RNG related args. Resolve these scenarios. 
+ if (has_tma_) { + // Resolving TMA requires binding all values and evaluating the TMA + // arguments + args = resolveTMA(*executor_entry, args); + } + if (has_rng_) { + // Resolving RNG seed requires evaluating and adding those values, but + // doesn't require binding all values as getting RNG seed and offset + // doesn't depend on other values + args = resolveRNGSeed(compiled_kernel_->kernel(), args); + } } computeArgs2(*executor_entry, args); @@ -1418,7 +1474,6 @@ std::vector KernelExecutor::run( sprof.stopKernel(); sprof.outputBytesAccessed(computeBytes(outputs)); } - return outputs; } diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index 50c97e2eece..6d79300a067 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -314,6 +314,15 @@ class KernelExecutor : public ExecutorAbstract { int64_t warp_size_ = 0; + // Has an RNG kernel and therefore needs to infer RNG state through expression evaluator + bool has_rng_ = false; + + // Has a TMA kernel and therefore needs to infer TMA inputs through expression evaluator + bool has_TMA_ = false; + + // Has a dynamic alias and therefore needs to infer what they are through expression evaluator + bool has_dynamic_alias_ = false; + // lookup table to take short cut to retrieve recorded information in order to // launch kernels without re-inference parameters. 
std::unordered_map executor_entry_lookup_; diff --git a/csrc/runtime/executor_kernel_arg.h b/csrc/runtime/executor_kernel_arg.h index f2026ebe2e6..40d44866165 100644 --- a/csrc/runtime/executor_kernel_arg.h +++ b/csrc/runtime/executor_kernel_arg.h @@ -74,6 +74,10 @@ class NVF_API KernelArgumentHolder { } } + void reserve(size_t size) { + arguments_.reserve(size); + } + void push(const std::vector& tensors); void push(const c10::ArrayRef& args); void push(std::initializer_list args) { diff --git a/csrc/utils.cpp b/csrc/utils.cpp index c24ae356cbb..0358908a6e1 100644 --- a/csrc/utils.cpp +++ b/csrc/utils.cpp @@ -43,13 +43,15 @@ std::string debug_str(const at::Tensor& tensor) { std::stringstream ss; ss << "Tensor:"; ss << " shape: " << tensor.sizes(); - ss << ", dtype: " << tensor.dtype(); - ss << ", device: " << tensor.device(); - ss << ", pointer: " << reinterpret_cast(tensor.data_ptr()); if (!tensor.is_contiguous()) { ss << ", strides: " << tensor.strides(); } + + ss << ", dtype: " << tensor.dtype(); + ss << ", device: " << tensor.device(); + ss << ", pointer: " << reinterpret_cast(tensor.data_ptr()); + return ss.str(); } diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp index 3adf985a898..ac5ed810ec2 100644 --- a/tests/cpp/test_gpu3.cpp +++ b/tests/cpp/test_gpu3.cpp @@ -4302,6 +4302,16 @@ TEST_F(NVFuserTest, FusionIssue2068_CUDA) { FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1, t2, t3, t4}); + std::cout << "Generated outputs:" << std::endl; + for (auto output : cg_outputs) { + std::cout << " " << debug_str(output) << std::endl; + } + + for (auto output : fusion.outputs()) { + std::cout << "T" << output->name() << " " + << output->as()->getLogicalDomain() << std::endl; + } + testValidate( executor_cache.fusion(), cg_outputs, From ce150d97fab5f15e63e41b0cbd36082b806cf7ae Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Wed, 26 Feb 2025 15:12:19 -0800 Subject: [PATCH 13/35] 
Down to 5 failures in test_nvfuser. --- csrc/runtime/allocations.cpp | 28 +++++++++++++--------------- csrc/runtime/executor.cpp | 29 +++++++++++++++-------------- csrc/runtime/executor.h | 9 ++++++--- 3 files changed, 34 insertions(+), 32 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 54fa2e08e46..4688e8a3338 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -410,7 +410,7 @@ std::vector allocateKernelOutputs( fusion->inputs()[entry.output_aliased_to_input.at(out_idx)], input_arg); std::cout << "Input arg: " << debug_str(input_arg.as()) - << std::endl; + << std::endl; std::cout << "Aliased output tensor: " << debug_str(ee.evaluate(out_info.tv).as()) << std::endl; @@ -909,25 +909,23 @@ std::vector getInputBufferInfos( TensorShapeInfo inferTensorShapes( TensorView* tv, const ExpressionEvaluator& expr_eval) { - // Alias handling: auto alias_info = tv->fusion()->getOutputAlias(tv); - if(alias_info.type != AllocationType::New){ - auto tensor = expr_eval.evaluate(tv); - std::pair, std::vector> logical_size_stride = {tensor.sizes().vec(), tensor.strides().vec()}; - if(!tv->hasAllocation()){ + if (alias_info.type != AllocationType::New) { + auto val = expr_eval.evaluate(tv); + auto tensor = val.as(); + if (!tv->hasAllocation()) { return TensorShapeInfo{ - logical_size_stride.first, - logical_size_stride.second, - logical_size_stride.first, - logical_size_stride.second}; + tensor.sizes().vec(), + tensor.strides().vec(), + tensor.sizes().vec(), + tensor.strides().vec()}; } - - auto allocation_size_stride = inferAndValidateAllocationSizesAndStrides( - tensor, tv, expr_eval); + auto allocation_size_stride = + inferAndValidateAllocationSizesAndStrides(tensor, tv, expr_eval); return TensorShapeInfo{ - logical_size_stride.first, - logical_size_stride.second, + tensor.sizes().vec(), + tensor.strides().vec(), allocation_size_stride.first, allocation_size_stride.second}; } diff --git 
a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 252ff2c2f25..8bfd8cb7ef1 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -283,30 +283,30 @@ void KernelExecutor::compile( FusionProfiler::segment(group_id_).stopCompile(); } - for(auto expr : exprs){ + for (auto expr : exprs) { if (ir_utils::isCpAsyncBulk(expr)) { - has_tma_ = true; + has_TMA_ = true; } if (expr->isA()) { has_rng_ = true; } } - for(auto output : fusion->outputs()){ - if(output->isA()){ + for (auto output : fusion->outputs()) { + if (output->isA()) { auto out_tv = output->as(); auto alias_info = fusion->getOutputAlias(out_tv); - if(alias_info.type == AllocationType::New){ + if (alias_info.type == AllocationType::New) { continue; } auto aliased_to = alias_info.aliased_io->as(); auto inputs = InputsOf::output(out_tv); - for(auto input : inputs){ - if(input->isA() && input->sameAs(aliased_to)){ + for (auto input : inputs) { + if (input->isA() && input->sameAs(aliased_to)) { continue; } } - if(val->isConst()){ + if (out_tv->isConst()) { continue; } has_dynamic_alias_ = true; @@ -654,10 +654,9 @@ void KernelExecutor::initializeExecutorEntry( DataType index_type) { FUSER_PERF_SCOPE("KernelExecutor::initializeExecutorEntry"); - ExpressionEvaluator expr_eval; - evaluatorPrecomputedValues()->bindInputs(args); + ExpressionEvaluator expr_eval = + executor_utils::bindInputs(args, compiled_kernel_->kernel()); expr_eval.precomputedValues() = evaluatorPrecomputedValues().get(); - auto launch_params = computeLaunchParams( launch_constraints, expr_eval, warp_size_, index_type); @@ -1206,7 +1205,7 @@ std::vector KernelExecutor::run( } ExpressionEvaluator expr_eval; - if(has_dynamic_alias_ || has_tma_){ + if (has_dynamic_alias_ || has_TMA_) { expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); } @@ -1360,10 +1359,12 @@ std::vector KernelExecutor::run( } if (args.size() != compiled_kernel_->kernel()->parameters().size()) { - NVF_ERROR(has_tma_ || has_rng_, "No 
TMA or RNG found in the kernel, but detected an argument size mismatch."); + NVF_ERROR( + has_TMA_ || has_rng_, + "No TMA or RNG found in the kernel, but detected an argument size mismatch."); // If args don't match one of two things is happening. We need to add TMA // related args or RNG related args. Resolve these scenarios. - if (has_tma_) { + if (has_TMA_) { // Resolving TMA requires binding all values and evaluating the TMA // arguments args = resolveTMA(*executor_entry, args); diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index 6d79300a067..2fa54473a12 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -314,13 +314,16 @@ class KernelExecutor : public ExecutorAbstract { int64_t warp_size_ = 0; - // Has an RNG kernel and therefore needs to infer RNG state through expression evaluator + // Has an RNG kernel and therefore needs to infer RNG state through expression + // evaluator bool has_rng_ = false; - // Has a TMA kernel and therefore needs to infer TMA inputs through expression evaluator + // Has a TMA kernel and therefore needs to infer TMA inputs through expression + // evaluator bool has_TMA_ = false; - // Has a dynamic alias and therefore needs to infer what they are through expression evaluator + // Has a dynamic alias and therefore needs to infer what they are through + // expression evaluator bool has_dynamic_alias_ = false; // lookup table to take short cut to retrieve recorded information in order to From 561dfb1a10e4a8bf34341a36799c99f74ee60135 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Wed, 26 Feb 2025 17:11:54 -0800 Subject: [PATCH 14/35] Slice fix. 
--- csrc/runtime/allocations.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 4688e8a3338..c98b7509bab 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -409,11 +409,13 @@ std::vector allocateKernelOutputs( ee.bind( fusion->inputs()[entry.output_aliased_to_input.at(out_idx)], input_arg); + auto output = ee.evaluate(out_info.tv).as(); std::cout << "Input arg: " << debug_str(input_arg.as()) << std::endl; + std::cout << "Aliased output tennsor: " + << fusion->outputs()[out_idx]->toString() << std::endl; std::cout << "Aliased output tensor: " - << debug_str(ee.evaluate(out_info.tv).as()) - << std::endl; + << debug_str(output) << std::endl; NVF_ERROR( input_arg.is(), "Aliased input argument is not a tensor."); @@ -421,11 +423,12 @@ std::vector allocateKernelOutputs( out_info.shape_info.logical_sizes || input_arg.as().strides() != out_info.shape_info.logical_strides) { - out_tensors.emplace_back(input_arg.as().as_strided( + std::cout<<"As strided?"<()); + out_tensors.emplace_back(output); } std::cout << "Aliasing T" << out_info.tv->name() << " to T" From 868afb72aefde3266c35d3f624dcf52eeb7d054c Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Wed, 26 Feb 2025 18:01:03 -0800 Subject: [PATCH 15/35] Debugging. 
--- csrc/runtime/executor.cpp | 5 +++++ tests/cpp/test_multidevice_sharding.cpp | 16 ++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 8bfd8cb7ef1..140c42f862c 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -717,6 +717,11 @@ void KernelExecutor::initializeExecutorEntry( args[inp_idx].as().strides().vec(); shape_info.allocation_sizes = alloc_sizes; shape_info.allocation_strides = alloc_strides; + std::cout<<"Input shape info: "<setAllocationDomain(tv->getLoopDomain(), true); } - - const int64_t b = 2; - const int64_t h = d * 3; - const int64_t s = 5; + x->setContiguity({false, false, true, true, true}); + const int64_t b = 1; + const int64_t h = d * 2; + const int64_t s = 2; at::Tensor unsharded_x_tensor = at::randint(5, {b, h, s, s}, tensor_options); - at::Tensor x_tensor = shardTensor(unsharded_x_tensor, x); - + + // at::Tensor x_tensor = shardTensor(unsharded_x_tensor, x); + at::Tensor x_tensor = at::randint(5, {b, h/d, s, s}, tensor_options); + std::cout<<"Properties: "< Date: Thu, 27 Feb 2025 14:43:49 -0800 Subject: [PATCH 16/35] Debugging still. --- csrc/runtime/allocations.cpp | 31 +++++- csrc/runtime/allocations.h | 1 + csrc/runtime/executor.cpp | 120 +++++++++++++++++++++--- csrc/serde/fusion_cache.fbs | 1 + csrc/tensor_metadata.cpp | 5 + tests/cpp/test_multidevice_sharding.cpp | 16 ++-- 6 files changed, 148 insertions(+), 26 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index c98b7509bab..cf9626599a9 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -262,6 +263,8 @@ at::Tensor allocateTensor( // Handle a fusion with duplicated outputs. 
TensorView* out_tv = out_info.tv; if (ee.isKnown(out_tv)) { + std::cout << "Evaluated 2: \n" + << ee.evaluate(out_tv).as() << std::endl; return ee.evaluate(out_tv).as(); } @@ -294,6 +297,7 @@ at::Tensor allocateTensor( if (shouldFillAllocationWithNan()) { fillTensorWithNan(alloc_tensor); } + std::cout << "Allocated 0: \n" << alloc_tensor << std::endl; return alloc_tensor; } case AllocationType::ReuseBuffer: @@ -314,6 +318,7 @@ at::Tensor allocateTensor( aliased_io->toString()); inferAndValidateAllocationSizesAndStrides(out_tensor, out_tv, ee); } + std::cout << "Evaluated 1: \n" << out_tensor << std::endl; return out_tensor; } default: @@ -394,8 +399,8 @@ std::vector allocateKernelOutputs( c10::nullopt, device, c10::nullopt); - std::cout << "Allocating output tensor: " << debug_str(alloc_tensor) - << std::endl; + std::cout << "Allocating output tensor:\n" + << debug_str(alloc_tensor) << std::endl; if (shouldFillAllocationWithNan()) { fillTensorWithNan(alloc_tensor); } @@ -414,8 +419,7 @@ std::vector allocateKernelOutputs( << std::endl; std::cout << "Aliased output tennsor: " << fusion->outputs()[out_idx]->toString() << std::endl; - std::cout << "Aliased output tensor: " - << debug_str(output) << std::endl; + std::cout << "Aliased output tensor: " << debug_str(output) << std::endl; NVF_ERROR( input_arg.is(), "Aliased input argument is not a tensor."); @@ -423,7 +427,12 @@ std::vector allocateKernelOutputs( out_info.shape_info.logical_sizes || input_arg.as().strides() != out_info.shape_info.logical_strides) { - std::cout<<"As strided?"< getInputBufferInfos( TensorShapeInfo shape_info; shape_info.logical_sizes = logical_sizes; shape_info.logical_strides = logical_strides; + if (isSharded(buffer_info.tv)) { + shape_info.unsharded_logical_sizes = + unshardedSizes(buffer_info.tv, logical_sizes); + } buffer_info.shape_info = shape_info; buffer_info.type = inputs[i].scalar_type(); @@ -921,6 +934,8 @@ TensorShapeInfo inferTensorShapes( return TensorShapeInfo{ 
tensor.sizes().vec(), tensor.strides().vec(), + isSharded(tv) ? unshardedSizes(tv, tensor.sizes().vec()) + : tensor.sizes().vec(), tensor.sizes().vec(), tensor.strides().vec()}; } @@ -929,6 +944,8 @@ TensorShapeInfo inferTensorShapes( return TensorShapeInfo{ tensor.sizes().vec(), tensor.strides().vec(), + isSharded(tv) ? unshardedSizes(tv, tensor.sizes().vec()) + : tensor.sizes().vec(), allocation_size_stride.first, allocation_size_stride.second}; } @@ -939,6 +956,8 @@ TensorShapeInfo inferTensorShapes( return TensorShapeInfo{ allocation_size_stride.first, allocation_size_stride.second, + isSharded(tv) ? unshardedSizes(tv, allocation_size_stride.first) + : allocation_size_stride.first, allocation_size_stride.first, allocation_size_stride.second}; } @@ -955,6 +974,8 @@ TensorShapeInfo inferTensorShapes( return { logical_meta_tensor.sizes().vec(), logical_meta_tensor.strides().vec(), + isSharded(tv) ? unshardedSizes(tv, logical_meta_tensor.sizes().vec()) + : logical_meta_tensor.sizes().vec(), allocation_size_stride.first, allocation_size_stride.second}; } diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 0f6e5b4a3f1..b2984d48629 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -20,6 +20,7 @@ struct KernelExecutorEntry; struct TensorShapeInfo { std::vector logical_sizes; std::vector logical_strides; + std::vector unsharded_logical_sizes; std::vector allocation_sizes; std::vector allocation_strides; }; diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 140c42f862c..bbf652fa06d 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -172,7 +173,24 @@ void KernelExecutor::compile( CompileParams compile_params, SchedulerType scheduler_type) { FUSER_PERF_SCOPE("KernelExecutor::compile"); - + for (auto input : fusion->inputs()) { + if (input->isA()) { + auto tv = input->as(); + 
tv->setContiguity(std::vector>( + TensorDomain::noReductions(tv->getMaybeAllocationDomain()).size(), + false)); + } + } + for (auto output : fusion->outputs()) { + if (output->isA()) { + auto tv = output->as(); + tv->setContiguity(std::vector>( + TensorDomain::noReductions(tv->getMaybeAllocationDomain()).size(), + false)); + } + } + fusion->print(); + fusion->printKernel(); NVF_ERROR( supported(fusion), "KernelExecutor does not support the Fusion provided."); @@ -360,7 +378,7 @@ LaunchParams KernelExecutor::computeLaunchParams( parallel_iter_extents, launch_constraints); expr_eval.precomputedValues()->evaluate(); } - + std::cout << "====================" << std::endl; // If any dimension was set in launch constraints we need to run through // IterDomains that have been parallelized, and bind those values. Or make // sure if they could be inferred the inference matches what was set. @@ -371,6 +389,9 @@ LaunchParams KernelExecutor::computeLaunchParams( for (auto extent : parallel_extents) { auto inferred_val = expr_eval.evaluate(extent); if (inferred_val.hasValue()) { + std::cout << "Inferring val: " << extent->toInlineString() + << std::endl; + std::cout << inferred_val.as() << std::endl; // This value could have been inferred, make sure it was set right. 
bool valid = inferred_val.as() == launch_constraints.getDim(p_type) || @@ -381,9 +402,13 @@ LaunchParams KernelExecutor::computeLaunchParams( "this may be due to mixed broadcast axes that are parallelized."); } } else if (!expr_eval.precomputedValues()) { + std::cout << "Binding val: " << extent->toInlineString() << std::endl; + std::cout << launch_constraints.getDim(p_type) << std::endl; expr_eval.bind(extent, launch_constraints.getDim(p_type)); } if (!launch_params.hasDim(p_type)) { + std::cout << "Binding val: " << p_type << std::endl; + std::cout << launch_constraints.getDim(p_type) << std::endl; // Bind the launch constraint into our evaluation context launch_params.bind(launch_constraints.getDim(p_type), p_type); // Makes sure the p-types bound to evaluators are the @@ -396,10 +421,13 @@ LaunchParams KernelExecutor::computeLaunchParams( } } + std::cout << "====================" << std::endl; // Run through the rest of the parallel IterDomains and infer their size for (auto [p_type, extent] : simplified_parallel_iter_extents) { FUSER_PERF_SCOPE("KernelExecutor::ParallelBindingResolution"); auto val = expr_eval.evaluate(extent); + std::cout << "Evaluating val: " << extent->toInlineString() << std::endl; + std::cout << val.as() << std::endl; NVF_ERROR( val.hasValue(), "Tried to evaluate the extent, ", @@ -414,6 +442,7 @@ LaunchParams KernelExecutor::computeLaunchParams( } } + std::cout << "====================" << std::endl; // Re-run the integer machine with all // the thread sizes now determined. 
if (expr_eval.precomputedValues()) { @@ -659,7 +688,13 @@ void KernelExecutor::initializeExecutorEntry( expr_eval.precomputedValues() = evaluatorPrecomputedValues().get(); auto launch_params = computeLaunchParams( launch_constraints, expr_eval, warp_size_, index_type); + std::cout << "Constraints: " << launch_constraints.toString() << std::endl; + std::cout << "Launch params: " << launch_params.toString() << std::endl; + + std::cout << "Expr eval:\n"; + expr_eval.print(); + // NVF_THROW("Stop here"); for (const auto& entry : compiled_kernel_->kernel()->summary().validations) { NVF_CHECK(expr_eval.evaluate(entry.first).as(), entry.second); } @@ -715,13 +750,23 @@ void KernelExecutor::initializeExecutorEntry( shape_info.logical_sizes = args[inp_idx].as().sizes().vec(); shape_info.logical_strides = args[inp_idx].as().strides().vec(); + if (isSharded(input_tv)) { + std::cout << "input_tv is sharded" << std::endl; + shape_info.unsharded_logical_sizes = + unshardedSizes(input_tv, shape_info.logical_sizes); + std::cout << "unsharded_logical_sizes: " + << shape_info.unsharded_logical_sizes << std::endl; + } shape_info.allocation_sizes = alloc_sizes; shape_info.allocation_strides = alloc_strides; - std::cout<<"Input shape info: "<(); auto data = tensor.data_ptr(); - const auto& logical_size = - linear_buffer_info_getter(entry, buffer_info_idx) - .shape_info.logical_sizes; - const auto& alloc_stride = - linear_buffer_info_getter(entry, buffer_info_idx) - .shape_info.allocation_strides; + const auto& buffer_info = + linear_buffer_info_getter(entry, buffer_info_idx); + const auto& logical_size = buffer_info.shape_info.logical_sizes.size() == + buffer_info.shape_info.unsharded_logical_sizes.size() + ? 
buffer_info.shape_info.unsharded_logical_sizes + : buffer_info.shape_info.logical_sizes; + const auto& alloc_stride = buffer_info.shape_info.allocation_strides; + std::cout << "Populating buffer info for tensor"; + std::cout << " pointer: " << data; + std::cout << " logical size: " << buffer_info.shape_info.logical_sizes + << "\n"; + std::cout << " unsharded logical size: " + << buffer_info.shape_info.unsharded_logical_sizes << "\n"; + std::cout << " Unsharded logical size: " << logical_size << "\n"; + std::cout << " alloc stride: " << alloc_stride << "\n"; buffer_info_idx++; // special handle for TensorMetaData so that CPU overhead is minimal. if (idx_type == PrimDataType::Int) { @@ -1438,6 +1492,10 @@ std::vector KernelExecutor::run( << ", warps_per_sm=" << warps_per_sm << ", occupancy=" << oss.str() << std::endl; } + std::cout << "Running KE with args:\n"; + for (const auto& arg : args) { + std::cout << debug_str(arg.as()) << "\n"; + } if (!compiled_kernel_->kernel()->summary().has_cooperative_grid_reduction) { FUSER_PERF_SCOPE("ExecutorRunFusion::cuLaunchKernel"); @@ -1480,6 +1538,37 @@ std::vector KernelExecutor::run( sprof.stopKernel(); sprof.outputBytesAccessed(computeBytes(outputs)); } + std::cout << "\n\n"; + std::cout << "Entry input info:\n"; + for (auto entry : executor_entry->inputs) { + std::cout << "T" << entry.tv->name() << "\n"; + std::cout << " Logical sizes: " << entry.shape_info.logical_sizes << "\n"; + std::cout << " Logical strides: " << entry.shape_info.logical_strides + << "\n"; + std::cout << " Allocation sizes: " << entry.shape_info.allocation_sizes + << "\n"; + std::cout << " Allocation strides: " << entry.shape_info.allocation_strides + << "\n"; + } + + std::cout << "Entry output info:\n"; + for (auto entry : executor_entry->outputs) { + std::cout << "T" << entry.tv->name() << "\n"; + std::cout << " Logical sizes: " << entry.shape_info.logical_sizes << "\n"; + std::cout << " Logical strides: " << entry.shape_info.logical_strides + << 
"\n"; + std::cout << " Allocation sizes: " << entry.shape_info.allocation_sizes + << "\n"; + std::cout << " Allocation strides: " << entry.shape_info.allocation_strides + << "\n"; + } + + std::cout << "Ran KE with args:\n"; + for (const auto& arg : args) { + std::cout << debug_str(arg.as()) << "\n"; + } + std::cout << "Launch params: " << launch_params_.toString() << "\n"; + std::cout << std::endl; return outputs; } @@ -1635,6 +1724,7 @@ flatbuffers::Offset KernelExecutor::serialize( tv_position, &data.shape_info.logical_sizes, &data.shape_info.logical_strides, + &data.shape_info.unsharded_logical_sizes, &data.shape_info.allocation_sizes, &data.shape_info.allocation_strides, nvfuser::toUnderlying(data.type), @@ -1766,6 +1856,10 @@ GlobalBufferInfo KernelExecutor::deserialize( shape_info.logical_strides.emplace_back(dim_stride); } + for (auto dim_size : *buffer->unsharded_logical_sizes()) { + shape_info.unsharded_logical_sizes.emplace_back(dim_size); + } + for (auto dim_size : *buffer->alloc_sizes()) { shape_info.allocation_sizes.emplace_back(dim_size); } diff --git a/csrc/serde/fusion_cache.fbs b/csrc/serde/fusion_cache.fbs index 0d7cdd97d99..3ff755b2c15 100644 --- a/csrc/serde/fusion_cache.fbs +++ b/csrc/serde/fusion_cache.fbs @@ -217,6 +217,7 @@ table GlobalBufferInfo { tv : long = -1; logical_sizes : [long]; logical_strides : [long]; + unsharded_logical_sizes : [long]; alloc_sizes : [long]; alloc_strides : [long]; dtype : long; diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp index e5dd17f4aca..41fb5e50d47 100644 --- a/csrc/tensor_metadata.cpp +++ b/csrc/tensor_metadata.cpp @@ -310,6 +310,8 @@ inferAndValidateAllocationSizesAndStrides( std::vector allocation_strides; allocation_sizes.reserve(alloc.size()); allocation_strides.reserve(alloc.size()); + std::cout << "Processing allocation domain for: " << tv->name() << "\n"; + std::cout << tv->getMaybeAllocationDomain() << std::endl; for (IterDomain* id : TensorDomain::noReductions(alloc)) { if 
(id->isDeviceDim()) { allocation_sizes.push_back(1); @@ -317,7 +319,10 @@ inferAndValidateAllocationSizesAndStrides( allocation_sizes.push_back(active_ids.at(id).first); } allocation_strides.push_back(active_ids.at(id).second); + std::cout << allocation_sizes.back() << " " << allocation_strides.back() + << " for ID: " << id->toString() << "\n"; } + std::cout << std::endl; // Only validate final sizes and strides when we have a non-empty tensor. if (tensor.numel() != 0) { diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp index 5675a81273e..2cf57bcf8f1 100644 --- a/tests/cpp/test_multidevice_sharding.cpp +++ b/tests/cpp/test_multidevice_sharding.cpp @@ -241,18 +241,18 @@ TEST_F(MultiDeviceTest, DivideBySum) { tv->setAllocationDomain(tv->getLoopDomain(), true); } x->setContiguity({false, false, true, true, true}); - const int64_t b = 1; - const int64_t h = d * 2; - const int64_t s = 2; + const int64_t b = 7; + const int64_t h = d * 5; + const int64_t s = 3; at::Tensor unsharded_x_tensor = at::randint(5, {b, h, s, s}, tensor_options); - + // at::Tensor x_tensor = shardTensor(unsharded_x_tensor, x); - at::Tensor x_tensor = at::randint(5, {b, h/d, s, s}, tensor_options); - std::cout<<"Properties: "< Date: Thu, 27 Feb 2025 15:15:13 -0800 Subject: [PATCH 17/35] Distrbuted tests working. --- csrc/runtime/allocations.cpp | 36 --------------- csrc/runtime/executor.cpp | 89 ------------------------------------ csrc/tensor_metadata.cpp | 5 -- 3 files changed, 130 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index cf9626599a9..5c56bd0d01e 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -263,8 +263,6 @@ at::Tensor allocateTensor( // Handle a fusion with duplicated outputs. 
TensorView* out_tv = out_info.tv; if (ee.isKnown(out_tv)) { - std::cout << "Evaluated 2: \n" - << ee.evaluate(out_tv).as() << std::endl; return ee.evaluate(out_tv).as(); } @@ -297,7 +295,6 @@ at::Tensor allocateTensor( if (shouldFillAllocationWithNan()) { fillTensorWithNan(alloc_tensor); } - std::cout << "Allocated 0: \n" << alloc_tensor << std::endl; return alloc_tensor; } case AllocationType::ReuseBuffer: @@ -318,7 +315,6 @@ at::Tensor allocateTensor( aliased_io->toString()); inferAndValidateAllocationSizesAndStrides(out_tensor, out_tv, ee); } - std::cout << "Evaluated 1: \n" << out_tensor << std::endl; return out_tensor; } default: @@ -399,8 +395,6 @@ std::vector allocateKernelOutputs( c10::nullopt, device, c10::nullopt); - std::cout << "Allocating output tensor:\n" - << debug_str(alloc_tensor) << std::endl; if (shouldFillAllocationWithNan()) { fillTensorWithNan(alloc_tensor); } @@ -415,11 +409,6 @@ std::vector allocateKernelOutputs( fusion->inputs()[entry.output_aliased_to_input.at(out_idx)], input_arg); auto output = ee.evaluate(out_info.tv).as(); - std::cout << "Input arg: " << debug_str(input_arg.as()) - << std::endl; - std::cout << "Aliased output tennsor: " - << fusion->outputs()[out_idx]->toString() << std::endl; - std::cout << "Aliased output tensor: " << debug_str(output) << std::endl; NVF_ERROR( input_arg.is(), "Aliased input argument is not a tensor."); @@ -427,28 +416,12 @@ std::vector allocateKernelOutputs( out_info.shape_info.logical_sizes || input_arg.as().strides() != out_info.shape_info.logical_strides) { - std::cout << "As strided?" 
<< std::endl; - std::cout << "Aliasing output tensor: \n" - << output.as_strided( - out_info.shape_info.logical_sizes, - out_info.shape_info.logical_strides) - << std::endl; out_tensors.emplace_back(output.as_strided( out_info.shape_info.logical_sizes, out_info.shape_info.logical_strides)); } else { out_tensors.emplace_back(output); } - std::cout - << "Aliasing T" << out_info.tv->name() << " to T" - << fusion->inputs()[entry.output_aliased_to_input.at(out_idx)]->name() - << std::endl; - std::cout << "Aliased output tensor: " << debug_str(out_tensors.back()) - << std::endl; - std::cout << "Aliased output tensor logical sizes: " - << out_info.shape_info.logical_sizes << std::endl; - std::cout << "Aliased output tensor logical strides: " - << out_info.shape_info.logical_strides << std::endl; } } return out_tensors; @@ -466,15 +439,6 @@ GlobalBufferInfo getBufferInfo( auto dtype = (info.tv->dtype() == DataType::Index ? index_dtype : info.tv->dtype()); info.type = data_type_to_aten(dtype); - std::cout << " Getting global buffer info for T" << tv->name() << " " - << tv->getLogicalDomain() << std::endl; - std::cout << "Logical sizes: " << info.shape_info.logical_sizes << std::endl; - std::cout << "Allocation sizes: " << info.shape_info.allocation_sizes - << std::endl; - std::cout << "Logical strides: " << info.shape_info.logical_strides - << std::endl; - std::cout << "Allocation strides: " << info.shape_info.allocation_strides - << std::endl; return info; } diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index bbf652fa06d..cf575b3c1c2 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -173,24 +173,6 @@ void KernelExecutor::compile( CompileParams compile_params, SchedulerType scheduler_type) { FUSER_PERF_SCOPE("KernelExecutor::compile"); - for (auto input : fusion->inputs()) { - if (input->isA()) { - auto tv = input->as(); - tv->setContiguity(std::vector>( - TensorDomain::noReductions(tv->getMaybeAllocationDomain()).size(), - 
false)); - } - } - for (auto output : fusion->outputs()) { - if (output->isA()) { - auto tv = output->as(); - tv->setContiguity(std::vector>( - TensorDomain::noReductions(tv->getMaybeAllocationDomain()).size(), - false)); - } - } - fusion->print(); - fusion->printKernel(); NVF_ERROR( supported(fusion), "KernelExecutor does not support the Fusion provided."); @@ -378,7 +360,6 @@ LaunchParams KernelExecutor::computeLaunchParams( parallel_iter_extents, launch_constraints); expr_eval.precomputedValues()->evaluate(); } - std::cout << "====================" << std::endl; // If any dimension was set in launch constraints we need to run through // IterDomains that have been parallelized, and bind those values. Or make // sure if they could be inferred the inference matches what was set. @@ -389,9 +370,6 @@ LaunchParams KernelExecutor::computeLaunchParams( for (auto extent : parallel_extents) { auto inferred_val = expr_eval.evaluate(extent); if (inferred_val.hasValue()) { - std::cout << "Inferring val: " << extent->toInlineString() - << std::endl; - std::cout << inferred_val.as() << std::endl; // This value could have been inferred, make sure it was set right. 
bool valid = inferred_val.as() == launch_constraints.getDim(p_type) || @@ -402,13 +380,9 @@ LaunchParams KernelExecutor::computeLaunchParams( "this may be due to mixed broadcast axes that are parallelized."); } } else if (!expr_eval.precomputedValues()) { - std::cout << "Binding val: " << extent->toInlineString() << std::endl; - std::cout << launch_constraints.getDim(p_type) << std::endl; expr_eval.bind(extent, launch_constraints.getDim(p_type)); } if (!launch_params.hasDim(p_type)) { - std::cout << "Binding val: " << p_type << std::endl; - std::cout << launch_constraints.getDim(p_type) << std::endl; // Bind the launch constraint into our evaluation context launch_params.bind(launch_constraints.getDim(p_type), p_type); // Makes sure the p-types bound to evaluators are the @@ -421,13 +395,10 @@ LaunchParams KernelExecutor::computeLaunchParams( } } - std::cout << "====================" << std::endl; // Run through the rest of the parallel IterDomains and infer their size for (auto [p_type, extent] : simplified_parallel_iter_extents) { FUSER_PERF_SCOPE("KernelExecutor::ParallelBindingResolution"); auto val = expr_eval.evaluate(extent); - std::cout << "Evaluating val: " << extent->toInlineString() << std::endl; - std::cout << val.as() << std::endl; NVF_ERROR( val.hasValue(), "Tried to evaluate the extent, ", @@ -442,7 +413,6 @@ LaunchParams KernelExecutor::computeLaunchParams( } } - std::cout << "====================" << std::endl; // Re-run the integer machine with all // the thread sizes now determined. 
if (expr_eval.precomputedValues()) { @@ -688,11 +658,6 @@ void KernelExecutor::initializeExecutorEntry( expr_eval.precomputedValues() = evaluatorPrecomputedValues().get(); auto launch_params = computeLaunchParams( launch_constraints, expr_eval, warp_size_, index_type); - std::cout << "Constraints: " << launch_constraints.toString() << std::endl; - std::cout << "Launch params: " << launch_params.toString() << std::endl; - - std::cout << "Expr eval:\n"; - expr_eval.print(); // NVF_THROW("Stop here"); for (const auto& entry : compiled_kernel_->kernel()->summary().validations) { @@ -751,22 +716,11 @@ void KernelExecutor::initializeExecutorEntry( shape_info.logical_strides = args[inp_idx].as().strides().vec(); if (isSharded(input_tv)) { - std::cout << "input_tv is sharded" << std::endl; shape_info.unsharded_logical_sizes = unshardedSizes(input_tv, shape_info.logical_sizes); - std::cout << "unsharded_logical_sizes: " - << shape_info.unsharded_logical_sizes << std::endl; } shape_info.allocation_sizes = alloc_sizes; shape_info.allocation_strides = alloc_strides; - std::cout << "Input shape info: " << std::endl; - std::cout << "Logical sizes: " << shape_info.logical_sizes << std::endl; - std::cout << "Allocation sizes: " << shape_info.allocation_sizes - << std::endl; - std::cout << "Logical strides: " << shape_info.logical_strides - << std::endl; - std::cout << "Allocation strides: " << shape_info.allocation_strides - << std::endl; GlobalBufferInfo info( input_tv, shape_info, @@ -1023,14 +977,6 @@ void KernelExecutor::computeArgs2( ? 
buffer_info.shape_info.unsharded_logical_sizes : buffer_info.shape_info.logical_sizes; const auto& alloc_stride = buffer_info.shape_info.allocation_strides; - std::cout << "Populating buffer info for tensor"; - std::cout << " pointer: " << data; - std::cout << " logical size: " << buffer_info.shape_info.logical_sizes - << "\n"; - std::cout << " unsharded logical size: " - << buffer_info.shape_info.unsharded_logical_sizes << "\n"; - std::cout << " Unsharded logical size: " << logical_size << "\n"; - std::cout << " alloc stride: " << alloc_stride << "\n"; buffer_info_idx++; // special handle for TensorMetaData so that CPU overhead is minimal. if (idx_type == PrimDataType::Int) { @@ -1492,10 +1438,6 @@ std::vector KernelExecutor::run( << ", warps_per_sm=" << warps_per_sm << ", occupancy=" << oss.str() << std::endl; } - std::cout << "Running KE with args:\n"; - for (const auto& arg : args) { - std::cout << debug_str(arg.as()) << "\n"; - } if (!compiled_kernel_->kernel()->summary().has_cooperative_grid_reduction) { FUSER_PERF_SCOPE("ExecutorRunFusion::cuLaunchKernel"); @@ -1538,37 +1480,6 @@ std::vector KernelExecutor::run( sprof.stopKernel(); sprof.outputBytesAccessed(computeBytes(outputs)); } - std::cout << "\n\n"; - std::cout << "Entry input info:\n"; - for (auto entry : executor_entry->inputs) { - std::cout << "T" << entry.tv->name() << "\n"; - std::cout << " Logical sizes: " << entry.shape_info.logical_sizes << "\n"; - std::cout << " Logical strides: " << entry.shape_info.logical_strides - << "\n"; - std::cout << " Allocation sizes: " << entry.shape_info.allocation_sizes - << "\n"; - std::cout << " Allocation strides: " << entry.shape_info.allocation_strides - << "\n"; - } - - std::cout << "Entry output info:\n"; - for (auto entry : executor_entry->outputs) { - std::cout << "T" << entry.tv->name() << "\n"; - std::cout << " Logical sizes: " << entry.shape_info.logical_sizes << "\n"; - std::cout << " Logical strides: " << entry.shape_info.logical_strides - << "\n"; - 
std::cout << " Allocation sizes: " << entry.shape_info.allocation_sizes - << "\n"; - std::cout << " Allocation strides: " << entry.shape_info.allocation_strides - << "\n"; - } - - std::cout << "Ran KE with args:\n"; - for (const auto& arg : args) { - std::cout << debug_str(arg.as()) << "\n"; - } - std::cout << "Launch params: " << launch_params_.toString() << "\n"; - std::cout << std::endl; return outputs; } diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp index 41fb5e50d47..e5dd17f4aca 100644 --- a/csrc/tensor_metadata.cpp +++ b/csrc/tensor_metadata.cpp @@ -310,8 +310,6 @@ inferAndValidateAllocationSizesAndStrides( std::vector allocation_strides; allocation_sizes.reserve(alloc.size()); allocation_strides.reserve(alloc.size()); - std::cout << "Processing allocation domain for: " << tv->name() << "\n"; - std::cout << tv->getMaybeAllocationDomain() << std::endl; for (IterDomain* id : TensorDomain::noReductions(alloc)) { if (id->isDeviceDim()) { allocation_sizes.push_back(1); @@ -319,10 +317,7 @@ inferAndValidateAllocationSizesAndStrides( allocation_sizes.push_back(active_ids.at(id).first); } allocation_strides.push_back(active_ids.at(id).second); - std::cout << allocation_sizes.back() << " " << allocation_strides.back() - << " for ID: " << id->toString() << "\n"; } - std::cout << std::endl; // Only validate final sizes and strides when we have a non-empty tensor. if (tensor.numel() != 0) { From f5628962f0fe53696f2e651c17d6a28cf3c46060 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sat, 1 Mar 2025 07:17:54 -0800 Subject: [PATCH 18/35] Remove precomputed values from initialize executor entry, for some reason it's not synced correctly. 
--- csrc/runtime/executor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index cf575b3c1c2..c85d05f6d0c 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -655,7 +655,7 @@ void KernelExecutor::initializeExecutorEntry( ExpressionEvaluator expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); - expr_eval.precomputedValues() = evaluatorPrecomputedValues().get(); + // expr_eval.precomputedValues() = evaluatorPrecomputedValues().get(); auto launch_params = computeLaunchParams( launch_constraints, expr_eval, warp_size_, index_type); From eb291c5414c19bc250af81de94fd692eb6881da3 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sat, 1 Mar 2025 09:31:40 -0800 Subject: [PATCH 19/35] Fix most failures. --- csrc/runtime/allocations.cpp | 104 ++++++++++++++++------------- csrc/runtime/allocations.h | 10 ++- csrc/runtime/executor.cpp | 44 +++++++++--- csrc/runtime/executor_kernel_arg.h | 4 ++ 4 files changed, 104 insertions(+), 58 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 5c56bd0d01e..5d64275599b 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -374,17 +374,18 @@ std::vector allocateOutputs( return out_tensors; } -std::vector allocateKernelOutputs( +KernelArgumentHolder allocateKernelOutputs( const Fusion* fusion, const KernelExecutorEntry& entry, const c10::Device& device, - const KernelArgumentHolder& args) { + const KernelArgumentHolder& args, + bool dynamic_alias) { FUSER_PERF_SCOPE("fusion_executor::allocations::allocateOutputs"); // TODO: Figure out if output to output aliasing is needed - std::vector out_tensors; - out_tensors.reserve(entry.outputs.size()); + KernelArgumentHolder out_tensors; + out_tensors.resize(entry.outputs.size()); for (auto out_idx : c10::irange(entry.outputs.size())) { auto out_info = entry.outputs.at(out_idx); if 
(entry.output_aliased_to_input.at(out_idx) == -1) { @@ -398,57 +399,38 @@ std::vector allocateKernelOutputs( if (shouldFillAllocationWithNan()) { fillTensorWithNan(alloc_tensor); } - out_tensors.emplace_back(alloc_tensor); - } else { - NVF_ERROR( - entry.output_aliased_to_input.at(out_idx) <= (int64_t)args.size(), - "Tried to grab an out of range input argument."); - auto input_arg = args[entry.output_aliased_to_input.at(out_idx)]; + out_tensors[out_idx] = alloc_tensor; + } else if ( + fusion->getOutputAlias(out_info.tv).type == + AllocationType::ReuseBuffer) { + auto inp = args[entry.output_aliased_to_input.at(out_idx)]; + NVF_ERROR(inp.is(), "Input is not a Tensor"); + out_tensors[out_idx] = inp; + } else if ( + fusion->getOutputAlias(out_info.tv).type == AllocationType::Evaluate) { + if (dynamic_alias) { + out_tensors[out_idx] = std::monostate(); + continue; + } + ExpressionEvaluator ee; ee.bind( - fusion->inputs()[entry.output_aliased_to_input.at(out_idx)], - input_arg); - auto output = ee.evaluate(out_info.tv).as(); - NVF_ERROR( - input_arg.is(), - "Aliased input argument is not a tensor."); - if (input_arg.as().sizes() != - out_info.shape_info.logical_sizes || - input_arg.as().strides() != - out_info.shape_info.logical_strides) { - out_tensors.emplace_back(output.as_strided( - out_info.shape_info.logical_sizes, - out_info.shape_info.logical_strides)); - } else { - out_tensors.emplace_back(output); - } + fusion->getOutputAlias(out_info.tv).aliased_io, + args[entry.output_aliased_to_input.at(out_idx)]); + out_tensors[out_idx] = ee.evaluate(out_info.tv); + } else { + NVF_THROW( + "Unexpected allocation path, internal logic around allocations must be incorrect."); } } return out_tensors; } -namespace { -GlobalBufferInfo getBufferInfo( - ExpressionEvaluator& expr_eval, - DataType index_dtype, - TensorView* tv) { - FUSER_PERF_SCOPE("fusion_executor::allocations::getBufferInfo"); - GlobalBufferInfo info; - info.tv = tv; - info.shape_info = inferTensorShapes(info.tv, 
expr_eval); - auto dtype = - (info.tv->dtype() == DataType::Index ? index_dtype : info.tv->dtype()); - info.type = data_type_to_aten(dtype); - return info; -} - -} // namespace - std::vector getBufferInfos( ExpressionEvaluator& expr_eval, DataType index_dtype, const std::vector& fusion_outputs) { - FUSER_PERF_SCOPE("fusion_executor::allocations::getOutbufferInfo"); + FUSER_PERF_SCOPE("fusion_executor::allocations::getOutbufferInfos"); std::vector output_buffer_infos; output_buffer_infos.reserve(fusion_outputs.size()); for (const auto out : fusion_outputs) { @@ -456,8 +438,14 @@ std::vector getBufferInfos( out->isA(), "Cannot allocate outputs that are not tensors."); - output_buffer_infos.emplace_back( - getBufferInfo(expr_eval, index_dtype, out->as())); + GlobalBufferInfo info; + info.tv = out->as(); + info.shape_info = inferTensorShapes(info.tv, expr_eval); + auto dtype = + (info.tv->dtype() == DataType::Index ? index_dtype : info.tv->dtype()); + info.type = data_type_to_aten(dtype); + + output_buffer_infos.emplace_back(info); } return output_buffer_infos; } @@ -889,12 +877,30 @@ std::vector getInputBufferInfos( TensorShapeInfo inferTensorShapes( TensorView* tv, const ExpressionEvaluator& expr_eval) { + std::cout << "Infer shape: " << tv->toString() << std::endl; // Alias handling: auto alias_info = tv->fusion()->getOutputAlias(tv); - if (alias_info.type != AllocationType::New) { + if (alias_info.type == AllocationType::Evaluate) { + std::cout << "Evaluating" << std::endl; + std::cout << "Output: " << tv->toString() << std::endl; + auto inps = InputsOf::output(tv); + std::cout << "Inputs: " << std::endl; + for (auto inp : inps) { + std::cout << " " << inp->toString() << std::endl; + } + auto exprs = + DependencyCheck::getAllExprsBetween({inps.begin(), inps.end()}, {tv}); + std::cout << "Expressions: " << std::endl; + for (auto expr : exprs) { + std::cout << " " << expr->toString() << std::endl; + } + auto val = expr_eval.evaluate(tv); + NVF_ERROR(val.hasValue() && 
val.is(), "Output is not a Tensor"); auto tensor = val.as(); + if (!tv->hasAllocation()) { + std::cout << "Inferred 0" << std::endl; return TensorShapeInfo{ tensor.sizes().vec(), tensor.strides().vec(), @@ -903,8 +909,10 @@ TensorShapeInfo inferTensorShapes( tensor.sizes().vec(), tensor.strides().vec()}; } + std::cout << "Allocation" << std::endl; auto allocation_size_stride = inferAndValidateAllocationSizesAndStrides(tensor, tv, expr_eval); + std::cout << "Inferred 1" << std::endl; return TensorShapeInfo{ tensor.sizes().vec(), tensor.strides().vec(), @@ -917,6 +925,7 @@ TensorShapeInfo inferTensorShapes( // Non-alias handling: auto allocation_size_stride = inferAllocationShape(tv, expr_eval); if (!tv->hasAllocation()) { + std::cout << "Inferred 2" << std::endl; return TensorShapeInfo{ allocation_size_stride.first, allocation_size_stride.second, @@ -935,6 +944,7 @@ TensorShapeInfo inferTensorShapes( // `transformFromAllocationToLogical` logical_meta_tensor = transformFromAllocationToLogical(logical_meta_tensor, tv, expr_eval); + std::cout << "Inferred 3" << std::endl; return { logical_meta_tensor.sizes().vec(), logical_meta_tensor.strides().vec(), diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index b2984d48629..29adbd5e254 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -90,11 +90,17 @@ std::vector allocateOutputs( const c10::Device& device, ExpressionEvaluator& ee); -std::vector allocateKernelOutputs( +// Allocate output tensors for a given fusion. Outputs may alias inputs, in +// that case output tensors are shallow copies of the aliased inputs. +// +// If dynamic_alias is true, then any argument with AllocationType::Evaluate +// will not be populated, it will be filled with std::monostate. 
+KernelArgumentHolder allocateKernelOutputs( const Fusion* fusion, const KernelExecutorEntry& entry, const c10::Device& device, - const KernelArgumentHolder& args); + const KernelArgumentHolder& args, + bool dynamic_alias = false); //! Return information necessary for allocating output tensors. Input //! and output tensors are allowed to alias each other, which is diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index c85d05f6d0c..b806e4f761e 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -296,7 +296,7 @@ void KernelExecutor::compile( if (output->isA()) { auto out_tv = output->as(); auto alias_info = fusion->getOutputAlias(out_tv); - if (alias_info.type == AllocationType::New) { + if (alias_info.type != AllocationType::Evaluate) { continue; } auto aliased_to = alias_info.aliased_io->as(); @@ -768,9 +768,6 @@ void KernelExecutor::initializeExecutorEntry( auto out_info = output_info[output_idx]; auto fusion = compiled_kernel_->kernel()->as(); auto alias_info = fusion->getOutputAlias(out_info.tv); - TORCH_WARN_ONCE( - alias_info.type != AllocationType::Evaluate, - "Outputs should not be evaluate type for kernels, this will be ignored and a kernel will produce the output tensor."); if (alias_info.type == AllocationType::New) { continue; } @@ -795,8 +792,8 @@ void KernelExecutor::initializeExecutorEntry( fusion->inputs(), "\nFusion Outputs:\n ", fusion->outputs()); - TORCH_WARN( - "Kernel found with output to output aliasing, this is unsupported in a kernel and will beignored.\n", + NVF_THROW( + "Kernel found with output to output aliasing, this is unsupported at this moment.\n", "Output: ", out_info.tv->toString(), "\nAliased to: ", @@ -1249,6 +1246,7 @@ std::vector KernelExecutor::run( // Initialize the executor entry if not initlized if (!executor_entry->init) { + std::cout << "Initializing executor entry" << std::endl; initializeExecutorEntry( *executor_entry, args, @@ -1256,6 +1254,7 @@ std::vector KernelExecutor::run( 
compile_params, outputs, compiled_kernel_->kernel()->indexType()); + std::cout << "Executor entry initialized" << std::endl; } if (!(executor_entry->launch_params.nThreads() <= @@ -1274,11 +1273,33 @@ std::vector KernelExecutor::run( // only allocate outputs when not given if (outputs.empty()) { - outputs = allocateKernelOutputs( + auto outputs_args = allocateKernelOutputs( compiled_kernel_->kernel(), *executor_entry, compiled_kernel_->device(), - args); + args, + has_dynamic_alias_); + outputs.reserve(outputs_args.size()); + if (has_dynamic_alias_) { + for (const auto i : + c10::irange(compiled_kernel_->kernel()->outputs().size())) { + auto param = compiled_kernel_->kernel()->outputs()[i]; + if (!param->isA()) { + continue; + } + if (compiled_kernel_->kernel() + ->getOutputAlias(param->as()) + .type == AllocationType::Evaluate) { + outputs_args[i] = expr_eval.evaluate(param); + } + } + } + for (const auto i : c10::irange(outputs_args.size())) { + NVF_ERROR( + outputs_args[i].hasValue() && outputs_args[i].is(), + "Output is not populated or not a Tensor"); + outputs.emplace_back(outputs_args[i].as()); + } } args.push(outputs); @@ -1372,17 +1393,22 @@ std::vector KernelExecutor::run( if (has_TMA_) { // Resolving TMA requires binding all values and evaluating the TMA // arguments + std::cout << "Resolving TMA" << std::endl; args = resolveTMA(*executor_entry, args); + std::cout << "TMA resolved" << std::endl; } if (has_rng_) { // Resolving RNG seed requires evaluating and adding those values, but // doesn't require binding all values as getting RNG seed and offset // doesn't depend on other values + std::cout << "Resolving RNG seed" << std::endl; args = resolveRNGSeed(compiled_kernel_->kernel(), args); + std::cout << "RNG seed resolved" << std::endl; } } - + std::cout << "Computing args" << std::endl; computeArgs2(*executor_entry, args); + std::cout << "Args computed" << std::endl; if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { launch_params_.print(); diff 
--git a/csrc/runtime/executor_kernel_arg.h b/csrc/runtime/executor_kernel_arg.h index 40d44866165..6e3852549e6 100644 --- a/csrc/runtime/executor_kernel_arg.h +++ b/csrc/runtime/executor_kernel_arg.h @@ -78,6 +78,10 @@ class NVF_API KernelArgumentHolder { arguments_.reserve(size); } + void resize(size_t size) { + arguments_.resize(size); + } + void push(const std::vector& tensors); void push(const c10::ArrayRef& args); void push(std::initializer_list args) { From e4d94c6dbe8f59c0c9b1227c50c7a6225a8a9090 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sat, 1 Mar 2025 09:38:19 -0800 Subject: [PATCH 20/35] Cleanup. --- csrc/runtime/allocations.cpp | 20 -------------------- csrc/runtime/executor.cpp | 16 ++++++++-------- csrc/utils.cpp | 8 +++----- 3 files changed, 11 insertions(+), 33 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 5d64275599b..39ae7a39c54 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -877,30 +877,14 @@ std::vector getInputBufferInfos( TensorShapeInfo inferTensorShapes( TensorView* tv, const ExpressionEvaluator& expr_eval) { - std::cout << "Infer shape: " << tv->toString() << std::endl; // Alias handling: auto alias_info = tv->fusion()->getOutputAlias(tv); if (alias_info.type == AllocationType::Evaluate) { - std::cout << "Evaluating" << std::endl; - std::cout << "Output: " << tv->toString() << std::endl; - auto inps = InputsOf::output(tv); - std::cout << "Inputs: " << std::endl; - for (auto inp : inps) { - std::cout << " " << inp->toString() << std::endl; - } - auto exprs = - DependencyCheck::getAllExprsBetween({inps.begin(), inps.end()}, {tv}); - std::cout << "Expressions: " << std::endl; - for (auto expr : exprs) { - std::cout << " " << expr->toString() << std::endl; - } - auto val = expr_eval.evaluate(tv); NVF_ERROR(val.hasValue() && val.is(), "Output is not a Tensor"); auto tensor = val.as(); if (!tv->hasAllocation()) { - std::cout << "Inferred 0" << std::endl; 
return TensorShapeInfo{ tensor.sizes().vec(), tensor.strides().vec(), @@ -909,10 +893,8 @@ TensorShapeInfo inferTensorShapes( tensor.sizes().vec(), tensor.strides().vec()}; } - std::cout << "Allocation" << std::endl; auto allocation_size_stride = inferAndValidateAllocationSizesAndStrides(tensor, tv, expr_eval); - std::cout << "Inferred 1" << std::endl; return TensorShapeInfo{ tensor.sizes().vec(), tensor.strides().vec(), @@ -925,7 +907,6 @@ TensorShapeInfo inferTensorShapes( // Non-alias handling: auto allocation_size_stride = inferAllocationShape(tv, expr_eval); if (!tv->hasAllocation()) { - std::cout << "Inferred 2" << std::endl; return TensorShapeInfo{ allocation_size_stride.first, allocation_size_stride.second, @@ -944,7 +925,6 @@ TensorShapeInfo inferTensorShapes( // `transformFromAllocationToLogical` logical_meta_tensor = transformFromAllocationToLogical(logical_meta_tensor, tv, expr_eval); - std::cout << "Inferred 3" << std::endl; return { logical_meta_tensor.sizes().vec(), logical_meta_tensor.strides().vec(), diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index b806e4f761e..a80e90a1bb0 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -1246,7 +1246,7 @@ std::vector KernelExecutor::run( // Initialize the executor entry if not initlized if (!executor_entry->init) { - std::cout << "Initializing executor entry" << std::endl; + // std::cout << "Initializing executor entry" << std::endl; initializeExecutorEntry( *executor_entry, args, @@ -1254,7 +1254,7 @@ std::vector KernelExecutor::run( compile_params, outputs, compiled_kernel_->kernel()->indexType()); - std::cout << "Executor entry initialized" << std::endl; + // std::cout << "Executor entry initialized" << std::endl; } if (!(executor_entry->launch_params.nThreads() <= @@ -1393,22 +1393,22 @@ std::vector KernelExecutor::run( if (has_TMA_) { // Resolving TMA requires binding all values and evaluating the TMA // arguments - std::cout << "Resolving TMA" << std::endl; + 
// std::cout << "Resolving TMA" << std::endl; args = resolveTMA(*executor_entry, args); - std::cout << "TMA resolved" << std::endl; + // std::cout << "TMA resolved" << std::endl; } if (has_rng_) { // Resolving RNG seed requires evaluating and adding those values, but // doesn't require binding all values as getting RNG seed and offset // doesn't depend on other values - std::cout << "Resolving RNG seed" << std::endl; + // std::cout << "Resolving RNG seed" << std::endl; args = resolveRNGSeed(compiled_kernel_->kernel(), args); - std::cout << "RNG seed resolved" << std::endl; + // std::cout << "RNG seed resolved" << std::endl; } } - std::cout << "Computing args" << std::endl; + // std::cout << "Computing args" << std::endl; computeArgs2(*executor_entry, args); - std::cout << "Args computed" << std::endl; + // std::cout << "Args computed" << std::endl; if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { launch_params_.print(); diff --git a/csrc/utils.cpp b/csrc/utils.cpp index 0358908a6e1..c24ae356cbb 100644 --- a/csrc/utils.cpp +++ b/csrc/utils.cpp @@ -43,15 +43,13 @@ std::string debug_str(const at::Tensor& tensor) { std::stringstream ss; ss << "Tensor:"; ss << " shape: " << tensor.sizes(); - - if (!tensor.is_contiguous()) { - ss << ", strides: " << tensor.strides(); - } - ss << ", dtype: " << tensor.dtype(); ss << ", device: " << tensor.device(); ss << ", pointer: " << reinterpret_cast(tensor.data_ptr()); + if (!tensor.is_contiguous()) { + ss << ", strides: " << tensor.strides(); + } return ss.str(); } From ae81a733a40bd9c356ca974b97d57933d0a3f7bb Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sat, 1 Mar 2025 15:11:14 -0800 Subject: [PATCH 21/35] Fix serialization. 
--- .gitignore | 2 + csrc/runtime/executor.cpp | 68 +++++++++++++++++++++++++------ csrc/runtime/executor.h | 3 +- csrc/serde/fusion_cache.fbs | 6 ++- tests/python/test_schedule_ops.py | 3 ++ tools/run_nvfuser_tests.py | 2 +- 6 files changed, 69 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 81bb26635d5..27b73f61ffa 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,5 @@ foo.bin # Mac OS internal file .DS_Store + +test_log* \ No newline at end of file diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index a80e90a1bb0..7cc816e4f31 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -771,6 +771,7 @@ void KernelExecutor::initializeExecutorEntry( if (alias_info.type == AllocationType::New) { continue; } + NVF_ERROR(alias_info.aliased_io, "Alias info is not an input or output"); auto aliased_to = alias_info.aliased_io->as(); auto aliased_to_idx = std::find( @@ -779,10 +780,12 @@ void KernelExecutor::initializeExecutorEntry( if (aliased_to_idx < (int64_t)fusion->inputs().size()) { output_aliased_to_input[output_idx] = aliased_to_idx; } else { + auto aliased_out = std::find( + fusion->outputs().begin(), fusion->outputs().end(), aliased_to); + NVF_ERROR(aliased_out != fusion->outputs().end(), "Alias not found"); output_aliased_to_output[output_idx] = - std::find( - fusion->outputs().begin(), fusion->outputs().end(), aliased_to) - - fusion->outputs().begin(); + aliased_out - fusion->outputs().begin(); + NVF_ERROR( output_aliased_to_output[output_idx] < (int)fusion->outputs().size(), "Alias found but is not an output or input of the fusion. 
", @@ -1014,6 +1017,9 @@ void KernelExecutor::computeArgs2( entry.args[arg_idx] = bytes; entry.arg_ptrs[arg_idx] = entry.args[arg_idx].data(); } else { + if(args[arg_idx].is()){ + buffer_info_idx++; + } auto bytes = polymorphicValueToBytes( args[arg_idx], compiled_kernel_->kernel()->parameters()[arg_idx]->dtype(), @@ -1613,8 +1619,9 @@ flatbuffers::Offset KernelExecutor::serialize( ? -1 : std::distance( compiledKernel()->kernel()->outputs().cbegin(), tv_iter); + NVF_ERROR(tv_position != -1, "Output TensorView not found in kernel outputs"); outputs_fb.push_back( - serialize(builder, buffer, tv_position, true /* is_fusion_output */)); + serialize(builder, buffer, tv_position, true /* is_fusion_output */, false /* is_fusion_input */)); } // Serialize GlobalBufferInfo for intermediates. @@ -1638,23 +1645,44 @@ flatbuffers::Offset KernelExecutor::serialize( : std::distance( compiledKernel()->kernel()->summary().global_allocations.cbegin(), tv_iter); + NVF_ERROR(tv_position != -1, "Intermediate TensorView not found in kernel global allocations"); intermediates_fb.push_back( - serialize(builder, buffer, tv_position, false /* is_fusion_output */)); + serialize(builder, buffer, tv_position, false /* is_fusion_output */, false /* is_fusion_input */)); } + + std::vector inputs_fb; + inputs_fb.reserve(data.inputs.size()); + for (const auto& buffer : data.inputs) { + auto tv_iter = std::find( + compiledKernel()->kernel()->inputs().cbegin(), + compiledKernel()->kernel()->inputs().cend(), + buffer.tv); + auto tv_position = (tv_iter == compiledKernel()->kernel()->inputs().cend()) + ? 
-1 + : std::distance( + compiledKernel()->kernel()->inputs().cbegin(), tv_iter); + NVF_ERROR(tv_position != -1, "Input TensorView not found in kernel inputs"); + inputs_fb.push_back( + serialize(builder, buffer, tv_position, false /* is_fusion_output */, true /* is_fusion_input */)); + } return serde::CreateKernelExecutorEntryDirect( builder, data.init, data.launch_params.serialize(builder), &outputs_fb, - &intermediates_fb); + &intermediates_fb, + &inputs_fb, + &data.output_aliased_to_input, + &data.output_aliased_to_output); } flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder, const GlobalBufferInfo& data, int64_t tv_position, - bool is_fusion_output) const { + bool is_fusion_output, + bool is_fusion_input) const { // See table definition for GlobalBufferInfo in serde/fusion_cache.fbs return serde::CreateGlobalBufferInfoDirect( builder, @@ -1668,7 +1696,8 @@ flatbuffers::Offset KernelExecutor::serialize( data.zero_init, data.resets_to_zero, data.is_profile_buffer, - is_fusion_output); + is_fusion_output, + is_fusion_input); } void KernelExecutor::deserialize( @@ -1682,7 +1711,6 @@ void KernelExecutor::deserialize( int64_t runtime_id, int64_t group_id) { // See table definition for KernelExecutor in serde/fusion_cache.fbs - NVF_ERROR(buffer != nullptr, "serde::KernelExecutor is nullptr."); NVF_ERROR(_fusion != nullptr, "Fusion is nullptr."); @@ -1755,6 +1783,18 @@ KernelExecutorEntry KernelExecutor::deserialize( entry.intermediates.push_back(deserialize(intermediate_buffer)); } + for (auto input_buffer : *buffer->inputs()) { + entry.inputs.push_back(deserialize(input_buffer)); + } + + for (auto output_aliased_to_input : *buffer->output_aliased_to_input()) { + entry.output_aliased_to_input.push_back(output_aliased_to_input); + } + + for (auto output_aliased_to_output : *buffer->output_aliased_to_output()) { + entry.output_aliased_to_output.push_back(output_aliased_to_output); + } + return entry; } @@ -1765,7 +1805,7 @@ 
GlobalBufferInfo KernelExecutor::deserialize( NVF_ERROR(buffer != nullptr, "serde::GlobalBufferInfo is nullptr."); NVF_ERROR( - buffer->tv() != -1, "Serialization failed to encode buffer tv position."); + buffer->tv_pos() != -1, "Serialization failed to encode buffer tv position."); NVF_ERROR( compiled_kernel_->lowered() != nullptr, @@ -1773,12 +1813,16 @@ GlobalBufferInfo KernelExecutor::deserialize( GlobalBufferInfo info; if (buffer->is_fusion_output()) { - auto out_val = compiled_kernel_->kernel()->outputs().at(buffer->tv()); + auto out_val = compiled_kernel_->kernel()->outputs().at(buffer->tv_pos()); NVF_ERROR(out_val != nullptr); info.tv = dynamic_cast(out_val); + } else if(buffer->is_fusion_input()) { + auto in_val = compiled_kernel_->kernel()->inputs().at(buffer->tv_pos()); + NVF_ERROR(in_val != nullptr); + info.tv = dynamic_cast(in_val); } else { auto out_val = compiled_kernel_->kernel()->summary().global_allocations.at( - buffer->tv()); + buffer->tv_pos()); NVF_ERROR(out_val != nullptr); info.tv = dynamic_cast(out_val->buffer()); } diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index 2fa54473a12..522d2b49f1f 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -276,7 +276,8 @@ class KernelExecutor : public ExecutorAbstract { flatbuffers::FlatBufferBuilder& builder, const GlobalBufferInfo& data, int64_t tv_position, - bool is_fusion_output) const; + bool is_fusion_output, + bool is_fusion_input) const; //! Deserialize GlobalBufferInfo using flatbuffers GlobalBufferInfo deserialize(const serde::GlobalBufferInfo* buffer); diff --git a/csrc/serde/fusion_cache.fbs b/csrc/serde/fusion_cache.fbs index 3ff755b2c15..079c6e88152 100644 --- a/csrc/serde/fusion_cache.fbs +++ b/csrc/serde/fusion_cache.fbs @@ -214,7 +214,7 @@ table LaunchParams { // For output tensors, we use its position in the fusion outputs. // For intermediate tensors, we use its position in the KernelSummary global_allocations. 
table GlobalBufferInfo { - tv : long = -1; + tv_pos : long = -1; logical_sizes : [long]; logical_strides : [long]; unsharded_logical_sizes : [long]; @@ -225,6 +225,7 @@ table GlobalBufferInfo { resets_to_zero : bool; is_profile_buffer : bool; is_fusion_output : bool; + is_fusion_input : bool; } // This table describes the cached KernelExecutorEntry for a kernel. @@ -233,6 +234,9 @@ table KernelExecutorEntry { launch_params : LaunchParams; outputs : [GlobalBufferInfo]; intermediates : [GlobalBufferInfo]; + inputs : [GlobalBufferInfo]; + output_aliased_to_input : [int]; + output_aliased_to_output : [int]; } // ===================================================================================== diff --git a/tests/python/test_schedule_ops.py b/tests/python/test_schedule_ops.py index 640753a2a5d..3c59ff37621 100644 --- a/tests/python/test_schedule_ops.py +++ b/tests/python/test_schedule_ops.py @@ -1012,6 +1012,9 @@ def schedule(self): self.assertEqual(nvf_out[1], inputs[4]) self.assertEqual(nvf_out[2], torch_ref) + @pytest.mark.skip( + reason="Disable test, the scheduler is not actually sending to ExprEvalExec but is sending to KernelExecutor which will correctly error." + ) def test_matmul_auto_scheduler(self): """ Implement a simple matmul kernel with a user defined schedule diff --git a/tools/run_nvfuser_tests.py b/tools/run_nvfuser_tests.py index 31c1067f37d..5292273f277 100644 --- a/tools/run_nvfuser_tests.py +++ b/tools/run_nvfuser_tests.py @@ -64,7 +64,7 @@ def get_python_tests(python_test_dir): def get_test_timeout(test_name): """Return timeout in seconds for a given test""" - if test_name in ["test_nvfuser", "test_matmul", "test_ops"]: + if test_name in ["test_nvfuser", "test_matmul", "test_ops.py"]: return 3600 # 1 hour return 600 # 10 minutes From 378d2511c0dce903d15a0babd3b3a20e2e75375e Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sun, 2 Mar 2025 05:25:39 -0800 Subject: [PATCH 22/35] Fix serialization. 
--- .gitignore | 2 +- csrc/runtime/executor.cpp | 59 +++++++++++++++++++++---------- csrc/serde/fusion_cache.fbs | 3 ++ tests/python/test_schedule_ops.py | 2 +- 4 files changed, 45 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index 27b73f61ffa..89d7c587c4b 100644 --- a/.gitignore +++ b/.gitignore @@ -52,4 +52,4 @@ foo.bin # Mac OS internal file .DS_Store -test_log* \ No newline at end of file +test_log* diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 7cc816e4f31..6e319679dea 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -781,11 +781,11 @@ void KernelExecutor::initializeExecutorEntry( output_aliased_to_input[output_idx] = aliased_to_idx; } else { auto aliased_out = std::find( - fusion->outputs().begin(), fusion->outputs().end(), aliased_to); + fusion->outputs().begin(), fusion->outputs().end(), aliased_to); NVF_ERROR(aliased_out != fusion->outputs().end(), "Alias not found"); output_aliased_to_output[output_idx] = aliased_out - fusion->outputs().begin(); - + NVF_ERROR( output_aliased_to_output[output_idx] < (int)fusion->outputs().size(), "Alias found but is not an output or input of the fusion. ", @@ -1017,7 +1017,7 @@ void KernelExecutor::computeArgs2( entry.args[arg_idx] = bytes; entry.arg_ptrs[arg_idx] = entry.args[arg_idx].data(); } else { - if(args[arg_idx].is()){ + if (args[arg_idx].is()) { buffer_info_idx++; } auto bytes = polymorphicValueToBytes( @@ -1287,6 +1287,7 @@ std::vector KernelExecutor::run( has_dynamic_alias_); outputs.reserve(outputs_args.size()); if (has_dynamic_alias_) { + // TODO: Make sure dynamic alias works. 
for (const auto i : c10::irange(compiled_kernel_->kernel()->outputs().size())) { auto param = compiled_kernel_->kernel()->outputs()[i]; @@ -1402,8 +1403,7 @@ std::vector KernelExecutor::run( // std::cout << "Resolving TMA" << std::endl; args = resolveTMA(*executor_entry, args); // std::cout << "TMA resolved" << std::endl; - } - if (has_rng_) { + } else if (has_rng_) { // Resolving RNG seed requires evaluating and adding those values, but // doesn't require binding all values as getting RNG seed and offset // doesn't depend on other values @@ -1550,7 +1550,10 @@ flatbuffers::Offset KernelExecutor::serialize( &executor_entry_lookup_keys_fb, &executor_entry_lookup_values_fb, toUnderlying(compiledKernel()->kernel()->indexType()), - serialize(builder, compiledKernel()->cudaExecutable().get())); + serialize(builder, compiledKernel()->cudaExecutable().get()), + has_rng_, + has_TMA_, + has_dynamic_alias_); } flatbuffers::Offset KernelExecutor::serialize( @@ -1619,9 +1622,14 @@ flatbuffers::Offset KernelExecutor::serialize( ? -1 : std::distance( compiledKernel()->kernel()->outputs().cbegin(), tv_iter); - NVF_ERROR(tv_position != -1, "Output TensorView not found in kernel outputs"); - outputs_fb.push_back( - serialize(builder, buffer, tv_position, true /* is_fusion_output */, false /* is_fusion_input */)); + NVF_ERROR( + tv_position != -1, "Output TensorView not found in kernel outputs"); + outputs_fb.push_back(serialize( + builder, + buffer, + tv_position, + true /* is_fusion_output */, + false /* is_fusion_input */)); } // Serialize GlobalBufferInfo for intermediates. 
@@ -1645,12 +1653,17 @@ flatbuffers::Offset KernelExecutor::serialize( : std::distance( compiledKernel()->kernel()->summary().global_allocations.cbegin(), tv_iter); - NVF_ERROR(tv_position != -1, "Intermediate TensorView not found in kernel global allocations"); - intermediates_fb.push_back( - serialize(builder, buffer, tv_position, false /* is_fusion_output */, false /* is_fusion_input */)); + NVF_ERROR( + tv_position != -1, + "Intermediate TensorView not found in kernel global allocations"); + intermediates_fb.push_back(serialize( + builder, + buffer, + tv_position, + false /* is_fusion_output */, + false /* is_fusion_input */)); } - std::vector inputs_fb; inputs_fb.reserve(data.inputs.size()); for (const auto& buffer : data.inputs) { @@ -1660,11 +1673,14 @@ flatbuffers::Offset KernelExecutor::serialize( buffer.tv); auto tv_position = (tv_iter == compiledKernel()->kernel()->inputs().cend()) ? -1 - : std::distance( - compiledKernel()->kernel()->inputs().cbegin(), tv_iter); + : std::distance(compiledKernel()->kernel()->inputs().cbegin(), tv_iter); NVF_ERROR(tv_position != -1, "Input TensorView not found in kernel inputs"); - inputs_fb.push_back( - serialize(builder, buffer, tv_position, false /* is_fusion_output */, true /* is_fusion_input */)); + inputs_fb.push_back(serialize( + builder, + buffer, + tv_position, + false /* is_fusion_output */, + true /* is_fusion_input */)); } return serde::CreateKernelExecutorEntryDirect( builder, @@ -1761,6 +1777,10 @@ void KernelExecutor::deserialize( buffer->executor_entry_lookup_keys()->Get(idx), deserialize(buffer->executor_entry_lookup_values()->Get(idx))); } + + has_rng_ = buffer->has_rng(); + has_TMA_ = buffer->has_TMA(); + has_dynamic_alias_ = buffer->has_dynamic_alias(); } KernelExecutorEntry KernelExecutor::deserialize( @@ -1805,7 +1825,8 @@ GlobalBufferInfo KernelExecutor::deserialize( NVF_ERROR(buffer != nullptr, "serde::GlobalBufferInfo is nullptr."); NVF_ERROR( - buffer->tv_pos() != -1, "Serialization failed to 
encode buffer tv position."); + buffer->tv_pos() != -1, + "Serialization failed to encode buffer tv position."); NVF_ERROR( compiled_kernel_->lowered() != nullptr, @@ -1816,7 +1837,7 @@ GlobalBufferInfo KernelExecutor::deserialize( auto out_val = compiled_kernel_->kernel()->outputs().at(buffer->tv_pos()); NVF_ERROR(out_val != nullptr); info.tv = dynamic_cast(out_val); - } else if(buffer->is_fusion_input()) { + } else if (buffer->is_fusion_input()) { auto in_val = compiled_kernel_->kernel()->inputs().at(buffer->tv_pos()); NVF_ERROR(in_val != nullptr); info.tv = dynamic_cast(in_val); diff --git a/csrc/serde/fusion_cache.fbs b/csrc/serde/fusion_cache.fbs index 079c6e88152..c39b76bbad2 100644 --- a/csrc/serde/fusion_cache.fbs +++ b/csrc/serde/fusion_cache.fbs @@ -379,6 +379,9 @@ table KernelExecutor { // Is this kernel being compiled with int32 or int64 indexing? index_type : long; compiled_kernel: CudaKernel; + has_rng: bool; + has_TMA: bool; + has_dynamic_alias: bool; } // A directed edge on DAG, which wraps a value that connects segmented groups. diff --git a/tests/python/test_schedule_ops.py b/tests/python/test_schedule_ops.py index 3c59ff37621..a271a8531f3 100644 --- a/tests/python/test_schedule_ops.py +++ b/tests/python/test_schedule_ops.py @@ -1013,7 +1013,7 @@ def schedule(self): self.assertEqual(nvf_out[2], torch_ref) @pytest.mark.skip( - reason="Disable test, the scheduler is not actually sending to ExprEvalExec but is sending to KernelExecutor which will correctly error." + reason="Disable test, the scheduler is not actually sending to ExprEvalExec but is sending to KernelExecutor which will correctly error." ) def test_matmul_auto_scheduler(self): """ From 9ba868ebec9ae7d6415c475aecd6412b2f2063d7 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sun, 2 Mar 2025 16:42:35 -0800 Subject: [PATCH 23/35] Clang. 
--- csrc/runtime/allocations.cpp | 12 +++++++----- csrc/runtime/executor.cpp | 16 ++++++++-------- csrc/runtime/executor_kernel_arg.h | 2 +- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 06817136eac..a459cbb09ed 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -403,7 +403,7 @@ KernelArgumentHolder allocateKernelOutputs( } else if ( fusion->getOutputAlias(out_info.tv).type == AllocationType::ReuseBuffer) { - auto inp = args[entry.output_aliased_to_input.at(out_idx)]; + const auto& inp = args[entry.output_aliased_to_input.at(out_idx)]; NVF_ERROR(inp.is(), "Input is not a Tensor"); out_tensors[out_idx] = inp; } else if ( @@ -845,7 +845,7 @@ std::vector getInputBufferInfos( auto allocation_domain = TensorDomain::noReductions( buffer_info.tv->getMaybeAllocationDomain()); std::unordered_map logical_to_allocation_map; - for (int64_t logical_idx : c10::irange(logical_domain.size())) { + for (auto logical_idx : c10::irange(logical_domain.size())) { auto allocation_id = std::find( allocation_domain.begin(), allocation_domain.end(), @@ -853,14 +853,16 @@ std::vector getInputBufferInfos( NVF_ERROR( allocation_id != allocation_domain.end(), "Logical domain and allocation domain have different sets of IterDomains, this is not supported yet."); - logical_to_allocation_map[logical_idx] = + logical_to_allocation_map[(int64_t)logical_idx] = std::distance(allocation_domain.begin(), allocation_id); } std::vector allocation_sizes(allocation_domain.size()); std::vector allocation_strides(allocation_domain.size()); for (auto i : c10::irange(allocation_domain.size())) { - allocation_sizes[i] = logical_sizes[logical_to_allocation_map[i]]; - allocation_strides[i] = logical_strides[logical_to_allocation_map[i]]; + allocation_sizes[i] = + (int64_t)logical_sizes[logical_to_allocation_map[i]]; + allocation_strides[i] = + (int64_t)logical_strides[logical_to_allocation_map[i]]; } 
buffer_info.shape_info.allocation_sizes = allocation_sizes; buffer_info.shape_info.allocation_strides = allocation_strides; diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index facd575f6ed..9a04931f146 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -718,13 +718,13 @@ void KernelExecutor::initializeExecutorEntry( } shape_info.allocation_sizes = alloc_sizes; shape_info.allocation_strides = alloc_strides; - GlobalBufferInfo info( + GlobalBufferInfo info{ input_tv, shape_info, data_type_to_aten(input_tv->dtype()), false, false, - false); + false}; input_info.emplace_back(info); } } @@ -774,18 +774,18 @@ void KernelExecutor::initializeExecutorEntry( } NVF_ERROR(alias_info.aliased_io, "Alias info is not an input or output"); auto aliased_to = alias_info.aliased_io->as(); - auto aliased_to_idx = + auto aliased_to_idx = std::distance( + fusion->inputs().begin(), std::find( - fusion->inputs().begin(), fusion->inputs().end(), aliased_to) - - fusion->inputs().begin(); + fusion->inputs().begin(), fusion->inputs().end(), aliased_to)); if (aliased_to_idx < (int64_t)fusion->inputs().size()) { - output_aliased_to_input[output_idx] = aliased_to_idx; + output_aliased_to_input[(int64_t)output_idx] = aliased_to_idx; } else { auto aliased_out = std::find( fusion->outputs().begin(), fusion->outputs().end(), aliased_to); NVF_ERROR(aliased_out != fusion->outputs().end(), "Alias not found"); output_aliased_to_output[output_idx] = - aliased_out - fusion->outputs().begin(); + std::distance(fusion->outputs().begin(), aliased_out); NVF_ERROR( output_aliased_to_output[output_idx] < (int)fusion->outputs().size(), @@ -1183,7 +1183,7 @@ KernelArgumentHolder KernelExecutor::resolveTMA( compiled_kernel_->kernel()->outputs()[out_idx], args[arg_idx++]); } - for (auto intermediate_entry : entry.intermediates) { + for (const auto& intermediate_entry : entry.intermediates) { expr_eval.bind(intermediate_entry.tv, args[arg_idx++]); } diff --git 
a/csrc/runtime/executor_kernel_arg.h b/csrc/runtime/executor_kernel_arg.h index 6e3852549e6..a8a7fb110de 100644 --- a/csrc/runtime/executor_kernel_arg.h +++ b/csrc/runtime/executor_kernel_arg.h @@ -22,7 +22,7 @@ namespace nvfuser { -class GlobalBufferInfo; +struct GlobalBufferInfo; //! KernelArgumentHolder copies meta information from kernel inputs, including //! tensor sizes/shapes/dtype/memory_ptr and copies scalar inputs. It is used From 32a87fb5820e3956d8d5213f749d91a7b1c76a52 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sun, 2 Mar 2025 16:57:15 -0800 Subject: [PATCH 24/35] Remove output to output aliasing, it's not used. --- csrc/runtime/allocations.cpp | 2 +- csrc/runtime/executor.cpp | 26 ++++++-------------------- csrc/runtime/executor.h | 1 - csrc/serde/fusion_cache.fbs | 1 - 4 files changed, 7 insertions(+), 23 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index a459cbb09ed..c3ef950d563 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -197,7 +197,7 @@ std::pair, std::vector> inferShapeOfIntermediate( return inferShape(tv, symbolic_sizes, expand_flags, expr_eval); } -bool fill_allocation_with_nan_ = false; +static bool fill_allocation_with_nan_ = false; bool shouldFillAllocationWithNan() { return fill_allocation_with_nan_; diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 9a04931f146..24b652ff323 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -763,7 +763,6 @@ void KernelExecutor::initializeExecutorEntry( } std::vector output_aliased_to_input(output_info.size(), -1); - std::vector output_aliased_to_output(output_info.size(), -1); for (auto output_idx : c10::irange(output_info.size())) { auto out_info = output_info[output_idx]; @@ -783,19 +782,12 @@ void KernelExecutor::initializeExecutorEntry( } else { auto aliased_out = std::find( fusion->outputs().begin(), fusion->outputs().end(), aliased_to); - NVF_ERROR(aliased_out != 
fusion->outputs().end(), "Alias not found"); - output_aliased_to_output[output_idx] = - std::distance(fusion->outputs().begin(), aliased_out); - NVF_ERROR( - output_aliased_to_output[output_idx] < (int)fusion->outputs().size(), - "Alias found but is not an output or input of the fusion. ", - "Aliased to tv: ", - aliased_to->toString(), - "\nFusion Inputs:\n ", - fusion->inputs(), - "\nFusion Outputs:\n ", - fusion->outputs()); + aliased_out != fusion->outputs().end(), + "Could not find the alias tensor of: ", + out_info.tv->toString(), + "\nAliased to: ", + aliased_to->toString()); NVF_THROW( "Kernel found with output to output aliasing, this is unsupported at this moment.\n", "Output: ", @@ -811,7 +803,6 @@ void KernelExecutor::initializeExecutorEntry( executor_entry.launch_params = launch_params; executor_entry.outputs = output_info; executor_entry.output_aliased_to_input = output_aliased_to_input; - executor_entry.output_aliased_to_output = output_aliased_to_output; executor_entry.intermediates = intermediates; executor_entry.inputs = input_info; executor_entry.init = true; @@ -1681,8 +1672,7 @@ flatbuffers::Offset KernelExecutor::serialize( &outputs_fb, &intermediates_fb, &inputs_fb, - &data.output_aliased_to_input, - &data.output_aliased_to_output); + &data.output_aliased_to_input); } flatbuffers::Offset KernelExecutor::serialize( @@ -1803,10 +1793,6 @@ KernelExecutorEntry KernelExecutor::deserialize( entry.output_aliased_to_input.push_back(output_aliased_to_input); } - for (auto output_aliased_to_output : *buffer->output_aliased_to_output()) { - entry.output_aliased_to_output.push_back(output_aliased_to_output); - } - return entry; } diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index 74547d2bf86..ffe4d1a37da 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -70,7 +70,6 @@ struct KernelExecutorEntry { // If an output is aliased to an input, this will hold the index of the // input that it is aliased to. 
If not aliased, it will hold -1. std::vector output_aliased_to_input; - std::vector output_aliased_to_output; // Temporary work buffers and intemediate global-memory tensors std::vector intermediates; std::vector inputs; diff --git a/csrc/serde/fusion_cache.fbs b/csrc/serde/fusion_cache.fbs index c39b76bbad2..d7ffdeba45d 100644 --- a/csrc/serde/fusion_cache.fbs +++ b/csrc/serde/fusion_cache.fbs @@ -236,7 +236,6 @@ table KernelExecutorEntry { intermediates : [GlobalBufferInfo]; inputs : [GlobalBufferInfo]; output_aliased_to_input : [int]; - output_aliased_to_output : [int]; } // ===================================================================================== From cec8ab3ed348ee56124b8aada467713fe6c0c70b Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sun, 2 Mar 2025 18:17:36 -0800 Subject: [PATCH 25/35] Update HostIR Exec and remove duplicate allocation code. --- csrc/host_ir/executor.cpp | 69 ++++++++++------ csrc/runtime/allocations.cpp | 141 +++----------------------------- csrc/runtime/allocations.h | 22 +---- csrc/runtime/executor.cpp | 62 +++++--------- csrc/runtime/executor.h | 2 +- csrc/runtime/executor_utils.cpp | 38 +++++++++ csrc/runtime/executor_utils.h | 5 ++ csrc/serde/fusion_cache.fbs | 2 +- 8 files changed, 123 insertions(+), 218 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index b726918d375..5465a02b09b 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -86,21 +86,6 @@ bool HostIrExecutor::isCompiled() const { return (bool)host_ir_container_; } -namespace { -// Host IR specific function, returns the at:Tensor (ordered list) associated -// with the provdied Fusion output tv -at::Tensor findBufferForFusionOutput( - const KernelArgumentHolder& output_args, - const Val* fusion_out, - const Fusion* fusion) { - auto i = - std::find(fusion->outputs().begin(), fusion->outputs().end(), fusion_out); - NVF_ERROR(i != fusion->outputs().end()); - auto index = 
std::distance(fusion->outputs().begin(), i); - return output_args[index].as(); -} -} // namespace - KernelArgumentHolder HostIrExecutor::run( KernelArgumentHolder& args, KernelArgumentHolder output_args) { @@ -120,13 +105,18 @@ KernelArgumentHolder HostIrExecutor::run( auto expr_eval = executor_utils::bindInputs(args, host_ir_container_.get()); if (output_args.empty()) { - std::vector output_info = getBufferInfos( + std::vector output_infos = getBufferInfos( expr_eval, PrimDataType::Int, host_ir_container_->outputs()); - output_args = allocateOutputs( + output_args.resize(host_ir_container_->outputs().size()); + auto output_alias_to_input = + executor_utils::getOutputAliasToInputMap(host_ir_container_.get()); + output_args = allocateKernelOutputs( host_ir_container_.get(), - output_info, + output_infos, + output_alias_to_input, c10::Device(c10::DeviceType::CUDA, args.getDeviceIndex()), - expr_eval); + args, + true); } // TODO: If outputs are provided validate they're the correct size @@ -136,8 +126,18 @@ KernelArgumentHolder HostIrExecutor::run( c10d::Backend* backend = communicator_->getBackendForTeam(communication->team(), std::nullopt); auto in_tensor = expr_eval.evaluate(communication->in()).as(); - at::Tensor out_tensor = findBufferForFusionOutput( - output_args, communication->out(), host_ir_container_.get()); + auto out_idx = std::distance( + host_ir_container_->outputs().begin(), + std::find( + host_ir_container_->outputs().begin(), + host_ir_container_->outputs().end(), + communication->out())); + + NVF_ERROR( + out_idx < (int64_t)host_ir_container_->outputs().size(), + "Output tensor not found in fusion outputs"); + auto out_tensor = output_args[out_idx].as(); + c10::intrusive_ptr work = postSingleCommunication( communication, communicator_->deviceId(), @@ -148,6 +148,19 @@ KernelArgumentHolder HostIrExecutor::run( work->wait(); } } + + // Evaluate outputs that are marked as Evaluate + for (auto out_idx : c10::irange(host_ir_container_->outputs().size())) 
{ + auto out = host_ir_container_->outputs()[out_idx]; + auto alias_info = host_ir_container_->getOutputAlias(out); + if (alias_info.type == AllocationType::Evaluate) { + NVF_ERROR( + !output_args[out_idx].hasValue(), + "Output tensor already has a value"); + output_args[out_idx] = expr_eval.evaluate(out); + } + } + if (isProfilerEnabled()) { FusionProfiler::segment(group_id_).setDevice(args.getDeviceIndex()); FusionProfiler::segment(group_id_).stopKernel(); @@ -572,13 +585,21 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { "Allocation must be on a TensorView but got ", allocate->buffer()); TensorView* tv = allocate->buffer()->as(); + if (expr_evaluator_.isKnown(tv)) { + return; + } GlobalBufferInfo info = getBufferInfos(expr_evaluator_, PrimDataType::Int, {tv}).at(0); - AliasInfo alias_info = { - .type = AllocationType::New, .aliased_io = nullptr, .hide_output = false}; c10::Device device = communicator_ ? communicator_->device() : at::Device("cuda:0"); - at::Tensor tensor = allocateTensor(info, alias_info, device, expr_evaluator_); + auto tensor = at::native::empty_strided_cuda( + info.shape_info.logical_sizes, + info.shape_info.logical_strides, + info.type, + c10::nullopt, + device, + c10::nullopt); + expr_evaluator_.bind(tv, tensor); } diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index c3ef950d563..770d8a73a52 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -254,141 +254,20 @@ void fillTensorWithNan(at::Tensor& t) { } } -at::Tensor allocateTensor( - const GlobalBufferInfo& out_info, - const AliasInfo& alias_info, - const c10::Device& device, - ExpressionEvaluator& ee) { - FUSER_PERF_SCOPE("fusion_executor::allocations::allocateTensor"); - // Handle a fusion with duplicated outputs. 
- TensorView* out_tv = out_info.tv; - if (ee.isKnown(out_tv)) { - return ee.evaluate(out_tv).as(); - } - - std::optional aliased_io_tensor = std::nullopt; - Val* aliased_io = alias_info.aliased_io; - if (aliased_io != nullptr) { - NVF_ERROR( - aliased_io->isFusionInput() || aliased_io->isFusionOutput(), - aliased_io->toInlineString(), - " is expected to be a fusion input/output. `ee.evaluate` ", - "an intermediate tensor may involve GPU computation to materialize it ", - "to global memory."); - const PolymorphicValue& aliased_io_val = ee.evaluate(aliased_io); - NVF_ERROR( - aliased_io_val.is(), - "Alias io only supports tensor. Found ", - PolymorphicValue_functions::toString(aliased_io_val)); - aliased_io_tensor = aliased_io_val.as(); - } - - switch (alias_info.type) { - case AllocationType::New: { - auto alloc_tensor = at::native::empty_strided_cuda( - out_info.shape_info.logical_sizes, - out_info.shape_info.logical_strides, - out_info.type, - c10::nullopt, - device, - c10::nullopt); - if (shouldFillAllocationWithNan()) { - fillTensorWithNan(alloc_tensor); - } - return alloc_tensor; - } - case AllocationType::ReuseBuffer: - // Unlike for `AllocationType::Evaluate`, don't use - // ExpressionEvaluator to compute the output tensor. This is because - // the output tensor may hold different data from the input, e.g., an - // updated running mean. `ExpressionEvaluator::evaluate(out_tv)` - // would trigger non-trivial host computation. 
- return aliased_io_tensor.value(); - case AllocationType::Evaluate: { - auto out_tensor = ee.evaluate(out_tv).as(); - if (aliased_io_tensor.has_value()) { - NVF_ERROR( - out_tensor.is_alias_of(aliased_io_tensor.value()), - "ExpressionEvaluator failed to evaluate ", - out_tv->toString(), - " as an alias of ", - aliased_io->toString()); - inferAndValidateAllocationSizesAndStrides(out_tensor, out_tv, ee); - } - return out_tensor; - } - default: - NVF_THROW("Unrecognized AllocationType."); - } -} - -KernelArgumentHolder allocateOutputs( - const Fusion* fusion, - const std::vector& output_info, - const c10::Device& device, - ExpressionEvaluator& ee) { - FUSER_PERF_SCOPE("fusion_executor::allocations::allocateOutputs"); - - const auto num_outs = output_info.size(); - - // Sort the outputs so we compute aliases after allocating non-aliases. The - // order between aliases can be arbitrary. E.g., - // - // ``` - // non_alias_out = ... - // alias_out_0 = reshape(non_alias_out, ...) - // alias_out_1 = reshape(alias_out_0, ...) - // ``` - // - // It's fine to compute `alias_out_1` before computing `alias_out_0`: when we - // compute `alias_out_1`, `alias_out_0` will be recursively - // `ExpressionEvaluator::evaluate`ed. However, `non_alias_out` must be - // allocated first so `alias_out_*` can refer them. 
- std::vector> sorted_outs; - sorted_outs.reserve(num_outs); - for (const auto out_index : c10::irange(num_outs)) { - sorted_outs.emplace_back(out_index, fusion->outputs()[out_index]); - } - std::sort( - sorted_outs.begin(), - sorted_outs.end(), - [fusion]( - const std::pair& lhs, - const std::pair& rhs) { - return ( - fusion->getOutputAlias(lhs.second).type == AllocationType::New && - fusion->getOutputAlias(rhs.second).type != AllocationType::New); - }); - - std::vector out_tensors(num_outs); - for (const auto& [out_index, out] : sorted_outs) { - at::Tensor out_tensor = allocateTensor( - output_info[out_index], fusion->getOutputAlias(out), device, ee); - // Bind `out_tensor` so - // 1. duplicated outputs map to the same tensor, - // 2. an output that aliases another output can be evaluated via - // ExpressionEvaluator cheaply. - ee.bind(out, out_tensor); - out_tensors[out_index] = out_tensor; - } - return KernelArgumentHolder(out_tensors); -} - KernelArgumentHolder allocateKernelOutputs( const Fusion* fusion, - const KernelExecutorEntry& entry, + const std::vector& output_infos, + const std::vector& output_alias_to_input_map, const c10::Device& device, const KernelArgumentHolder& args, - bool dynamic_alias) { + bool dynamic_evaluate) { FUSER_PERF_SCOPE("fusion_executor::allocations::allocateOutputs"); - // TODO: Figure out if output to output aliasing is needed - KernelArgumentHolder out_tensors; - out_tensors.resize(entry.outputs.size()); - for (auto out_idx : c10::irange(entry.outputs.size())) { - auto out_info = entry.outputs.at(out_idx); - if (entry.output_aliased_to_input.at(out_idx) == -1) { + out_tensors.resize(output_infos.size()); + for (auto out_idx : c10::irange(output_infos.size())) { + auto out_info = output_infos.at(out_idx); + if (output_alias_to_input_map.at(out_idx) == -1) { auto alloc_tensor = at::native::empty_strided_cuda( out_info.shape_info.logical_sizes, out_info.shape_info.logical_strides, @@ -403,12 +282,12 @@ KernelArgumentHolder 
allocateKernelOutputs( } else if ( fusion->getOutputAlias(out_info.tv).type == AllocationType::ReuseBuffer) { - const auto& inp = args[entry.output_aliased_to_input.at(out_idx)]; + const auto& inp = args[output_alias_to_input_map.at(out_idx)]; NVF_ERROR(inp.is(), "Input is not a Tensor"); out_tensors[out_idx] = inp; } else if ( fusion->getOutputAlias(out_info.tv).type == AllocationType::Evaluate) { - if (dynamic_alias) { + if (dynamic_evaluate) { out_tensors[out_idx] = std::monostate(); continue; } @@ -416,7 +295,7 @@ KernelArgumentHolder allocateKernelOutputs( ExpressionEvaluator ee; ee.bind( fusion->getOutputAlias(out_info.tv).aliased_io, - args[entry.output_aliased_to_input.at(out_idx)]); + args[output_alias_to_input_map.at(out_idx)]); out_tensors[out_idx] = ee.evaluate(out_info.tv); } else { NVF_THROW( diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 72fea232ab1..71a3b46ddbb 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -75,32 +75,18 @@ TensorShapeInfo inferTensorShapes( TensorView* tv, const ExpressionEvaluator& expr_eval); -// Allocate an `at::Tensor` for `out_info` or compute it as an alias. -at::Tensor allocateTensor( - const GlobalBufferInfo& out_info, - const AliasInfo& alias_info, - const c10::Device& device, - ExpressionEvaluator& ee); - -// Allocate output tensors for a given fusion. Outputs may alias inputs, in -// that case output tensors are shallow copies of the aliased inputs -KernelArgumentHolder allocateOutputs( - const Fusion* fusion, - const std::vector& output_info, - const c10::Device& device, - ExpressionEvaluator& ee); - // Allocate output tensors for a given fusion. Outputs may alias inputs, in // that case output tensors are shallow copies of the aliased inputs. 
// -// If dynamic_alias is true, then any argument with AllocationType::Evaluate +// If dynamic_evaluate is true, then any argument with AllocationType::Evaluate // will not be populated, it will be filled with std::monostate. KernelArgumentHolder allocateKernelOutputs( const Fusion* fusion, - const KernelExecutorEntry& entry, + const std::vector& output_infos, + const std::vector& output_alias_to_input_map, const c10::Device& device, const KernelArgumentHolder& args, - bool dynamic_alias = false); + bool dynamic_evaluate = false); //! Return information necessary for allocating output tensors. Input //! and output tensors are allowed to alias each other, which is diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 24b652ff323..8340db7b33c 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -285,13 +285,22 @@ void KernelExecutor::compile( for (auto expr : exprs) { if (ir_utils::isCpAsyncBulk(expr)) { - has_TMA_ = true; + has_tma_ = true; } if (expr->isA()) { has_rng_ = true; } } + // If an output has an alias to an input and is marked Evaluate, then + // expression evaluator evaluate is called on that output to produce the meta + // data manipulation it requires. If that manipulation is something like a + // slice, and that slice has a symbolic integer it depends on, then this + // function returns true. + // + // This could happen for other examples and this function will return true if + // to evaluate the output that has an alias, other values besides the aliased + // input need to be bound to the expression evaluator to evaluate the output. 
for (auto output : fusion->outputs()) { if (output->isA()) { auto out_tv = output->as(); @@ -762,47 +771,13 @@ void KernelExecutor::initializeExecutorEntry( } } - std::vector output_aliased_to_input(output_info.size(), -1); - - for (auto output_idx : c10::irange(output_info.size())) { - auto out_info = output_info[output_idx]; - auto fusion = compiled_kernel_->kernel()->as(); - auto alias_info = fusion->getOutputAlias(out_info.tv); - if (alias_info.type == AllocationType::New) { - continue; - } - NVF_ERROR(alias_info.aliased_io, "Alias info is not an input or output"); - auto aliased_to = alias_info.aliased_io->as(); - auto aliased_to_idx = std::distance( - fusion->inputs().begin(), - std::find( - fusion->inputs().begin(), fusion->inputs().end(), aliased_to)); - if (aliased_to_idx < (int64_t)fusion->inputs().size()) { - output_aliased_to_input[(int64_t)output_idx] = aliased_to_idx; - } else { - auto aliased_out = std::find( - fusion->outputs().begin(), fusion->outputs().end(), aliased_to); - NVF_ERROR( - aliased_out != fusion->outputs().end(), - "Could not find the alias tensor of: ", - out_info.tv->toString(), - "\nAliased to: ", - aliased_to->toString()); - NVF_THROW( - "Kernel found with output to output aliasing, this is unsupported at this moment.\n", - "Output: ", - out_info.tv->toString(), - "\nAliased to: ", - aliased_to->toString()); - } - } - auto intermediates = getIntermediateBufferInfo(expr_eval, index_type); // All information is gathered. 
Save it to KernelExecutorEntry executor_entry.launch_params = launch_params; executor_entry.outputs = output_info; - executor_entry.output_aliased_to_input = output_aliased_to_input; + executor_entry.output_aliased_to_input = + executor_utils::getOutputAliasToInputMap(compiled_kernel_->kernel()); executor_entry.intermediates = intermediates; executor_entry.inputs = input_info; executor_entry.init = true; @@ -1205,7 +1180,7 @@ KernelArgumentHolder KernelExecutor::run( } ExpressionEvaluator expr_eval; - if (has_dynamic_alias_ || has_TMA_) { + if (has_dynamic_alias_ || has_tma_) { expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); } @@ -1273,7 +1248,8 @@ KernelArgumentHolder KernelExecutor::run( if (output_args.empty()) { output_args = allocateKernelOutputs( compiled_kernel_->kernel(), - *executor_entry, + executor_entry->outputs, + executor_entry->output_aliased_to_input, compiled_kernel_->device(), args, has_dynamic_alias_); @@ -1375,11 +1351,11 @@ KernelArgumentHolder KernelExecutor::run( if (args.size() != compiled_kernel_->kernel()->parameters().size()) { NVF_ERROR( - has_TMA_ || has_rng_, + has_tma_ || has_rng_, "No TMA or RNG found in the kernel, but detected an argument size mismatch."); // If args don't match one of two things is happening. We need to add TMA // related args or RNG related args. Resolve these scenarios. 
- if (has_TMA_) { + if (has_tma_) { // Resolving TMA requires binding all values and evaluating the TMA // arguments // std::cout << "Resolving TMA" << std::endl; @@ -1535,7 +1511,7 @@ flatbuffers::Offset KernelExecutor::serialize( toUnderlying(compiledKernel()->kernel()->indexType()), serialize(builder, compiledKernel()->cudaExecutable().get()), has_rng_, - has_TMA_, + has_tma_, has_dynamic_alias_); } @@ -1761,7 +1737,7 @@ void KernelExecutor::deserialize( } has_rng_ = buffer->has_rng(); - has_TMA_ = buffer->has_TMA(); + has_tma_ = buffer->has_tma(); has_dynamic_alias_ = buffer->has_dynamic_alias(); } diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index ffe4d1a37da..bd352dd4fe6 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -319,7 +319,7 @@ class KernelExecutor : public ExecutorAbstract { // Has a TMA kernel and therefore needs to infer TMA inputs through expression // evaluator - bool has_TMA_ = false; + bool has_tma_ = false; // Has a dynamic alias and therefore needs to infer what they are through // expression evaluator diff --git a/csrc/runtime/executor_utils.cpp b/csrc/runtime/executor_utils.cpp index 6e0c2d769a5..e81f2895a31 100644 --- a/csrc/runtime/executor_utils.cpp +++ b/csrc/runtime/executor_utils.cpp @@ -599,6 +599,44 @@ ExpressionEvaluator bindInputs( return expr_eval; } +std::vector getOutputAliasToInputMap(const Fusion* fusion) { + std::vector output_to_input_map(fusion->outputs().size(), -1); + for (auto output_idx : c10::irange(fusion->outputs().size())) { + auto alias_info = fusion->getOutputAlias(fusion->outputs()[output_idx]); + if (alias_info.type == AllocationType::New) { + continue; + } + NVF_ERROR( + alias_info.aliased_io && alias_info.aliased_io->isA(), + "Alias information is missing the aliased tensor."); + + auto aliased_to = alias_info.aliased_io->as(); + auto aliased_to_idx = std::distance( + fusion->inputs().begin(), + std::find( + fusion->inputs().begin(), fusion->inputs().end(), aliased_to)); 
+ if (aliased_to_idx < (int64_t)fusion->inputs().size()) { + output_to_input_map[(int64_t)output_idx] = aliased_to_idx; + } else { + auto aliased_out = std::find( + fusion->outputs().begin(), fusion->outputs().end(), aliased_to); + NVF_ERROR( + aliased_out != fusion->outputs().end(), + "Could not find the alias tensor of: ", + fusion->outputs()[output_idx]->toString(), + "\nAliased to: ", + aliased_to->toString()); + NVF_THROW( + "Kernel found with output to output aliasing, this is unsupported at this moment.\n", + "Output: ", + fusion->outputs()[output_idx]->toString(), + "\nAliased to: ", + aliased_to->toString()); + } + } + return output_to_input_map; +} + CudaExecutable::~CudaExecutable() { if (module != nullptr) { NVFUSER_CUDA_SAFE_CALL(cuModuleUnload(module)); diff --git a/csrc/runtime/executor_utils.h b/csrc/runtime/executor_utils.h index 9d631f2548e..211e612c31c 100644 --- a/csrc/runtime/executor_utils.h +++ b/csrc/runtime/executor_utils.h @@ -51,6 +51,11 @@ struct CudaExecutable : public NonCopyable { NVF_API ExpressionEvaluator bindInputs(const KernelArgumentHolder& args, Fusion* fusion); +// Returns a vector where vector[out_idx] == the input index in fusion->inputs() +// that output[out_idx] is aliased to. If output[out_idx] is not aliased to any +// input, then vector[out_idx] is -1. 
+std::vector getOutputAliasToInputMap(const Fusion* fusion); + // Compile time cache for execution namespace caching { // TODO: Could consider putting some of diff --git a/csrc/serde/fusion_cache.fbs b/csrc/serde/fusion_cache.fbs index d7ffdeba45d..a09c3497e73 100644 --- a/csrc/serde/fusion_cache.fbs +++ b/csrc/serde/fusion_cache.fbs @@ -379,7 +379,7 @@ table KernelExecutor { index_type : long; compiled_kernel: CudaKernel; has_rng: bool; - has_TMA: bool; + has_tma: bool; has_dynamic_alias: bool; } From e05cd7fb320c15b524760d864ef9d6b6ec0696d7 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sun, 2 Mar 2025 18:18:48 -0800 Subject: [PATCH 26/35] Revert test debug changes. --- tests/cpp/test_allocation_domain.cpp | 4 ---- tests/cpp/test_multidevice_sharding.cpp | 13 +++++-------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp index e78425045fd..f736edc1296 100644 --- a/tests/cpp/test_allocation_domain.cpp +++ b/tests/cpp/test_allocation_domain.cpp @@ -1114,11 +1114,7 @@ TEST_F(AllocationDomainTest, ContiguityIssue1021) { fusion->addOutput(tv1); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - // allocate [8,8] strided [8, 1] - // modify to [4,8] strided [1, 8] - // Tell nvFuser it's allocated as [4, 8] strided [8, 1] at::Tensor t0 = at::randn({8, 8}, options).as_strided({4, 8}, {1, 8}); - FusionExecutorCache executor_cache(std::move(fusion)); auto outputs = executor_cache.runFusionWithInputs({t0}); diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp index 7772ae9da9e..5c7a5e43c62 100644 --- a/tests/cpp/test_multidevice_sharding.cpp +++ b/tests/cpp/test_multidevice_sharding.cpp @@ -241,16 +241,13 @@ TEST_F(MultiDeviceTest, DivideBySum) { for (auto* tv : {x, y}) { tv->setAllocationDomain(tv->getLoopDomain(), true); } - x->setContiguity({false, false, true, true, true}); - const int64_t b = 7; - const 
int64_t h = d * 5; - const int64_t s = 3; + + const int64_t b = 2; + const int64_t h = d * 3; + const int64_t s = 5; at::Tensor unsharded_x_tensor = at::randint(5, {b, h, s, s}, tensor_options); + at::Tensor x_tensor = shardTensor(unsharded_x_tensor, x); - // at::Tensor x_tensor = shardTensor(unsharded_x_tensor, x); - at::Tensor x_tensor = at::randint(5, {b, h / d, s, s}, tensor_options); - std::cout << "Properties: " << debug_str(x_tensor) << "\n"; - // std::cout<<"input:"<(); From 7c2ac6507f6098c12ed0bf8174e6373c62d0c1df Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sun, 2 Mar 2025 18:27:13 -0800 Subject: [PATCH 27/35] Rename and remove dead code. --- csrc/host_ir/executor.cpp | 2 +- csrc/runtime/allocations.cpp | 68 +----------------------------------- csrc/runtime/allocations.h | 8 +---- csrc/runtime/executor.cpp | 2 +- 4 files changed, 4 insertions(+), 76 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 5465a02b09b..32c2f5a956e 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -110,7 +110,7 @@ KernelArgumentHolder HostIrExecutor::run( output_args.resize(host_ir_container_->outputs().size()); auto output_alias_to_input = executor_utils::getOutputAliasToInputMap(host_ir_container_.get()); - output_args = allocateKernelOutputs( + output_args = allocateOutputs( host_ir_container_.get(), output_infos, output_alias_to_input, diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 770d8a73a52..83ab60d900d 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -254,7 +254,7 @@ void fillTensorWithNan(at::Tensor& t) { } } -KernelArgumentHolder allocateKernelOutputs( +KernelArgumentHolder allocateOutputs( const Fusion* fusion, const std::vector& output_infos, const std::vector& output_alias_to_input_map, @@ -689,72 +689,6 @@ std::pair, std::vector> inferShapeOfOutput( return {meta_tensor.sizes().vec(), meta_tensor.strides().vec()}; } -std::vector 
getInputBufferInfos( - ExpressionEvaluator& expr_eval, - DataType index_dtype, - const std::vector& fusion_inputs, - const std::vector& inputs) { - NVF_ERROR( - fusion_inputs.size() == inputs.size(), - "Mismatch in inputs provided, expected ", - fusion_inputs.size(), - " but got ", - inputs.size()); - std::vector buffer_infos; - for (auto i : c10::irange(fusion_inputs.size())) { - GlobalBufferInfo buffer_info; - buffer_info.tv = fusion_inputs[i]->as(); - auto logical_sizes = inputs[i].sizes().vec(); - auto logical_strides = inputs[i].strides().vec(); - TensorShapeInfo shape_info; - shape_info.logical_sizes = logical_sizes; - shape_info.logical_strides = logical_strides; - if (isSharded(buffer_info.tv)) { - shape_info.unsharded_logical_sizes = - unshardedSizes(buffer_info.tv, logical_sizes); - } - buffer_info.shape_info = shape_info; - buffer_info.type = inputs[i].scalar_type(); - - // TODO: Handle input allocation domains that aren't permutes - // of the logical domain - if (buffer_info.tv->hasAllocation()) { - auto logical_domain = - TensorDomain::noReductions(buffer_info.tv->getLogicalDomain()); - auto allocation_domain = TensorDomain::noReductions( - buffer_info.tv->getMaybeAllocationDomain()); - std::unordered_map logical_to_allocation_map; - for (auto logical_idx : c10::irange(logical_domain.size())) { - auto allocation_id = std::find( - allocation_domain.begin(), - allocation_domain.end(), - logical_domain[logical_idx]); - NVF_ERROR( - allocation_id != allocation_domain.end(), - "Logical domain and allocation domain have different sets of IterDomains, this is not supported yet."); - logical_to_allocation_map[(int64_t)logical_idx] = - std::distance(allocation_domain.begin(), allocation_id); - } - std::vector allocation_sizes(allocation_domain.size()); - std::vector allocation_strides(allocation_domain.size()); - for (auto i : c10::irange(allocation_domain.size())) { - allocation_sizes[i] = - (int64_t)logical_sizes[logical_to_allocation_map[i]]; - 
allocation_strides[i] = - (int64_t)logical_strides[logical_to_allocation_map[i]]; - } - buffer_info.shape_info.allocation_sizes = allocation_sizes; - buffer_info.shape_info.allocation_strides = allocation_strides; - } else { - buffer_info.shape_info.allocation_sizes = logical_sizes; - buffer_info.shape_info.allocation_strides = logical_strides; - } - - buffer_infos.emplace_back(buffer_info); - } - return buffer_infos; -} - TensorShapeInfo inferTensorShapes( TensorView* tv, const ExpressionEvaluator& expr_eval) { diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 71a3b46ddbb..c1907fc1955 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -80,7 +80,7 @@ TensorShapeInfo inferTensorShapes( // // If dynamic_evaluate is true, then any argument with AllocationType::Evaluate // will not be populated, it will be filled with std::monostate. -KernelArgumentHolder allocateKernelOutputs( +KernelArgumentHolder allocateOutputs( const Fusion* fusion, const std::vector& output_infos, const std::vector& output_alias_to_input_map, @@ -96,10 +96,4 @@ std::vector getBufferInfos( DataType index_dtype, const std::vector& fusion_outputs); -std::vector getInputBufferInfos( - ExpressionEvaluator& expr_eval, - DataType index_dtype, - const std::vector& fusion_outputs, - const std::vector& inputs); - } // namespace nvfuser diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 8340db7b33c..69d0b6a53da 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -1246,7 +1246,7 @@ KernelArgumentHolder KernelExecutor::run( // only allocate outputs when not given if (output_args.empty()) { - output_args = allocateKernelOutputs( + output_args = allocateOutputs( compiled_kernel_->kernel(), executor_entry->outputs, executor_entry->output_aliased_to_input, From 7a320bd2dd45f10187e3f827fbdb7f40c3d3f356 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 3 Mar 2025 07:44:42 -0800 Subject: [PATCH 28/35] Clang. 
--- csrc/runtime/executor_utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/runtime/executor_utils.cpp b/csrc/runtime/executor_utils.cpp index e81f2895a31..8a54d0c30b9 100644 --- a/csrc/runtime/executor_utils.cpp +++ b/csrc/runtime/executor_utils.cpp @@ -616,7 +616,7 @@ std::vector getOutputAliasToInputMap(const Fusion* fusion) { std::find( fusion->inputs().begin(), fusion->inputs().end(), aliased_to)); if (aliased_to_idx < (int64_t)fusion->inputs().size()) { - output_to_input_map[(int64_t)output_idx] = aliased_to_idx; + output_to_input_map[(int64_t)output_idx] = (int64_t)aliased_to_idx; } else { auto aliased_out = std::find( fusion->outputs().begin(), fusion->outputs().end(), aliased_to); From 15b03c581129014be13d49d3d641ae225fcd3417 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 3 Mar 2025 08:04:02 -0800 Subject: [PATCH 29/35] Cleanup, remove dead code. --- benchmarks/cpp/matmul.cpp | 9 +- csrc/runtime/executor.cpp | 159 +-------------------------- csrc/runtime/executor.h | 20 +--- csrc/runtime/executor_kernel_arg.cpp | 40 ------- csrc/runtime/executor_kernel_arg.h | 14 --- 5 files changed, 10 insertions(+), 232 deletions(-) diff --git a/benchmarks/cpp/matmul.cpp b/benchmarks/cpp/matmul.cpp index 0f5159b9cad..5ef718b4a4f 100644 --- a/benchmarks/cpp/matmul.cpp +++ b/benchmarks/cpp/matmul.cpp @@ -175,7 +175,7 @@ static void SingleMatmulBase( // Compile kernel auto launch_constraints = LaunchParams(); KernelExecutor ke; - ke.compile(fusion, args.toC10Array(), launch_constraints, cparams); + ke.compile(fusion, args, launch_constraints, cparams); NVF_CHECK( getBankConflictInfo(ke.compiledKernel()->kernel(), launch_constraints) .empty(), @@ -352,7 +352,7 @@ static void SingleMatmulPartitionedK( // Compile kernel KernelExecutor ke; auto lparams = LaunchParams(); - ke.compile(fusion, args.toC10Array(), lparams, cparams); + ke.compile(fusion, args, lparams, cparams); NVF_CHECK( getBankConflictInfo(ke.compiledKernel()->kernel(), 
lparams).empty(), "Shared memory bank conflict not removed."); @@ -461,10 +461,7 @@ static void NvFuserScheduler_MatmulSplitKReduction( // Compile kernel KernelExecutor ke; ke.compile( - fusion, - args.toC10Array(), - heuristic_params->lparams, - heuristic_params->cparams); + fusion, args, heuristic_params->lparams, heuristic_params->cparams); NVF_CHECK( getBankConflictInfo( diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 69d0b6a53da..8cf1adf4625 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -665,7 +665,6 @@ void KernelExecutor::initializeExecutorEntry( auto launch_params = computeLaunchParams( launch_constraints, expr_eval, warp_size_, index_type); - // NVF_THROW("Stop here"); for (const auto& entry : compiled_kernel_->kernel()->summary().validations) { NVF_CHECK(expr_eval.evaluate(entry.first).as(), entry.second); } @@ -783,101 +782,6 @@ void KernelExecutor::initializeExecutorEntry( executor_entry.init = true; } -/// Copies the data, logical_size, and alloc_stride parameters to the -/// appropriate parts of entry.args[idx]. -/// -/// For GPU tensors, we pass a Tensor struct (see -/// runtime/tensor.cu), where the rank describes the number of elements in the -/// shape and stride arrays. The actual shapes and strides are dynamic, but the -/// type and rank of the tensors are actually static (changing them would need -/// a new FusionDefinition). So we create the storage area for the -/// Tensor during ::computeArgs, and then in this function we just -/// update that memory with the current values for the tensor's base address, -/// shape, and strides. -/// -/// @param entry the entry we have previously setup for this fusion -/// @param idx the index into entry.args and related parallel arrays in the -/// entry. -/// @param idx_type_size generally sizeof(int32_t) or sizeof(int64_t); used for -/// computing how large the arrays to copy are. 
-static void fillTensorArgMetadata( - KernelExecutorEntry& entry, - const PolymorphicValue& tensor_metadata, - size_t idx, - size_t idx_type_size) { - void* data = tensor_metadata->*&TensorMetaData::data; - // g++ has trouble inferring the types of more complicated fields through our - // *& operators. Creating an `auto` alias as a temporary resolves this - // problem. -#define TMD_ARRAY_REF(pv, field) \ - ({ \ - const auto& fld_tmp_ = pv->*&field; \ - const c10::IntArrayRef& fld_aref_ = fld_tmp_; \ - fld_aref_; \ - }) - const c10::IntArrayRef& shape = - TMD_ARRAY_REF(tensor_metadata, TensorMetaData::logical_size); - const c10::IntArrayRef& strides = - TMD_ARRAY_REF(tensor_metadata, TensorMetaData::alloc_stride); -#undef TMD_ARRAY_REF - - // These are the three offsets we need to copy into. - std::array offsets = { - entry.args[idx].data(), // data ptr - entry.args[idx].data() + sizeof(void*), // shape array - // strides array: - entry.args[idx].data() + sizeof(void*) + shape.size() * idx_type_size, - }; - - memcpy(offsets[0], &data, sizeof(void*)); - switch (idx_type_size) { - case sizeof(int64_t): { - // we use i64's for our sizes, so can use a simple copy here - memcpy(offsets[1], shape.data(), shape.size() * sizeof(int64_t)); - memcpy(offsets[2], strides.data(), strides.size() * sizeof(int64_t)); - } break; - case sizeof(int32_t): { - // we need to cast per-element, so need a loop. - // This case happens when the kernel uses 32bit indices. Since we - // (specifically TensorMetaData) store indices in 64bit, we can't - // directly copy our buffer into the args buffer. We thus have to - // manually downcast each element to fit in the smaller buffer. 
- for (size_t i = 0; i < shape.size(); ++i) { - const int32_t shp = static_cast(shape[i]); - memcpy(offsets[1] + i * sizeof(int32_t), &shp, sizeof(int32_t)); - } - // In rare cases we have fewer strides than shapes - for (size_t i = 0; i < strides.size(); ++i) { - const int32_t strd = static_cast(strides[i]); - memcpy(offsets[2] + i * sizeof(int32_t), &strd, sizeof(int32_t)); - } - } break; - default: - NVF_CHECK(0, "Unhandled index type size"); - break; - } -} - -// set the arguments that we'll pass to cuLaunchKernel. This should happen -// when we change the rank of a tensor or the number of arguments to a kernel. -// It does not need to happen when only shapes change---use recomputeArgs for -// that. -void KernelExecutor::computeArgs( - KernelExecutorEntry& entry, - ExpressionEvaluator& expr_eval, - const kir::Kernel* kernel) const { - FUSER_PERF_SCOPE("KernelExecutor::computeArgs"); - - const std::vector& params = kernel->parameters(); - entry.args.resize(params.size()); - entry.arg_ptrs.resize(params.size()); - const PrimDataType idx_type = kernel->indexType(); - for (size_t p = 0; p < params.size(); ++p) { - entry.args[p] = getKernelArgument(expr_eval, params[p], idx_type); - entry.arg_ptrs[p] = entry.args[p].data(); - } -} - namespace { GlobalBufferInfo& linear_buffer_info_getter( KernelExecutorEntry& entry, @@ -906,10 +810,10 @@ GlobalBufferInfo& linear_buffer_info_getter( }; } // namespace -void KernelExecutor::computeArgs2( +void KernelExecutor::computeArgs( KernelExecutorEntry& entry, const KernelArgumentHolder& args) const { - FUSER_PERF_SCOPE("KernelExecutor::computeArgs2"); + FUSER_PERF_SCOPE("KernelExecutor::computeArgs"); if (entry.args.size() != args.size()) { entry.args.resize(args.size()); entry.arg_ptrs.resize(args.size()); @@ -997,42 +901,6 @@ void KernelExecutor::computeArgs2( } } -// Reset the arguments that we'll pass to cuLaunchKernel. This needs to be -// invoked on every shape change. 
-void KernelExecutor::recomputeArgs( - KernelExecutorEntry& entry, - ExpressionEvaluator& expr_eval, - const kir::Kernel* kernel) const { - FUSER_PERF_SCOPE("KernelExecutor::recomputeArgs"); - - const std::vector& params = kernel->parameters(); - const PrimDataType idx_type = kernel->indexType(); - // assert(entry.args.size() == params.size()); - // assert(entry.arg_ptrs.size() == params.size()); - // assert(params.size() >= args.size()); - for (size_t p = 0; p < params.size(); ++p) { - PolymorphicValue pv = expr_eval.evaluate(params[p]); - if (pv.is() && pv.as().is_cuda()) { - // GPU tensors are not passed directly: instead we pass a Tensor struct. The pointer and dimensions are dynamic, but the - // types and ranks are actually static (changing the rank or the types - // would need to be done via a new FusionDefinition). As such, we created - // the Tensor struct during ::computeArgs, and here we just fill - // in the base address, shape, and stride arrays to cover whatever new - // tensors we got this round. - TensorView* mtv = dynamic_cast(params[p]); - const Val* mdexpr = IrBuilder::metadataExpr(mtv); - const PolymorphicValue& tmd = expr_eval.evaluate(mdexpr); - const size_t idx_type_size = - PrimDataType::Int == idx_type ? 
sizeof(int64_t) : sizeof(int32_t); - fillTensorArgMetadata(entry, tmd, p, idx_type_size); - } else { - entry.args[p] = getKernelArgument(expr_eval, params[p], idx_type); - } - entry.arg_ptrs[p] = entry.args[p].data(); - } -} - int64_t KernelExecutor::getAvailableDynamicSmemSize() { if (!available_dynamic_smem_size_.has_value()) { int size = 0; @@ -1219,7 +1087,6 @@ KernelArgumentHolder KernelExecutor::run( // Initialize the executor entry if not initlized if (!executor_entry->init) { - // std::cout << "Initializing executor entry" << std::endl; initializeExecutorEntry( *executor_entry, args, @@ -1227,7 +1094,6 @@ KernelArgumentHolder KernelExecutor::run( compile_params, output_args, compiled_kernel_->kernel()->indexType()); - // std::cout << "Executor entry initialized" << std::endl; } if (!(executor_entry->launch_params.nThreads() <= @@ -1254,7 +1120,7 @@ KernelArgumentHolder KernelExecutor::run( args, has_dynamic_alias_); if (has_dynamic_alias_) { - // TODO: Make sure dynamic alias works. + // TODO: Make sure there's a dynamic alias test. 
for (const auto i : c10::irange(compiled_kernel_->kernel()->outputs().size())) { auto param = compiled_kernel_->kernel()->outputs()[i]; @@ -1335,14 +1201,6 @@ KernelArgumentHolder KernelExecutor::run( } args.push(intermediate_buffer); intermediate_args.push(intermediate_buffer); - // expr_eval.bind( - // compiled_kernel_->kernel() - // ->summary() - // .global_allocations.at(i) - // ->buffer(), - // args - // [compiled_kernel_->kernel()->inputs().size() + outputs.size() + - // i]); if (buf_info.is_profile_buffer) { profile_buffer = intermediate_buffer; } @@ -1358,21 +1216,16 @@ KernelArgumentHolder KernelExecutor::run( if (has_tma_) { // Resolving TMA requires binding all values and evaluating the TMA // arguments - // std::cout << "Resolving TMA" << std::endl; args = resolveTMA(*executor_entry, args); - // std::cout << "TMA resolved" << std::endl; } else if (has_rng_) { // Resolving RNG seed requires evaluating and adding those values, but // doesn't require binding all values as getting RNG seed and offset // doesn't depend on other values - // std::cout << "Resolving RNG seed" << std::endl; args = resolveRNGSeed(compiled_kernel_->kernel(), args); - // std::cout << "RNG seed resolved" << std::endl; } } - // std::cout << "Computing args" << std::endl; - computeArgs2(*executor_entry, args); - // std::cout << "Args computed" << std::endl; + + computeArgs(*executor_entry, args); if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { launch_params_.print(); @@ -1398,8 +1251,6 @@ KernelArgumentHolder KernelExecutor::run( FUSER_PERF_SCOPE("KernelExecutor::runFusion::execute_kernel"); ensureAvailableDynamicSmemSize(executor_entry->launch_params.smem()); - // recomputeArgs(*executor_entry, expr_eval, compiled_kernel_->kernel()); - if (isDebugDumpEnabled(DebugDumpOption::Occupancy) || isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { int blocks_per_sm = -1; diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index bd352dd4fe6..4b8c5e12be5 100644 --- 
a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -229,29 +229,13 @@ class KernelExecutor : public ExecutorAbstract { // Creates the initial set of arguments to a kernel, based on the arguments // to we have now. - void computeArgs( - KernelExecutorEntry&, - ExpressionEvaluator&, - const kir::Kernel*) const; - - void computeArgs2( - KernelExecutorEntry& entry, - const KernelArgumentHolder& args) const; + void computeArgs(KernelExecutorEntry& entry, const KernelArgumentHolder& args) + const; KernelArgumentHolder resolveTMA( KernelExecutorEntry& entry, const KernelArgumentHolder& args) const; - // Updates an existing set of arguments based on the current arguments. It is - // is an error to call this before `computeArgs` has been invoked. - // recomputeArgs will fail if the arity of the function changes, or the rank - // of any tensor changes (as these are compiled-in to the generated kernel - // and therefore would require us to do a larger recompilation). - void recomputeArgs( - KernelExecutorEntry&, - ExpressionEvaluator&, - const kir::Kernel*) const; - //! 
Serialize CompiledKernel using flatbuffers flatbuffers::Offset serialize( flatbuffers::FlatBufferBuilder& builder, diff --git a/csrc/runtime/executor_kernel_arg.cpp b/csrc/runtime/executor_kernel_arg.cpp index 0392ae65b9a..184d7579b84 100644 --- a/csrc/runtime/executor_kernel_arg.cpp +++ b/csrc/runtime/executor_kernel_arg.cpp @@ -120,15 +120,6 @@ void KernelArgumentHolder::pushTensorProxy( push(meta_tensor); } -std::vector KernelArgumentHolder::toC10Array() const { - std::vector ival_array; - ival_array.reserve(arguments_.size()); - for (const auto& arg : arguments_) { - ival_array.push_back(PolymorphicValue_functions::toIValue(arg)); - } - return ival_array; -} - void KernelArgumentHolder::setDeviceIndex(std::optional index) { if (index.has_value()) { device_index_ = index.value(); @@ -359,26 +350,6 @@ std::vector polymorphicValueToBytes( } } -std::vector getKernelArgument( - ExpressionEvaluator& ee, - Val* parameter, - PrimDataType index_type) { - FUSER_PERF_SCOPE("getKernelArgument"); - NVF_ERROR(parameter != nullptr); - PolymorphicValue pv = ee.evaluate(parameter); - if (auto tv = dynamic_cast(parameter)) { - if (tv->isCpuScalar()) { - return polymorphicValueToBytes(pv, tv->dtype(), index_type); - } else { - const Val* metadata_val = IrBuilder::metadataExpr(tv); - const PolymorphicValue& metadata = ee.evaluate(metadata_val); - return polymorphicValueToBytes( - metadata, metadata_val->dtype(), index_type); - } - } - return polymorphicValueToBytes(pv, parameter->dtype(), index_type); -} - int64_t computeBytes(const KernelArgumentHolder& args) { int64_t num_bytes = 0; // Figure how many bytes are inputs, outputs, and temporary buffers @@ -391,15 +362,4 @@ int64_t computeBytes(const KernelArgumentHolder& args) { return num_bytes; } -int64_t computeBytes(const std::vector& outputs) { - int64_t num_bytes = 0; - for (auto i : c10::irange(outputs.size())) { - const auto& output = outputs.at(i); - // NOTE: this assumes that all output elements correspond to a single - 
// store - num_bytes += static_cast(output.storage().nbytes()); - } - return num_bytes; -} - } // namespace nvfuser diff --git a/csrc/runtime/executor_kernel_arg.h b/csrc/runtime/executor_kernel_arg.h index a8a7fb110de..d96705d01e7 100644 --- a/csrc/runtime/executor_kernel_arg.h +++ b/csrc/runtime/executor_kernel_arg.h @@ -107,8 +107,6 @@ class NVF_API KernelArgumentHolder { void erase(const PolymorphicValue& arg_to_delete); - std::vector toC10Array() const; - PolymorphicValue& back() { return arguments_.back(); } @@ -227,18 +225,6 @@ std::vector polymorphicValueToBytes( const DataType& dtype, PrimDataType index_type); -std::vector getKernelArgument( - ExpressionEvaluator& ee, - Val* parameter, - PrimDataType index_type); - -std::vector getKernelArgument( - at::Tensor tensor, - const GlobalBufferInfo& output_info, - PrimDataType index_type); - int64_t computeBytes(const KernelArgumentHolder& args); -int64_t computeBytes(const std::vector& outputs); - } // namespace nvfuser From 0db5b89bc6f069d7ba9f699bb539ea2d274c756e Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 3 Mar 2025 08:54:13 -0800 Subject: [PATCH 30/35] Cleanup argument to byte conversion. 
--- csrc/runtime/compiled_kernel.cpp | 29 ++++---- csrc/runtime/executor.cpp | 76 ++++++--------------- csrc/runtime/executor_kernel_arg.cpp | 99 ++++++++++++++++------------ csrc/runtime/executor_kernel_arg.h | 10 +++ 4 files changed, 98 insertions(+), 116 deletions(-) diff --git a/csrc/runtime/compiled_kernel.cpp b/csrc/runtime/compiled_kernel.cpp index 8b4f37ed3ca..d4715339c0b 100644 --- a/csrc/runtime/compiled_kernel.cpp +++ b/csrc/runtime/compiled_kernel.cpp @@ -1416,23 +1416,18 @@ float RtcKernel::run( std::vector pointers; for (const auto& input : args) { - const auto& input_tensor = input.as(); - auto dtype = std::get( - aten_to_data_type(input_tensor.scalar_type()).type); - DataType metadata_type = globalTensorMetaData(dtype, input_tensor.dim()); - - std::shared_ptr struct_ = std::make_shared(); - TensorMetaData* metadata = (TensorMetaData*)struct_.get(); - metadata->dtype = dtype; - metadata->data = input_tensor.data_ptr(); - metadata->logical_size = input_tensor.sizes(); - metadata->logical_stride = input_tensor.strides(); - metadata->alloc_size = input_tensor.sizes(); - metadata->alloc_stride = input_tensor.strides(); - - data.emplace_back(polymorphicValueToBytes( - PolymorphicValue(std::move(struct_)), metadata_type, index_type)); - pointers.emplace_back(data.back().data()); + NVF_ERROR( + input.is() && input.as().is_cuda(), + "Only CUDA tensors are supported for direct nvRTC launches at this time."); + if (input.is() && input.as().is_cuda()) { + auto input_tensor = input.as(); + data.emplace_back(tensorToBytes( + input_tensor, + input_tensor.sizes().vec(), + input_tensor.strides().vec(), + index_type)); + pointers.emplace_back(data.back().data()); + } } NVFUSER_CUDA_SAFE_CALL(cuLaunchKernel( diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 8cf1adf4625..b2d34d3a1b2 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -173,6 +173,7 @@ void KernelExecutor::compile( CompileParams compile_params, SchedulerType 
scheduler_type) { FUSER_PERF_SCOPE("KernelExecutor::compile"); + NVF_ERROR( supported(fusion), "KernelExecutor does not support the Fusion provided."); @@ -661,7 +662,7 @@ void KernelExecutor::initializeExecutorEntry( ExpressionEvaluator expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); - // expr_eval.precomputedValues() = evaluatorPrecomputedValues().get(); + auto launch_params = computeLaunchParams( launch_constraints, expr_eval, warp_size_, index_type); @@ -835,57 +836,16 @@ void KernelExecutor::computeArgs( const PrimDataType idx_type = compiled_kernel_->kernel()->indexType(); int64_t buffer_info_idx = 0; for (size_t arg_idx = 0; arg_idx < args.size(); ++arg_idx) { - std::vector bytes; if (args[arg_idx].is() && args[arg_idx].as().is_cuda()) { - auto tensor = args[arg_idx].as(); - - auto data = tensor.data_ptr(); const auto& buffer_info = - linear_buffer_info_getter(entry, buffer_info_idx); - const auto& logical_size = buffer_info.shape_info.logical_sizes.size() == - buffer_info.shape_info.unsharded_logical_sizes.size() - ? buffer_info.shape_info.unsharded_logical_sizes - : buffer_info.shape_info.logical_sizes; - const auto& alloc_stride = buffer_info.shape_info.allocation_strides; - buffer_info_idx++; - // special handle for TensorMetaData so that CPU overhead is minimal. 
- if (idx_type == PrimDataType::Int) { - bytes.reserve( - sizeof(void*) + sizeof(int64_t) * logical_size.size() + - sizeof(int64_t) * alloc_stride.size()); - bytes.insert(bytes.end(), (std::byte*)&data, (std::byte*)(&data + 1)); - bytes.insert( - bytes.end(), - (std::byte*)logical_size.data(), - (std::byte*)logical_size.data() + - sizeof(int64_t) * logical_size.size()); - bytes.insert( - bytes.end(), - (std::byte*)alloc_stride.data(), - (std::byte*)alloc_stride.data() + - sizeof(int64_t) * alloc_stride.size()); - } else { - bytes.reserve( - sizeof(void*) + sizeof(int32_t) * logical_size.size() + - sizeof(int32_t) * alloc_stride.size()); - bytes.insert(bytes.end(), (std::byte*)&data, (std::byte*)(&data + 1)); - std::vector logical_size32( - logical_size.begin(), logical_size.end()); - bytes.insert( - bytes.end(), - (std::byte*)logical_size32.data(), - (std::byte*)logical_size32.data() + - sizeof(int32_t) * logical_size32.size()); - std::vector alloc_stride32( - alloc_stride.begin(), alloc_stride.end()); - bytes.insert( - bytes.end(), - (std::byte*)alloc_stride32.data(), - (std::byte*)alloc_stride32.data() + - sizeof(int32_t) * alloc_stride32.size()); - } - entry.args[arg_idx] = bytes; + linear_buffer_info_getter(entry, buffer_info_idx++); + entry.args[arg_idx] = tensorToBytes( + args[arg_idx], + buffer_info.shape_info.logical_sizes, + buffer_info.shape_info.allocation_strides, + idx_type, + buffer_info.shape_info.unsharded_logical_sizes); entry.arg_ptrs[arg_idx] = entry.args[arg_idx].data(); } else { if (args[arg_idx].is()) { @@ -996,6 +956,8 @@ KernelArgumentHolder resolveRNGSeed( } } // namespace +// TODO: Reduce bindings to only those necessaary to resolve missing params. +// TODO: Check if this could be reused to also resolve dynamic aliases. 
KernelArgumentHolder KernelExecutor::resolveTMA( KernelExecutorEntry& entry, const KernelArgumentHolder& args) const { @@ -1018,7 +980,9 @@ KernelArgumentHolder KernelExecutor::resolveTMA( } for (const auto& intermediate_entry : entry.intermediates) { - expr_eval.bind(intermediate_entry.tv, args[arg_idx++]); + if (args[arg_idx].hasValue()) { + expr_eval.bind(intermediate_entry.tv, args[arg_idx++]); + } } KernelArgumentHolder resolved_args; @@ -1047,11 +1011,6 @@ KernelArgumentHolder KernelExecutor::run( sprof.startKernel(); } - ExpressionEvaluator expr_eval; - if (has_dynamic_alias_ || has_tma_) { - expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); - } - NVF_ERROR(isCompiled()); NVF_ERROR( output_args.empty() || @@ -1120,7 +1079,12 @@ KernelArgumentHolder KernelExecutor::run( args, has_dynamic_alias_); if (has_dynamic_alias_) { - // TODO: Make sure there's a dynamic alias test. + ExpressionEvaluator expr_eval; + if (has_dynamic_alias_ || has_tma_) { + expr_eval = + executor_utils::bindInputs(args, compiled_kernel_->kernel()); + } + for (const auto i : c10::irange(compiled_kernel_->kernel()->outputs().size())) { auto param = compiled_kernel_->kernel()->outputs()[i]; diff --git a/csrc/runtime/executor_kernel_arg.cpp b/csrc/runtime/executor_kernel_arg.cpp index 184d7579b84..1531a253504 100644 --- a/csrc/runtime/executor_kernel_arg.cpp +++ b/csrc/runtime/executor_kernel_arg.cpp @@ -287,49 +287,8 @@ std::vector polymorphicValueToBytes( // FUSER_PERF_SCOPE("polymorphicValueToBytes(StructHandle)"); std::vector buffer; if (argument.as().is()) { - auto& data = argument->*&TensorMetaData::data; - auto& logical_size = argument->*&TensorMetaData::logical_size; - auto& alloc_stride = argument->*&TensorMetaData::alloc_stride; - - // special handle for TensorMetaData so that CPU overhead is minimal. 
- if (index_type == PrimDataType::Int) { - buffer.reserve( - sizeof(void*) + sizeof(int64_t) * logical_size.size() + - sizeof(int64_t) * alloc_stride.size()); - buffer.insert( - buffer.end(), (std::byte*)&data, (std::byte*)&data + sizeof(void*)); - buffer.insert( - buffer.end(), - (std::byte*)logical_size.data(), - (std::byte*)logical_size.data() + - sizeof(int64_t) * logical_size.size()); - buffer.insert( - buffer.end(), - (std::byte*)alloc_stride.data(), - (std::byte*)alloc_stride.data() + - sizeof(int64_t) * alloc_stride.size()); - } else { - buffer.reserve( - sizeof(void*) + sizeof(int32_t) * logical_size.size() + - sizeof(int32_t) * alloc_stride.size()); - buffer.insert( - buffer.end(), (std::byte*)&data, (std::byte*)&data + sizeof(void*)); - std::vector logical_size32( - logical_size.begin(), logical_size.end()); - buffer.insert( - buffer.end(), - (std::byte*)logical_size32.data(), - (std::byte*)logical_size32.data() + - sizeof(int32_t) * logical_size32.size()); - std::vector alloc_stride32( - alloc_stride.begin(), alloc_stride.end()); - buffer.insert( - buffer.end(), - (std::byte*)alloc_stride32.data(), - (std::byte*)alloc_stride32.data() + - sizeof(int32_t) * alloc_stride32.size()); - } - return buffer; + NVF_THROW( + "Don't send tensor metadata to this function directly, use tensorToBytes."); } else { const auto& dtype_ = std::get(dtype.type); for (const auto& field : dtype_.fields) { @@ -350,6 +309,60 @@ std::vector polymorphicValueToBytes( } } +std::vector tensorToBytes( + const PolymorphicValue& argument, + const std::vector& logical_sizes, + const std::vector& alloc_strides, + PrimDataType idx_type, + const std::vector& unsharded_logical_sizes) { + std::vector bytes; + NVF_ERROR( + argument.is() && argument.as().is_cuda(), + "Argument is not a CUDA tensor."); + auto tensor = argument.as(); + auto data = tensor.data_ptr(); + + const auto& size_to_use = + logical_sizes.size() == unsharded_logical_sizes.size() + ? 
unsharded_logical_sizes + : logical_sizes; + // special handle for TensorMetaData so that CPU overhead is minimal. + if (idx_type == PrimDataType::Int) { + bytes.reserve( + sizeof(void*) + sizeof(int64_t) * size_to_use.size() + + sizeof(int64_t) * alloc_strides.size()); + bytes.insert(bytes.end(), (std::byte*)&data, (std::byte*)(&data + 1)); + bytes.insert( + bytes.end(), + (std::byte*)size_to_use.data(), + (std::byte*)size_to_use.data() + sizeof(int64_t) * size_to_use.size()); + bytes.insert( + bytes.end(), + (std::byte*)alloc_strides.data(), + (std::byte*)alloc_strides.data() + + sizeof(int64_t) * alloc_strides.size()); + } else { + bytes.reserve( + sizeof(void*) + sizeof(int32_t) * size_to_use.size() + + sizeof(int32_t) * alloc_strides.size()); + bytes.insert(bytes.end(), (std::byte*)&data, (std::byte*)(&data + 1)); + std::vector logical_size32(size_to_use.begin(), size_to_use.end()); + bytes.insert( + bytes.end(), + (std::byte*)logical_size32.data(), + (std::byte*)logical_size32.data() + + sizeof(int32_t) * logical_size32.size()); + std::vector alloc_stride32( + alloc_strides.begin(), alloc_strides.end()); + bytes.insert( + bytes.end(), + (std::byte*)alloc_stride32.data(), + (std::byte*)alloc_stride32.data() + + sizeof(int32_t) * alloc_stride32.size()); + } + return bytes; +} + int64_t computeBytes(const KernelArgumentHolder& args) { int64_t num_bytes = 0; // Figure how many bytes are inputs, outputs, and temporary buffers diff --git a/csrc/runtime/executor_kernel_arg.h b/csrc/runtime/executor_kernel_arg.h index d96705d01e7..77198ae9002 100644 --- a/csrc/runtime/executor_kernel_arg.h +++ b/csrc/runtime/executor_kernel_arg.h @@ -220,11 +220,21 @@ class NVF_API KernelArgumentHolder { std::optional cache_id_ = std::nullopt; }; +// Used to convert a polymorphic value to a byte vector. Do not use for CUDA +// Tensors, use tensorToBytes instead. 
std::vector polymorphicValueToBytes( const PolymorphicValue& argument, const DataType& dtype, PrimDataType index_type); +// Used to convert a CUDA tensor to a byte vector. +std::vector tensorToBytes( + const PolymorphicValue& argument, + const std::vector& logical_sizes, + const std::vector& allocation_strides, + PrimDataType idx_type, + const std::vector& unsharded_logical_sizes = {}); + int64_t computeBytes(const KernelArgumentHolder& args); } // namespace nvfuser From 2241594a8f4171cba3f65db59f4ad7fcb3e89d11 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 3 Mar 2025 16:33:20 -0800 Subject: [PATCH 31/35] PR Comments. --- csrc/host_ir/executor.cpp | 1 - csrc/runtime/allocations.cpp | 4 ++-- csrc/runtime/compiled_kernel.cpp | 16 +++++++--------- csrc/runtime/executor.cpp | 16 ++++++++++------ 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 32c2f5a956e..b2dc87b2789 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -107,7 +107,6 @@ KernelArgumentHolder HostIrExecutor::run( if (output_args.empty()) { std::vector output_infos = getBufferInfos( expr_eval, PrimDataType::Int, host_ir_container_->outputs()); - output_args.resize(host_ir_container_->outputs().size()); auto output_alias_to_input = executor_utils::getOutputAliasToInputMap(host_ir_container_.get()); output_args = allocateOutputs( diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 83ab60d900d..76a87a60e81 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -309,7 +309,7 @@ std::vector getBufferInfos( ExpressionEvaluator& expr_eval, DataType index_dtype, const std::vector& fusion_outputs) { - FUSER_PERF_SCOPE("fusion_executor::allocations::getOutbufferInfos"); + FUSER_PERF_SCOPE("fusion_executor::allocations::getBufferInfos"); std::vector output_buffer_infos; output_buffer_infos.reserve(fusion_outputs.size()); for (const auto out : 
fusion_outputs) { @@ -696,7 +696,7 @@ TensorShapeInfo inferTensorShapes( auto alias_info = tv->fusion()->getOutputAlias(tv); if (alias_info.type == AllocationType::Evaluate) { auto val = expr_eval.evaluate(tv); - NVF_ERROR(val.hasValue() && val.is(), "Output is not a Tensor"); + NVF_ERROR(val.is(), "Output is not a Tensor"); auto tensor = val.as(); if (!tv->hasAllocation()) { diff --git a/csrc/runtime/compiled_kernel.cpp b/csrc/runtime/compiled_kernel.cpp index d4715339c0b..e2aaf18069b 100644 --- a/csrc/runtime/compiled_kernel.cpp +++ b/csrc/runtime/compiled_kernel.cpp @@ -1419,15 +1419,13 @@ float RtcKernel::run( NVF_ERROR( input.is() && input.as().is_cuda(), "Only CUDA tensors are supported for direct nvRTC launches at this time."); - if (input.is() && input.as().is_cuda()) { - auto input_tensor = input.as(); - data.emplace_back(tensorToBytes( - input_tensor, - input_tensor.sizes().vec(), - input_tensor.strides().vec(), - index_type)); - pointers.emplace_back(data.back().data()); - } + auto input_tensor = input.as(); + data.emplace_back(tensorToBytes( + input_tensor, + input_tensor.sizes().vec(), + input_tensor.strides().vec(), + index_type)); + pointers.emplace_back(data.back().data()); } NVFUSER_CUDA_SAFE_CALL(cuLaunchKernel( diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index b2d34d3a1b2..0b35baee5bd 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -299,7 +299,7 @@ void KernelExecutor::compile( // slice, and that slice has a symbolic integer it depends on, then this // function returns true. // - // This could happen for other examples and this function will return true if + // This could happen for other examples and has_dynamic_alias_ will be true if // to evaluate the output that has an alias, other values besides the aliased // input need to be bound to the expression evaluator to evaluate the output. 
for (auto output : fusion->outputs()) { @@ -315,11 +315,12 @@ void KernelExecutor::compile( if (input->isA() && input->sameAs(aliased_to)) { continue; } + + if (input->isConst()) { + continue; + } + has_dynamic_alias_ = true; } - if (out_tv->isConst()) { - continue; - } - has_dynamic_alias_ = true; } } } @@ -956,7 +957,7 @@ KernelArgumentHolder resolveRNGSeed( } } // namespace -// TODO: Reduce bindings to only those necessaary to resolve missing params. +// TODO: Reduce bindings to only those necessary to resolve missing params. // TODO: Check if this could be reused to also resolve dynamic aliases. KernelArgumentHolder KernelExecutor::resolveTMA( KernelExecutorEntry& entry, @@ -1180,6 +1181,9 @@ KernelArgumentHolder KernelExecutor::run( if (has_tma_) { // Resolving TMA requires binding all values and evaluating the TMA // arguments + // + // Resolving TMA also resolves RNG, so if TMA exists the resolveRNGSeed + // function shouldn't also be called. args = resolveTMA(*executor_entry, args); } else if (has_rng_) { // Resolving RNG seed requires evaluating and adding those values, but From 9cbff1646206a82c91210352b9e37dd2cdd4e551 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 3 Mar 2025 16:47:44 -0800 Subject: [PATCH 32/35] Clang. 
--- csrc/runtime/executor_kernel_arg.cpp | 2 +- csrc/runtime/executor_utils.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/runtime/executor_kernel_arg.cpp b/csrc/runtime/executor_kernel_arg.cpp index 1531a253504..ccc2f7657f7 100644 --- a/csrc/runtime/executor_kernel_arg.cpp +++ b/csrc/runtime/executor_kernel_arg.cpp @@ -319,7 +319,7 @@ std::vector tensorToBytes( NVF_ERROR( argument.is() && argument.as().is_cuda(), "Argument is not a CUDA tensor."); - auto tensor = argument.as(); + const auto& tensor = argument.as(); auto data = tensor.data_ptr(); const auto& size_to_use = diff --git a/csrc/runtime/executor_utils.cpp b/csrc/runtime/executor_utils.cpp index 8a54d0c30b9..2900a1bb022 100644 --- a/csrc/runtime/executor_utils.cpp +++ b/csrc/runtime/executor_utils.cpp @@ -616,7 +616,7 @@ std::vector getOutputAliasToInputMap(const Fusion* fusion) { std::find( fusion->inputs().begin(), fusion->inputs().end(), aliased_to)); if (aliased_to_idx < (int64_t)fusion->inputs().size()) { - output_to_input_map[(int64_t)output_idx] = (int64_t)aliased_to_idx; + output_to_input_map[output_idx] = (int)aliased_to_idx; } else { auto aliased_out = std::find( fusion->outputs().begin(), fusion->outputs().end(), aliased_to); From 28e7cf90b01b64d94d5a2c249cbfffcf22091756 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 3 Mar 2025 17:14:01 -0800 Subject: [PATCH 33/35] Make allocation sizes/strides optional in tensor shape info. --- csrc/runtime/allocations.cpp | 16 +++++++--------- csrc/runtime/allocations.h | 5 +++++ csrc/runtime/executor.cpp | 28 ++++++++++++++-------------- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 76a87a60e81..28fcf60d347 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -704,9 +704,8 @@ TensorShapeInfo inferTensorShapes( tensor.sizes().vec(), tensor.strides().vec(), isSharded(tv) ? 
unshardedSizes(tv, tensor.sizes().vec()) - : tensor.sizes().vec(), - tensor.sizes().vec(), - tensor.strides().vec()}; + : std::vector(), + }; } auto allocation_size_stride = inferAndValidateAllocationSizesAndStrides(tensor, tv, expr_eval); @@ -714,7 +713,7 @@ TensorShapeInfo inferTensorShapes( tensor.sizes().vec(), tensor.strides().vec(), isSharded(tv) ? unshardedSizes(tv, tensor.sizes().vec()) - : tensor.sizes().vec(), + : std::vector(), allocation_size_stride.first, allocation_size_stride.second}; } @@ -726,9 +725,8 @@ TensorShapeInfo inferTensorShapes( allocation_size_stride.first, allocation_size_stride.second, isSharded(tv) ? unshardedSizes(tv, allocation_size_stride.first) - : allocation_size_stride.first, - allocation_size_stride.first, - allocation_size_stride.second}; + : std::vector(), + }; } auto options = @@ -740,11 +738,11 @@ TensorShapeInfo inferTensorShapes( // `transformFromAllocationToLogical` logical_meta_tensor = transformFromAllocationToLogical(logical_meta_tensor, tv, expr_eval); - return { + return TensorShapeInfo{ logical_meta_tensor.sizes().vec(), logical_meta_tensor.strides().vec(), isSharded(tv) ? unshardedSizes(tv, logical_meta_tensor.sizes().vec()) - : logical_meta_tensor.sizes().vec(), + : std::vector(), allocation_size_stride.first, allocation_size_stride.second}; } diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index c1907fc1955..b05c6cffc27 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -17,6 +17,11 @@ namespace nvfuser { struct KernelExecutorEntry; +// If not sharded unsharded_logical_sizes is empty. +// If no allocation domain is found, allocation_sizes and allocation_strides +// are empty. +// For intermediate tensors, logical_sizes and logical_strides are used only, +// the rest are empty. 
struct TensorShapeInfo { std::vector logical_sizes; std::vector logical_strides; diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 0b35baee5bd..bed20321cd9 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -523,8 +523,6 @@ std::vector KernelExecutor::getIntermediateBufferInfo( auto [sizes, strides] = has_expanded_domains ? inferShapeOfOutput(tv, expr_eval) : inferShapeOfIntermediate(tv, alloc, expr_eval); - info.shape_info.allocation_sizes = sizes; - info.shape_info.allocation_strides = strides; info.shape_info.logical_sizes = sizes; info.shape_info.logical_strides = strides; auto dtype = tv->dtype() == DataType::Index ? index_type : tv->dtype(); @@ -713,9 +711,6 @@ void KernelExecutor::initializeExecutorEntry( std::tie(alloc_sizes, alloc_strides) = inferAndValidateAllocationSizesAndStrides( at_tensor, input_tv, expr_eval); - } else { - alloc_sizes = at_tensor.sizes().vec(); - alloc_strides = at_tensor.strides().vec(); } TensorShapeInfo shape_info; @@ -764,8 +759,6 @@ void KernelExecutor::initializeExecutorEntry( "Accepting allocated outputs is not currently supported with allocation domain. ", "Allocation domain found for tv: ", info.tv->toString()); - info.shape_info.allocation_sizes = output_tensor.sizes().vec(); - info.shape_info.allocation_strides = output_tensor.strides().vec(); info.shape_info.logical_sizes = output_tensor.sizes().vec(); info.shape_info.logical_strides = output_tensor.strides().vec(); output_info.emplace_back(info); @@ -844,7 +837,9 @@ void KernelExecutor::computeArgs( entry.args[arg_idx] = tensorToBytes( args[arg_idx], buffer_info.shape_info.logical_sizes, - buffer_info.shape_info.allocation_strides, + buffer_info.shape_info.allocation_strides.empty() + ? 
buffer_info.shape_info.logical_strides + : buffer_info.shape_info.allocation_strides, idx_type, buffer_info.shape_info.unsharded_logical_sizes); entry.arg_ptrs[arg_idx] = entry.args[arg_idx].data(); @@ -1115,22 +1110,27 @@ KernelArgumentHolder KernelExecutor::run( at::Tensor profile_buffer; { FUSER_PERF_SCOPE("KernelExecutor::runFusion::intermediates"); + // Intermediates just use logical sizes and strides even though they're + // really allocation sizes and strides. + // + // This is simply because the convention used is that allocation + // sizes/strides are optional, logical are not. for (const auto intermediate_i : c10::irange(executor_entry->intermediates.size())) { const auto& buf_info = executor_entry->intermediates.at(intermediate_i); bool has_expansion = false; std::vector unexpanded_sizes; - unexpanded_sizes.reserve(buf_info.shape_info.allocation_sizes.size()); + unexpanded_sizes.reserve(buf_info.shape_info.logical_sizes.size()); NVF_ERROR( - buf_info.shape_info.allocation_sizes.size() == - buf_info.shape_info.allocation_strides.size()) + buf_info.shape_info.logical_sizes.size() == + buf_info.shape_info.logical_strides.size()) for (const auto j : - c10::irange(buf_info.shape_info.allocation_sizes.size())) { - if (buf_info.shape_info.allocation_strides[j] == 0) { + c10::irange(buf_info.shape_info.logical_sizes.size())) { + if (buf_info.shape_info.logical_strides[j] == 0) { has_expansion = true; unexpanded_sizes.push_back(1L); } else { - unexpanded_sizes.push_back(buf_info.shape_info.allocation_sizes[j]); + unexpanded_sizes.push_back(buf_info.shape_info.logical_sizes[j]); } } at::Tensor intermediate_buffer; From a6252f70b6cdbe6e1151ba4ed673903f5e16269e Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 3 Mar 2025 17:22:05 -0800 Subject: [PATCH 34/35] Remove intermediate logic in allocations.cpp output logic can be used instead. 
--- csrc/runtime/allocations.cpp | 24 ------------------------ csrc/runtime/allocations.h | 7 ------- csrc/runtime/executor.cpp | 19 ++++++------------- 3 files changed, 6 insertions(+), 44 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 28fcf60d347..d0dc3becbb9 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -149,12 +149,6 @@ std::pair, std::vector> inferShape( const ExpressionEvaluator& expr_eval) { FUSER_PERF_SCOPE("fusion_executor::allocations::inferShape"); - // Allocate should be provided for intermediates. We just need to - // grab a chunk of memory of the size dicatated by - // Allocate::shape(). Fusion outputs do not come with Allocate and - // need to be allocated while taking expanded broadcasts into - // account. - std::vector concrete_sizes(symbolic_sizes.size(), 0); for (const auto i : c10::irange(symbolic_sizes.size())) { @@ -179,24 +173,6 @@ std::pair, std::vector> inferShape( } } // namespace -std::pair, std::vector> inferShapeOfIntermediate( - const TensorView* tv, - const kir::Allocate* alloc, - ExpressionEvaluator& expr_eval) { - FUSER_PERF_SCOPE("fusion_executor::allocations::inferShapeOfIntermediate"); - // The allocation domain represents the logical allocation domain, - // bu its actual allocation size may be different, e.g., for - // supporting halo accesses. The actual size is currently computed - // when creating the Allocate expr. - NVF_ERROR(alloc != nullptr); - const auto& symbolic_sizes = alloc->shape(); - // For intermediate tensors, we just need to allocate a memory chunk - // of the specified size. Broadcast expansion does not need to be considered. 
- const auto expand_flags = std::vector(symbolic_sizes.size(), false); - - return inferShape(tv, symbolic_sizes, expand_flags, expr_eval); -} - static bool fill_allocation_with_nan_ = false; bool shouldFillAllocationWithNan() { diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index b05c6cffc27..40dda0eb842 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -57,13 +57,6 @@ int64_t computeSharedMemory( DataType index_type, int64_t smem_offset = 0); -// Infer the shape of an intemediate tensor using kir::Allocate. This -// is not ideal but still necessary when tensors are expanded with halo -std::pair, std::vector> inferShapeOfIntermediate( - const TensorView* tv, - const kir::Allocate* alloc, - ExpressionEvaluator& expr_eval); - bool shouldFillAllocationWithNan(); NVF_API void setFillAllocationWithNan(bool value); diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index bed20321cd9..2b8e43a5db2 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -510,19 +510,12 @@ std::vector KernelExecutor::getIntermediateBufferInfo( info.zero_init = alloc->zeroInit(); info.resets_to_zero = alloc->resetsToZero(); // TODO: Allocation size needs to consider both expanded domains - // as well as halo. Currently, allocation of tensors with halo is - // only supported by inferShapeOfIntermediate, whereas expanded - // domains are only supported by inferShapeOfOutput. Until the - // halo support is revisited, use the former for all tensors - // unless expanded and the latter otherwise. This assumes there's - // no expanded domains with halo, which is fine for now. - const auto has_expanded_domains = std::any_of( - tv->getMaybeAllocationDomain().begin(), - tv->getMaybeAllocationDomain().end(), - [](IterDomain* id) { return id->hasExpandedExtent(); }); - auto [sizes, strides] = has_expanded_domains - ? inferShapeOfOutput(tv, expr_eval) - : inferShapeOfIntermediate(tv, alloc, expr_eval); + // as well as halo. 
Currently, halo support has been removed so we only need
+  // to worry about the expand case which is handled in inferShapeOfOutput.
+  // There used to also be a inferShapeOfIntermediate function before this
+  // commit, but that was safely removed with halo support. This will need to
+  // be revisited when halo support is added again.
+  auto [sizes, strides] = inferShapeOfOutput(tv, expr_eval);
   info.shape_info.logical_sizes = sizes;
   info.shape_info.logical_strides = strides;
   auto dtype = tv->dtype() == DataType::Index ? index_type : tv->dtype();

From 9e5a9ca45b2cf064cdd7a94f05d2297e85c88977 Mon Sep 17 00:00:00 2001
From: Christian Sarofeen 
Date: Tue, 4 Mar 2025 12:45:25 -0800
Subject: [PATCH 35/35] Fix for ExpandedBroadcastGlobalIntermediateTest

---
 csrc/runtime/executor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp
index 2b8e43a5db2..801aa8a92c5 100644
--- a/csrc/runtime/executor.cpp
+++ b/csrc/runtime/executor.cpp
@@ -1155,7 +1155,7 @@ KernelArgumentHolder KernelExecutor::run(
       }
       if (has_expansion) {
         intermediate_buffer = at::native::expand(
-            intermediate_buffer, buf_info.shape_info.allocation_sizes);
+            intermediate_buffer, buf_info.shape_info.logical_sizes);
       }
       args.push(intermediate_buffer);
       intermediate_args.push(intermediate_buffer);