diff --git a/.gitignore b/.gitignore index 81bb26635d5..89d7c587c4b 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,5 @@ foo.bin # Mac OS internal file .DS_Store + +test_log* diff --git a/benchmarks/cpp/matmul.cpp b/benchmarks/cpp/matmul.cpp index 0f5159b9cad..5ef718b4a4f 100644 --- a/benchmarks/cpp/matmul.cpp +++ b/benchmarks/cpp/matmul.cpp @@ -175,7 +175,7 @@ static void SingleMatmulBase( // Compile kernel auto launch_constraints = LaunchParams(); KernelExecutor ke; - ke.compile(fusion, args.toC10Array(), launch_constraints, cparams); + ke.compile(fusion, args, launch_constraints, cparams); NVF_CHECK( getBankConflictInfo(ke.compiledKernel()->kernel(), launch_constraints) .empty(), @@ -352,7 +352,7 @@ static void SingleMatmulPartitionedK( // Compile kernel KernelExecutor ke; auto lparams = LaunchParams(); - ke.compile(fusion, args.toC10Array(), lparams, cparams); + ke.compile(fusion, args, lparams, cparams); NVF_CHECK( getBankConflictInfo(ke.compiledKernel()->kernel(), lparams).empty(), "Shared memory bank conflict not removed."); @@ -461,10 +461,7 @@ static void NvFuserScheduler_MatmulSplitKReduction( // Compile kernel KernelExecutor ke; ke.compile( - fusion, - args.toC10Array(), - heuristic_params->lparams, - heuristic_params->cparams); + fusion, args, heuristic_params->lparams, heuristic_params->cparams); NVF_CHECK( getBankConflictInfo( diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index b726918d375..b2dc87b2789 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -86,21 +86,6 @@ bool HostIrExecutor::isCompiled() const { return (bool)host_ir_container_; } -namespace { -// Host IR specific function, returns the at:Tensor (ordered list) associated -// with the provdied Fusion output tv -at::Tensor findBufferForFusionOutput( - const KernelArgumentHolder& output_args, - const Val* fusion_out, - const Fusion* fusion) { - auto i = - std::find(fusion->outputs().begin(), fusion->outputs().end(), fusion_out); - NVF_ERROR(i 
!= fusion->outputs().end()); - auto index = std::distance(fusion->outputs().begin(), i); - return output_args[index].as(); -} -} // namespace - KernelArgumentHolder HostIrExecutor::run( KernelArgumentHolder& args, KernelArgumentHolder output_args) { @@ -120,13 +105,17 @@ KernelArgumentHolder HostIrExecutor::run( auto expr_eval = executor_utils::bindInputs(args, host_ir_container_.get()); if (output_args.empty()) { - std::vector output_info = getBufferInfos( + std::vector output_infos = getBufferInfos( expr_eval, PrimDataType::Int, host_ir_container_->outputs()); + auto output_alias_to_input = + executor_utils::getOutputAliasToInputMap(host_ir_container_.get()); output_args = allocateOutputs( host_ir_container_.get(), - output_info, + output_infos, + output_alias_to_input, c10::Device(c10::DeviceType::CUDA, args.getDeviceIndex()), - expr_eval); + args, + true); } // TODO: If outputs are provided validate they're the correct size @@ -136,8 +125,18 @@ KernelArgumentHolder HostIrExecutor::run( c10d::Backend* backend = communicator_->getBackendForTeam(communication->team(), std::nullopt); auto in_tensor = expr_eval.evaluate(communication->in()).as(); - at::Tensor out_tensor = findBufferForFusionOutput( - output_args, communication->out(), host_ir_container_.get()); + auto out_idx = std::distance( + host_ir_container_->outputs().begin(), + std::find( + host_ir_container_->outputs().begin(), + host_ir_container_->outputs().end(), + communication->out())); + + NVF_ERROR( + out_idx < (int64_t)host_ir_container_->outputs().size(), + "Output tensor not found in fusion outputs"); + auto out_tensor = output_args[out_idx].as(); + c10::intrusive_ptr work = postSingleCommunication( communication, communicator_->deviceId(), @@ -148,6 +147,19 @@ KernelArgumentHolder HostIrExecutor::run( work->wait(); } } + + // Evaluate outputs that are marked as Evaluate + for (auto out_idx : c10::irange(host_ir_container_->outputs().size())) { + auto out = host_ir_container_->outputs()[out_idx]; + 
auto alias_info = host_ir_container_->getOutputAlias(out); + if (alias_info.type == AllocationType::Evaluate) { + NVF_ERROR( + !output_args[out_idx].hasValue(), + "Output tensor already has a value"); + output_args[out_idx] = expr_eval.evaluate(out); + } + } + if (isProfilerEnabled()) { FusionProfiler::segment(group_id_).setDevice(args.getDeviceIndex()); FusionProfiler::segment(group_id_).stopKernel(); @@ -572,13 +584,21 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { "Allocation must be on a TensorView but got ", allocate->buffer()); TensorView* tv = allocate->buffer()->as(); + if (expr_evaluator_.isKnown(tv)) { + return; + } GlobalBufferInfo info = getBufferInfos(expr_evaluator_, PrimDataType::Int, {tv}).at(0); - AliasInfo alias_info = { - .type = AllocationType::New, .aliased_io = nullptr, .hide_output = false}; c10::Device device = communicator_ ? communicator_->device() : at::Device("cuda:0"); - at::Tensor tensor = allocateTensor(info, alias_info, device, expr_evaluator_); + auto tensor = at::native::empty_strided_cuda( + info.shape_info.logical_sizes, + info.shape_info.logical_strides, + info.type, + c10::nullopt, + device, + c10::nullopt); + expr_evaluator_.bind(tv, tensor); } diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 0a172c38201..d0dc3becbb9 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -10,7 +10,9 @@ #include #include +#include #include +#include #include #include #include @@ -147,12 +149,6 @@ std::pair, std::vector> inferShape( const ExpressionEvaluator& expr_eval) { FUSER_PERF_SCOPE("fusion_executor::allocations::inferShape"); - // Allocate should be provided for intermediates. We just need to - // grab a chunk of memory of the size dicatated by - // Allocate::shape(). Fusion outputs do not come with Allocate and - // need to be allocated while taking expanded broadcasts into - // account. 
- std::vector concrete_sizes(symbolic_sizes.size(), 0); for (const auto i : c10::irange(symbolic_sizes.size())) { @@ -177,25 +173,7 @@ std::pair, std::vector> inferShape( } } // namespace -std::pair, std::vector> inferShapeOfIntermediate( - const TensorView* tv, - const kir::Allocate* alloc, - ExpressionEvaluator& expr_eval) { - FUSER_PERF_SCOPE("fusion_executor::allocations::inferShapeOfIntermediate"); - // The allocation domain represents the logical allocation domain, - // bu its actual allocation size may be different, e.g., for - // supporting halo accesses. The actual size is currently computed - // when creating the Allocate expr. - NVF_ERROR(alloc != nullptr); - const auto& symbolic_sizes = alloc->shape(); - // For intermediate tensors, we just need to allocate a memory chunk - // of the specified size. Broadcast expansion does not need to be considered. - const auto expand_flags = std::vector(symbolic_sizes.size(), false); - - return inferShape(tv, symbolic_sizes, expand_flags, expr_eval); -} - -bool fill_allocation_with_nan_ = false; +static bool fill_allocation_with_nan_ = false; bool shouldFillAllocationWithNan() { return fill_allocation_with_nan_; @@ -252,40 +230,23 @@ void fillTensorWithNan(at::Tensor& t) { } } -at::Tensor allocateTensor( - const GlobalBufferInfo& out_info, - const AliasInfo& alias_info, +KernelArgumentHolder allocateOutputs( + const Fusion* fusion, + const std::vector& output_infos, + const std::vector& output_alias_to_input_map, const c10::Device& device, - ExpressionEvaluator& ee) { - FUSER_PERF_SCOPE("fusion_executor::allocations::allocateTensor"); - // Handle a fusion with duplicated outputs. 
- TensorView* out_tv = out_info.tv; - if (ee.isKnown(out_tv)) { - return ee.evaluate(out_tv).as(); - } - - std::optional aliased_io_tensor = std::nullopt; - Val* aliased_io = alias_info.aliased_io; - if (aliased_io != nullptr) { - NVF_ERROR( - aliased_io->isFusionInput() || aliased_io->isFusionOutput(), - aliased_io->toInlineString(), - " is expected to be a fusion input/output. `ee.evaluate` ", - "an intermediate tensor may involve GPU computation to materialize it ", - "to global memory."); - const PolymorphicValue& aliased_io_val = ee.evaluate(aliased_io); - NVF_ERROR( - aliased_io_val.is(), - "Alias io only supports tensor. Found ", - PolymorphicValue_functions::toString(aliased_io_val)); - aliased_io_tensor = aliased_io_val.as(); - } + const KernelArgumentHolder& args, + bool dynamic_evaluate) { + FUSER_PERF_SCOPE("fusion_executor::allocations::allocateOutputs"); - switch (alias_info.type) { - case AllocationType::New: { + KernelArgumentHolder out_tensors; + out_tensors.resize(output_infos.size()); + for (auto out_idx : c10::irange(output_infos.size())) { + auto out_info = output_infos.at(out_idx); + if (output_alias_to_input_map.at(out_idx) == -1) { auto alloc_tensor = at::native::empty_strided_cuda( - out_info.sizes, - out_info.strides, + out_info.shape_info.logical_sizes, + out_info.shape_info.logical_strides, out_info.type, c10::nullopt, device, @@ -293,106 +254,38 @@ at::Tensor allocateTensor( if (shouldFillAllocationWithNan()) { fillTensorWithNan(alloc_tensor); } - return alloc_tensor; - } - case AllocationType::ReuseBuffer: - // Unlike for `AllocationType::Evaluate`, don't use - // ExpressionEvaluator to compute the output tensor. This is because - // the output tensor may hold different data from the input, e.g., an - // updated running mean. `ExpressionEvaluator::evaluate(out_tv)` - // would trigger non-trivial host computation. 
- return aliased_io_tensor.value(); - case AllocationType::Evaluate: { - auto out_tensor = ee.evaluate(out_tv).as(); - if (aliased_io_tensor.has_value()) { - NVF_ERROR( - out_tensor.is_alias_of(aliased_io_tensor.value()), - "ExpressionEvaluator failed to evaluate ", - out_tv->toString(), - " as an alias of ", - aliased_io->toString()); - inferAndValidateAllocationSizesAndStrides(out_tensor, out_tv, ee); + out_tensors[out_idx] = alloc_tensor; + } else if ( + fusion->getOutputAlias(out_info.tv).type == + AllocationType::ReuseBuffer) { + const auto& inp = args[output_alias_to_input_map.at(out_idx)]; + NVF_ERROR(inp.is(), "Input is not a Tensor"); + out_tensors[out_idx] = inp; + } else if ( + fusion->getOutputAlias(out_info.tv).type == AllocationType::Evaluate) { + if (dynamic_evaluate) { + out_tensors[out_idx] = std::monostate(); + continue; } - return out_tensor; - } - default: - NVF_THROW("Unrecognized AllocationType."); - } -} - -KernelArgumentHolder allocateOutputs( - const Fusion* fusion, - const std::vector& output_info, - const c10::Device& device, - ExpressionEvaluator& ee) { - FUSER_PERF_SCOPE("fusion_executor::allocations::allocateOutputs"); - const auto num_outs = output_info.size(); - - // Sort the outputs so we compute aliases after allocating non-aliases. The - // order between aliases can be arbitrary. E.g., - // - // ``` - // non_alias_out = ... - // alias_out_0 = reshape(non_alias_out, ...) - // alias_out_1 = reshape(alias_out_0, ...) - // ``` - // - // It's fine to compute `alias_out_1` before computing `alias_out_0`: when we - // compute `alias_out_1`, `alias_out_0` will be recursively - // `ExpressionEvaluator::evaluate`ed. However, `non_alias_out` must be - // allocated first so `alias_out_*` can refer them. 
- std::vector> sorted_outs; - sorted_outs.reserve(num_outs); - for (const auto out_index : c10::irange(num_outs)) { - sorted_outs.emplace_back(out_index, fusion->outputs()[out_index]); - } - std::sort( - sorted_outs.begin(), - sorted_outs.end(), - [fusion]( - const std::pair& lhs, - const std::pair& rhs) { - return ( - fusion->getOutputAlias(lhs.second).type == AllocationType::New && - fusion->getOutputAlias(rhs.second).type != AllocationType::New); - }); - - std::vector out_tensors(num_outs); - for (const auto& [out_index, out] : sorted_outs) { - at::Tensor out_tensor = allocateTensor( - output_info[out_index], fusion->getOutputAlias(out), device, ee); - // Bind `out_tensor` so - // 1. duplicated outputs map to the same tensor, - // 2. an output that aliases another output can be evaluated via - // ExpressionEvaluator cheaply. - ee.bind(out, out_tensor); - out_tensors[out_index] = out_tensor; + ExpressionEvaluator ee; + ee.bind( + fusion->getOutputAlias(out_info.tv).aliased_io, + args[output_alias_to_input_map.at(out_idx)]); + out_tensors[out_idx] = ee.evaluate(out_info.tv); + } else { + NVF_THROW( + "Unexpected allocation path, internal logic around allocations must be incorrect."); + } } - return KernelArgumentHolder(out_tensors); + return out_tensors; } -namespace { -GlobalBufferInfo getBufferInfo( - ExpressionEvaluator& expr_eval, - DataType index_dtype, - TensorView* tv) { - FUSER_PERF_SCOPE("fusion_executor::allocations::getBufferInfo"); - GlobalBufferInfo info; - info.tv = tv; - std::tie(info.sizes, info.strides) = inferShapeOfOutput(info.tv, expr_eval); - auto dtype = - (info.tv->dtype() == DataType::Index ? 
index_dtype : info.tv->dtype()); - info.type = data_type_to_aten(dtype); - return info; -} - -} // namespace std::vector getBufferInfos( ExpressionEvaluator& expr_eval, DataType index_dtype, const std::vector& fusion_outputs) { - FUSER_PERF_SCOPE("fusion_executor::allocations::getOutbufferInfo"); + FUSER_PERF_SCOPE("fusion_executor::allocations::getBufferInfos"); std::vector output_buffer_infos; output_buffer_infos.reserve(fusion_outputs.size()); for (const auto out : fusion_outputs) { @@ -400,8 +293,14 @@ std::vector getBufferInfos( out->isA(), "Cannot allocate outputs that are not tensors."); - output_buffer_infos.emplace_back( - getBufferInfo(expr_eval, index_dtype, out->as())); + GlobalBufferInfo info; + info.tv = out->as(); + info.shape_info = inferTensorShapes(info.tv, expr_eval); + auto dtype = + (info.tv->dtype() == DataType::Index ? index_dtype : info.tv->dtype()); + info.type = data_type_to_aten(dtype); + + output_buffer_infos.emplace_back(info); } return output_buffer_infos; } @@ -711,16 +610,9 @@ at::Tensor transformFromAllocationToLogical( return tensor.permute(dims); } -} // namespace - -std::pair, std::vector> inferShapeOfOutput( +std::pair, std::vector> inferAllocationShape( TensorView* tv, const ExpressionEvaluator& expr_eval) { - FUSER_PERF_SCOPE("fusion_executor::allocations::inferShapeOfOutput"); - // Fusion outputs do not come with Allocate and - // need to be allocated while taking expanded broadcasts into - // account. 
- std::vector symbolic_sizes; std::vector expand_flags; @@ -745,8 +637,20 @@ std::pair, std::vector> inferShapeOfOutput( expand_flags.push_back(false); } } + return inferShape(tv, symbolic_sizes, expand_flags, expr_eval); +} - auto size_stride = inferShape(tv, symbolic_sizes, expand_flags, expr_eval); +} // namespace + +std::pair, std::vector> inferShapeOfOutput( + TensorView* tv, + const ExpressionEvaluator& expr_eval) { + FUSER_PERF_SCOPE("fusion_executor::allocations::inferShapeOfOutput"); + // Fusion outputs do not come with Allocate and + // need to be allocated while taking expanded broadcasts into + // account. + + auto size_stride = inferAllocationShape(tv, expr_eval); if (!tv->hasAllocation()) { return size_stride; } @@ -761,4 +665,62 @@ std::pair, std::vector> inferShapeOfOutput( return {meta_tensor.sizes().vec(), meta_tensor.strides().vec()}; } +TensorShapeInfo inferTensorShapes( + TensorView* tv, + const ExpressionEvaluator& expr_eval) { + // Alias handling: + auto alias_info = tv->fusion()->getOutputAlias(tv); + if (alias_info.type == AllocationType::Evaluate) { + auto val = expr_eval.evaluate(tv); + NVF_ERROR(val.is(), "Output is not a Tensor"); + auto tensor = val.as(); + + if (!tv->hasAllocation()) { + return TensorShapeInfo{ + tensor.sizes().vec(), + tensor.strides().vec(), + isSharded(tv) ? unshardedSizes(tv, tensor.sizes().vec()) + : std::vector(), + }; + } + auto allocation_size_stride = + inferAndValidateAllocationSizesAndStrides(tensor, tv, expr_eval); + return TensorShapeInfo{ + tensor.sizes().vec(), + tensor.strides().vec(), + isSharded(tv) ? unshardedSizes(tv, tensor.sizes().vec()) + : std::vector(), + allocation_size_stride.first, + allocation_size_stride.second}; + } + + // Non-alias handling: + auto allocation_size_stride = inferAllocationShape(tv, expr_eval); + if (!tv->hasAllocation()) { + return TensorShapeInfo{ + allocation_size_stride.first, + allocation_size_stride.second, + isSharded(tv) ? 
unshardedSizes(tv, allocation_size_stride.first) + : std::vector(), + }; + } + + auto options = + c10::TensorOptions().device(c10::Device(c10::DeviceType::Meta)); + auto logical_meta_tensor = at::empty_strided( + allocation_size_stride.first, allocation_size_stride.second, options); + // TODO(jiej): we should refactor it here, there's no need to use + // logical_meta_tensor at all, size + stride should be used directly in the + // `transformFromAllocationToLogical` + logical_meta_tensor = + transformFromAllocationToLogical(logical_meta_tensor, tv, expr_eval); + return TensorShapeInfo{ + logical_meta_tensor.sizes().vec(), + logical_meta_tensor.strides().vec(), + isSharded(tv) ? unshardedSizes(tv, logical_meta_tensor.sizes().vec()) + : std::vector(), + allocation_size_stride.first, + allocation_size_stride.second}; +} + } // namespace nvfuser diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 9097871d712..40dda0eb842 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -15,10 +15,24 @@ namespace nvfuser { +struct KernelExecutorEntry; + +// If not sharded unsharded_logical_sizes is empty. +// If no allocation domain is found, allocation_sizes and allocation_strides +// are empty. +// For intermediate tensors, logical_sizes and logical_strides are used only, +// the rest are empty. +struct TensorShapeInfo { + std::vector logical_sizes; + std::vector logical_strides; + std::vector unsharded_logical_sizes; + std::vector allocation_sizes; + std::vector allocation_strides; +}; + struct GlobalBufferInfo { TensorView* tv = nullptr; - std::vector sizes; - std::vector strides; + TensorShapeInfo shape_info; at::ScalarType type = at::ScalarType::Undefined; bool zero_init = false; bool resets_to_zero = false; @@ -43,13 +57,6 @@ int64_t computeSharedMemory( DataType index_type, int64_t smem_offset = 0); -// Infer the shape of an intemediate tensor using kir::Allocate. 
This -// is not ideal but still necessary when tensors are expanded with halo -std::pair, std::vector> inferShapeOfIntermediate( - const TensorView* tv, - const kir::Allocate* alloc, - ExpressionEvaluator& expr_eval); - bool shouldFillAllocationWithNan(); NVF_API void setFillAllocationWithNan(bool value); @@ -61,20 +68,23 @@ std::pair, std::vector> inferShapeOfOutput( TensorView* tv, const ExpressionEvaluator& expr_eval); -// Allocate an `at::Tensor` for `out_info` or compute it as an alias. -at::Tensor allocateTensor( - const GlobalBufferInfo& out_info, - const AliasInfo& alias_info, - const c10::Device& device, - ExpressionEvaluator& ee); +// Infer the sizes and strides of an output tensor +TensorShapeInfo inferTensorShapes( + TensorView* tv, + const ExpressionEvaluator& expr_eval); // Allocate output tensors for a given fusion. Outputs may alias inputs, in -// that case output tensors are shallow copies of the aliased inputs +// that case output tensors are shallow copies of the aliased inputs. +// +// If dynamic_evaluate is true, then any argument with AllocationType::Evaluate +// will not be populated, it will be filled with std::monostate. KernelArgumentHolder allocateOutputs( const Fusion* fusion, - const std::vector& output_info, + const std::vector& output_infos, + const std::vector& output_alias_to_input_map, const c10::Device& device, - ExpressionEvaluator& ee); + const KernelArgumentHolder& args, + bool dynamic_evaluate = false); //! Return information necessary for allocating output tensors. Input //! 
and output tensors are allowed to alias each other, which is diff --git a/csrc/runtime/compiled_kernel.cpp b/csrc/runtime/compiled_kernel.cpp index 8b4f37ed3ca..e2aaf18069b 100644 --- a/csrc/runtime/compiled_kernel.cpp +++ b/csrc/runtime/compiled_kernel.cpp @@ -1416,22 +1416,15 @@ float RtcKernel::run( std::vector pointers; for (const auto& input : args) { - const auto& input_tensor = input.as(); - auto dtype = std::get( - aten_to_data_type(input_tensor.scalar_type()).type); - DataType metadata_type = globalTensorMetaData(dtype, input_tensor.dim()); - - std::shared_ptr struct_ = std::make_shared(); - TensorMetaData* metadata = (TensorMetaData*)struct_.get(); - metadata->dtype = dtype; - metadata->data = input_tensor.data_ptr(); - metadata->logical_size = input_tensor.sizes(); - metadata->logical_stride = input_tensor.strides(); - metadata->alloc_size = input_tensor.sizes(); - metadata->alloc_stride = input_tensor.strides(); - - data.emplace_back(polymorphicValueToBytes( - PolymorphicValue(std::move(struct_)), metadata_type, index_type)); + NVF_ERROR( + input.is() && input.as().is_cuda(), + "Only CUDA tensors are supported for direct nvRTC launches at this time."); + auto input_tensor = input.as(); + data.emplace_back(tensorToBytes( + input_tensor, + input_tensor.sizes().vec(), + input_tensor.strides().vec(), + index_type)); pointers.emplace_back(data.back().data()); } diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 57fd91224e8..801aa8a92c5 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -282,6 +283,46 @@ void KernelExecutor::compile( if (isProfilerEnabled()) { FusionProfiler::segment(group_id_).stopCompile(); } + + for (auto expr : exprs) { + if (ir_utils::isCpAsyncBulk(expr)) { + has_tma_ = true; + } + if (expr->isA()) { + has_rng_ = true; + } + } + + // If an output has an alias to an input and is marked Evaluate, then + // 
expression evaluator evaluate is called on that output to produce the meta + // data manipulation it requires. If that manipulation is something like a + // slice, and that slice has a symbolic integer it depends on, then this + // function returns true. + // + // This could happen for other examples and has_dynamic_alias_ will be true if + // to evaluate the output that has an alias, other values besides the aliased + // input need to be bound to the expression evaluator to evaluate the output. + for (auto output : fusion->outputs()) { + if (output->isA()) { + auto out_tv = output->as(); + auto alias_info = fusion->getOutputAlias(out_tv); + if (alias_info.type != AllocationType::Evaluate) { + continue; + } + auto aliased_to = alias_info.aliased_io->as(); + auto inputs = InputsOf::output(out_tv); + for (auto input : inputs) { + if (input->isA() && input->sameAs(aliased_to)) { + continue; + } + + if (input->isConst()) { + continue; + } + has_dynamic_alias_ = true; + } + } + } } LaunchParams KernelExecutor::computeLaunchParams( @@ -330,7 +371,6 @@ LaunchParams KernelExecutor::computeLaunchParams( parallel_iter_extents, launch_constraints); expr_eval.precomputedValues()->evaluate(); } - // If any dimension was set in launch constraints we need to run through // IterDomains that have been parallelized, and bind those values. Or make // sure if they could be inferred the inference matches what was set. @@ -470,20 +510,15 @@ std::vector KernelExecutor::getIntermediateBufferInfo( info.zero_init = alloc->zeroInit(); info.resets_to_zero = alloc->resetsToZero(); // TODO: Allocation size needs to consider both expanded domains - // as well as halo. Currently, allocation of tensors with halo is - // only supported by inferShapeOfIntermediate, whereas expanded - // domains are only supported by inferShapeOfOutput. Until the - // halo support is revisited, use the former for all tensors - // unless expanded and the latter otherwise. 
This assumes there's - // no expanded domains with halo, which is fine for now. - const auto has_expanded_domains = std::any_of( - tv->getMaybeAllocationDomain().begin(), - tv->getMaybeAllocationDomain().end(), - [](IterDomain* id) { return id->hasExpandedExtent(); }); - std::tie(info.sizes, info.strides) = has_expanded_domains - ? inferShapeOfOutput(tv, expr_eval) - : inferShapeOfIntermediate(tv, alloc, expr_eval); - auto dtype = (tv->dtype() == DataType::Index ? index_type : tv->dtype()); + // as well as halo. Currently, halo support has been removed so we only need + // to worry about the expand case which is handled in inferShapeOfOutput. + // There used to also be an inferShapeOfIntermediate function before this + // commit, but that was safely removed with halo support. This will need to + // be revisited when halo support is added again. + auto [sizes, strides] = inferShapeOfOutput(tv, expr_eval); + info.shape_info.logical_sizes = sizes; + info.shape_info.logical_strides = strides; + auto dtype = tv->dtype() == DataType::Index ? 
index_type : tv->dtype(); info.type = data_type_to_aten(dtype); // Remember the tensor buffer used for storing kernel profile @@ -609,7 +644,7 @@ void dumpKernelArgs( } // namespace void KernelExecutor::initializeExecutorEntry( - ExecutorEntry& executor_entry, + KernelExecutorEntry& executor_entry, const KernelArgumentHolder& args, const LaunchParams& launch_constraints, const CompileParams& compile_params, @@ -617,9 +652,8 @@ void KernelExecutor::initializeExecutorEntry( DataType index_type) { FUSER_PERF_SCOPE("KernelExecutor::initializeExecutorEntry"); - ExpressionEvaluator expr_eval; - evaluatorPrecomputedValues()->bindInputs(args); - expr_eval.precomputedValues() = evaluatorPrecomputedValues().get(); + ExpressionEvaluator expr_eval = + executor_utils::bindInputs(args, compiled_kernel_->kernel()); auto launch_params = computeLaunchParams( launch_constraints, expr_eval, warp_size_, index_type); @@ -651,6 +685,48 @@ void KernelExecutor::initializeExecutorEntry( "Expected blockDim.x >= 32 but found ", launch_params.bdimx()); + std::vector input_info; + NVF_ERROR( + compiled_kernel_->kernel()->inputs().size() == args.size(), + "Input size mismatch, expected: ", + compiled_kernel_->kernel()->inputs().size(), + " got: ", + args.size()); + for (auto inp_idx : + c10::irange(compiled_kernel_->kernel()->inputs().size())) { + auto input = compiled_kernel_->kernel()->inputs()[inp_idx]; + if (auto input_tv = dynamic_cast(input)) { + auto at_tensor = args[inp_idx].as(); + + std::vector alloc_sizes; + std::vector alloc_strides; + if (input_tv->hasAllocation()) { + std::tie(alloc_sizes, alloc_strides) = + inferAndValidateAllocationSizesAndStrides( + at_tensor, input_tv, expr_eval); + } + + TensorShapeInfo shape_info; + shape_info.logical_sizes = args[inp_idx].as().sizes().vec(); + shape_info.logical_strides = + args[inp_idx].as().strides().vec(); + if (isSharded(input_tv)) { + shape_info.unsharded_logical_sizes = + unshardedSizes(input_tv, shape_info.logical_sizes); + } + 
shape_info.allocation_sizes = alloc_sizes; + shape_info.allocation_strides = alloc_strides; + GlobalBufferInfo info{ + input_tv, + shape_info, + data_type_to_aten(input_tv->dtype()), + false, + false, + false}; + input_info.emplace_back(info); + } + } + std::vector output_info; if (output_args.empty()) { @@ -658,155 +734,119 @@ void KernelExecutor::initializeExecutorEntry( expr_eval, index_type, compiled_kernel_->kernel()->outputs()); } else { // Need to save the information necessary for allocations as - // future uses of this ExecutorEntry may not be provided with + // future uses of this KernelExecutorEntry may not be provided with // allocated outputs - for (const auto& output : output_args) { - const auto& out_tensor = output.as(); - output_info.emplace_back(GlobalBufferInfo{ - .sizes = out_tensor.sizes().vec(), - .strides = out_tensor.strides().vec(), - .type = out_tensor.scalar_type()}); + for (auto output_idx : c10::irange(output_args.size())) { + NVF_ERROR( + output_args[output_idx].hasValue() && + output_args[output_idx].is(), + "Output is not populated or not a Tensor"); + const auto& output_tensor = output_args[output_idx].as(); + GlobalBufferInfo info; + info.type = output_tensor.scalar_type(); + auto out_val = compiled_kernel_->kernel()->outputs()[output_idx]; + NVF_ERROR(out_val->isA(), "Output is not a TensorView"); + info.tv = out_val->as(); + NVF_ERROR( + !info.tv->hasAllocation(), + "Accepting allocated outputs is not currently supported with allocation domain. ", + "Allocation domain found for tv: ", + info.tv->toString()); + info.shape_info.logical_sizes = output_tensor.sizes().vec(); + info.shape_info.logical_strides = output_tensor.strides().vec(); + output_info.emplace_back(info); } } auto intermediates = getIntermediateBufferInfo(expr_eval, index_type); - // All information is gathered. Save it to ExecutorEntry + // All information is gathered. 
Save it to KernelExecutorEntry executor_entry.launch_params = launch_params; executor_entry.outputs = output_info; + executor_entry.output_aliased_to_input = + executor_utils::getOutputAliasToInputMap(compiled_kernel_->kernel()); executor_entry.intermediates = intermediates; + executor_entry.inputs = input_info; executor_entry.init = true; } -/// Copies the data, logical_size, and alloc_stride parameters to the -/// appropriate parts of entry.args[idx]. -/// -/// For GPU tensors, we pass a Tensor struct (see -/// runtime/tensor.cu), where the rank describes the number of elements in the -/// shape and stride arrays. The actual shapes and strides are dynamic, but the -/// type and rank of the tensors are actually static (changing them would need -/// a new FusionDefinition). So we create the storage area for the -/// Tensor during ::computeArgs, and then in this function we just -/// update that memory with the current values for the tensor's base address, -/// shape, and strides. -/// -/// @param entry the entry we have previously setup for this fusion -/// @param idx the index into entry.args and related parallel arrays in the -/// entry. -/// @param idx_type_size generally sizeof(int32_t) or sizeof(int64_t); used for -/// computing how large the arrays to copy are. -static void fillTensorArgMetadata( - KernelExecutor::ExecutorEntry& entry, - const PolymorphicValue& tensor_metadata, - size_t idx, - size_t idx_type_size) { - void* data = tensor_metadata->*&TensorMetaData::data; - // g++ has trouble inferring the types of more complicated fields through our - // *& operators. Creating an `auto` alias as a temporary resolves this - // problem. 
-#define TMD_ARRAY_REF(pv, field) \ - ({ \ - const auto& fld_tmp_ = pv->*&field; \ - const c10::IntArrayRef& fld_aref_ = fld_tmp_; \ - fld_aref_; \ - }) - const c10::IntArrayRef& shape = - TMD_ARRAY_REF(tensor_metadata, TensorMetaData::logical_size); - const c10::IntArrayRef& strides = - TMD_ARRAY_REF(tensor_metadata, TensorMetaData::alloc_stride); -#undef TMD_ARRAY_REF - - // These are the three offsets we need to copy into. - std::array offsets = { - entry.args[idx].data(), // data ptr - entry.args[idx].data() + sizeof(void*), // shape array - // strides array: - entry.args[idx].data() + sizeof(void*) + shape.size() * idx_type_size, - }; - - memcpy(offsets[0], &data, sizeof(void*)); - switch (idx_type_size) { - case sizeof(int64_t): { - // we use i64's for our sizes, so can use a simple copy here - memcpy(offsets[1], shape.data(), shape.size() * sizeof(int64_t)); - memcpy(offsets[2], strides.data(), strides.size() * sizeof(int64_t)); - } break; - case sizeof(int32_t): { - // we need to cast per-element, so need a loop. - // This case happens when the kernel uses 32bit indices. Since we - // (specifically TensorMetaData) store indices in 64bit, we can't - // directly copy our buffer into the args buffer. We thus have to - // manually downcast each element to fit in the smaller buffer. 
- for (size_t i = 0; i < shape.size(); ++i) { - const int32_t shp = static_cast(shape[i]); - memcpy(offsets[1] + i * sizeof(int32_t), &shp, sizeof(int32_t)); - } - // In rare cases we have fewer strides than shapes - for (size_t i = 0; i < strides.size(); ++i) { - const int32_t strd = static_cast(strides[i]); - memcpy(offsets[2] + i * sizeof(int32_t), &strd, sizeof(int32_t)); - } - } break; - default: - NVF_CHECK(0, "Unhandled index type size"); - break; +namespace { +GlobalBufferInfo& linear_buffer_info_getter( + KernelExecutorEntry& entry, + size_t idx) { + if (idx < entry.inputs.size()) { + return entry.inputs[idx]; + } else if (idx < entry.inputs.size() + entry.outputs.size()) { + return entry.outputs[idx - entry.inputs.size()]; + } else if ( + idx < + entry.inputs.size() + entry.outputs.size() + entry.intermediates.size()) { + return entry + .intermediates[idx - entry.inputs.size() - entry.outputs.size()]; + } else { + NVF_CHECK( + 0, + "Invalid buffer index: ", + idx, + " input size: ", + entry.inputs.size(), + " output size: ", + entry.outputs.size(), + " intermediate size: ", + entry.intermediates.size()); } -} +}; +} // namespace -// set the arguments that we'll pass to cuLaunchKernel. This should happen -// when we change the rank of a tensor or the number of arguments to a kernel. -// It does not need to happen when only shapes change---use recomputeArgs for -// that. 
void KernelExecutor::computeArgs( - ExecutorEntry& entry, - ExpressionEvaluator& expr_eval, - const kir::Kernel* kernel) const { + KernelExecutorEntry& entry, + const KernelArgumentHolder& args) const { FUSER_PERF_SCOPE("KernelExecutor::computeArgs"); + if (entry.args.size() != args.size()) { + entry.args.resize(args.size()); + entry.arg_ptrs.resize(args.size()); + } - const std::vector& params = kernel->parameters(); - entry.args.resize(params.size()); - entry.arg_ptrs.resize(params.size()); - const PrimDataType idx_type = kernel->indexType(); - for (size_t p = 0; p < params.size(); ++p) { - entry.args[p] = getKernelArgument(expr_eval, params[p], idx_type); - entry.arg_ptrs[p] = entry.args[p].data(); + NVF_ERROR( + args.size() == compiled_kernel_->kernel()->parameters().size(), + "Argument size mismatch, expected: ", + compiled_kernel_->kernel()->parameters().size(), + " got: ", + args.size()); + + for (auto inp : compiled_kernel_->kernel()->inputs()) { + if (!inp->isA()) { + continue; + } } -} -// Reset the arguments that we'll pass to cuLaunchKernel. This needs to be -// invoked on every shape change. -void KernelExecutor::recomputeArgs( - ExecutorEntry& entry, - ExpressionEvaluator& expr_eval, - const kir::Kernel* kernel) const { - FUSER_PERF_SCOPE("KernelExecutor::recomputeArgs"); - // assert(entry.init && "entry was never initialized"); - - const std::vector& params = kernel->parameters(); - const PrimDataType idx_type = kernel->indexType(); - // assert(entry.args.size() == params.size()); - // assert(entry.arg_ptrs.size() == params.size()); - // assert(params.size() >= args.size()); - for (size_t p = 0; p < params.size(); ++p) { - PolymorphicValue pv = expr_eval.evaluate(params[p]); - if (pv.is() && pv.as().is_cuda()) { - // GPU tensors are not passed directly: instead we pass a Tensor struct. 
The pointer and dimensions are dynamic, but the - // types and ranks are actually static (changing the rank or the types - // would need to be done via a new FusionDefinition). As such, we created - // the Tensor struct during ::computeArgs, and here we just fill - // in the base address, shape, and stride arrays to cover whatever new - // tensors we got this round. - TensorView* mtv = dynamic_cast(params[p]); - const Val* mdexpr = IrBuilder::metadataExpr(mtv); - const PolymorphicValue& tmd = expr_eval.evaluate(mdexpr); - const size_t idx_type_size = - PrimDataType::Int == idx_type ? sizeof(int64_t) : sizeof(int32_t); - fillTensorArgMetadata(entry, tmd, p, idx_type_size); + const PrimDataType idx_type = compiled_kernel_->kernel()->indexType(); + int64_t buffer_info_idx = 0; + for (size_t arg_idx = 0; arg_idx < args.size(); ++arg_idx) { + if (args[arg_idx].is() && + args[arg_idx].as().is_cuda()) { + const auto& buffer_info = + linear_buffer_info_getter(entry, buffer_info_idx++); + entry.args[arg_idx] = tensorToBytes( + args[arg_idx], + buffer_info.shape_info.logical_sizes, + buffer_info.shape_info.allocation_strides.empty() + ? 
buffer_info.shape_info.logical_strides + : buffer_info.shape_info.allocation_strides, + idx_type, + buffer_info.shape_info.unsharded_logical_sizes); + entry.arg_ptrs[arg_idx] = entry.args[arg_idx].data(); } else { - entry.args[p] = getKernelArgument(expr_eval, params[p], idx_type); + if (args[arg_idx].is()) { + buffer_info_idx++; + } + auto bytes = polymorphicValueToBytes( + args[arg_idx], + compiled_kernel_->kernel()->parameters()[arg_idx]->dtype(), + idx_type); + entry.args[arg_idx] = bytes; + entry.arg_ptrs[arg_idx] = entry.args[arg_idx].data(); } - entry.arg_ptrs[p] = entry.args[p].data(); } } @@ -885,6 +925,62 @@ void KernelExecutor::resetCompiledKernelProperties() { static_smem_size_.reset(); } +namespace { +KernelArgumentHolder resolveRNGSeed( + const kir::Kernel* kernel, + KernelArgumentHolder& args) { + ExpressionEvaluator expr_eval; + KernelArgumentHolder resolved_args; + resolved_args.reserve(args.size()); + int64_t arg_idx = 0; + for (auto param : kernel->parameters()) { + if (param->definition() && + param->definition()->isA()) { + resolved_args.push(expr_eval.evaluate(param)); + } else { + resolved_args.push(args[arg_idx++]); + } + } + return resolved_args; +} +} // namespace + +// TODO: Reduce bindings to only those necessary to resolve missing params. +// TODO: Check if this could be reused to also resolve dynamic aliases. 
+KernelArgumentHolder KernelExecutor::resolveTMA( + KernelExecutorEntry& entry, + const KernelArgumentHolder& args) const { + ExpressionEvaluator expr_eval; + int64_t arg_idx = 0; + NVF_ERROR( + entry.inputs.size() == compiled_kernel_->kernel()->inputs().size(), + "Input size mismatch"); + for (auto inp_idx : c10::irange(entry.inputs.size())) { + expr_eval.bind( + compiled_kernel_->kernel()->inputs()[inp_idx], args[arg_idx++]); + } + + NVF_ERROR( + entry.outputs.size() == compiled_kernel_->kernel()->outputs().size(), + "Output size mismatch"); + for (auto out_idx : c10::irange(entry.outputs.size())) { + expr_eval.bind( + compiled_kernel_->kernel()->outputs()[out_idx], args[arg_idx++]); + } + + for (const auto& intermediate_entry : entry.intermediates) { + if (args[arg_idx].hasValue()) { + expr_eval.bind(intermediate_entry.tv, args[arg_idx++]); + } + } + + KernelArgumentHolder resolved_args; + for (auto param : compiled_kernel_->kernel()->parameters()) { + resolved_args.push(expr_eval.evaluate(param)); + } + return resolved_args; +} + KernelArgumentHolder KernelExecutor::run( KernelArgumentHolder args, KernelArgumentHolder output_args, @@ -930,9 +1026,9 @@ KernelArgumentHolder KernelExecutor::run( NVF_ERROR(compiled_kernel_->lowered()); // Placeholder for the case where parameter cache is not used - ExecutorEntry temporary_executor_entry; + KernelExecutorEntry temporary_executor_entry; - ExecutorEntry* executor_entry = args.getCacheId().has_value() && + KernelExecutorEntry* executor_entry = args.getCacheId().has_value() && !compiled_kernel_->disablePaarameterCache() ? 
&executor_entry_lookup_[*args.getCacheId()] : &temporary_executor_entry; @@ -962,49 +1058,72 @@ KernelArgumentHolder KernelExecutor::run( // context manager to disable auto grad for `empty_cuda` calls later at::AutoDispatchBelowADInplaceOrView non_variable_type_mode; - // Bind fusion inputs - auto expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); - // only allocate outputs when not given if (output_args.empty()) { output_args = allocateOutputs( - compiled_kernel_->kernel()->as(), + compiled_kernel_->kernel(), executor_entry->outputs, + executor_entry->output_aliased_to_input, compiled_kernel_->device(), - expr_eval); - } - args.push(output_args); + args, + has_dynamic_alias_); + if (has_dynamic_alias_) { + ExpressionEvaluator expr_eval; + if (has_dynamic_alias_ || has_tma_) { + expr_eval = + executor_utils::bindInputs(args, compiled_kernel_->kernel()); + } - for (const auto i : c10::irange(output_args.size())) { - auto output = compiled_kernel_->kernel()->outputs()[i]; - if (std::any_of( - compiled_kernel_->kernel()->inputs().begin(), - compiled_kernel_->kernel()->inputs().end(), - [&](const auto& in) { return in == output; })) { - // Skip trivially forwarded outputs because they are just placeholders - continue; + for (const auto i : + c10::irange(compiled_kernel_->kernel()->outputs().size())) { + auto param = compiled_kernel_->kernel()->outputs()[i]; + if (!param->isA()) { + continue; + } + if (compiled_kernel_->kernel() + ->getOutputAlias(param->as()) + .type == AllocationType::Evaluate) { + output_args[i] = expr_eval.evaluate(param); + } + } } - expr_eval.bind( - output, args[compiled_kernel_->kernel()->inputs().size() + i]); + NVF_ERROR( + std::all_of( + output_args.begin(), + output_args.end(), + [](const PolymorphicValue& arg) { + return arg.hasValue() && arg.is(); + }), + "Output is not populated or not a Tensor"); } + args.push(output_args); + KernelArgumentHolder intermediate_args; at::Tensor profile_buffer; { 
FUSER_PERF_SCOPE("KernelExecutor::runFusion::intermediates"); + // Intermediates just use logical sizes and strides even though they're + // really allocation sizes and strides. + // + // This is simply because the convention used is that allocation + // sizes/strides are optional, logical are not. for (const auto intermediate_i : c10::irange(executor_entry->intermediates.size())) { const auto& buf_info = executor_entry->intermediates.at(intermediate_i); bool has_expansion = false; std::vector unexpanded_sizes; - unexpanded_sizes.reserve(buf_info.sizes.size()); - NVF_ERROR(buf_info.sizes.size() == buf_info.strides.size()) - for (const auto dim_i : c10::irange(buf_info.sizes.size())) { - if (buf_info.strides[dim_i] == 0) { + unexpanded_sizes.reserve(buf_info.shape_info.logical_sizes.size()); + NVF_ERROR( + buf_info.shape_info.logical_sizes.size() == + buf_info.shape_info.logical_strides.size()) + for (const auto j : + c10::irange(buf_info.shape_info.logical_sizes.size())) { + if (buf_info.shape_info.logical_strides[j] == 0) { has_expansion = true; unexpanded_sizes.push_back(1L); } else { - unexpanded_sizes.push_back(buf_info.sizes[dim_i]); + unexpanded_sizes.push_back(buf_info.shape_info.logical_sizes[j]); } } at::Tensor intermediate_buffer; @@ -1035,29 +1154,40 @@ KernelArgumentHolder KernelExecutor::run( } } if (has_expansion) { - intermediate_buffer = - at::native::expand(intermediate_buffer, buf_info.sizes); + intermediate_buffer = at::native::expand( + intermediate_buffer, buf_info.shape_info.logical_sizes); } args.push(intermediate_buffer); intermediate_args.push(intermediate_buffer); - expr_eval.bind( - compiled_kernel_->kernel() - ->summary() - .global_allocations.at(intermediate_i) - ->buffer(), - args - [compiled_kernel_->kernel()->inputs().size() + - output_args.size() + intermediate_i]); if (buf_info.is_profile_buffer) { profile_buffer = intermediate_buffer; } } } - if (executor_entry->args.empty()) { - computeArgs(*executor_entry, expr_eval, 
compiled_kernel_->kernel()); + if (args.size() != compiled_kernel_->kernel()->parameters().size()) { + NVF_ERROR( + has_tma_ || has_rng_, + "No TMA or RNG found in the kernel, but detected an argument size mismatch."); + // If args don't match one of two things is happening. We need to add TMA + // related args or RNG related args. Resolve these scenarios. + if (has_tma_) { + // Resolving TMA requires binding all values and evaluating the TMA + // arguments + // + // Resolving TMA also resolves RNG, so if TMA exists the resolveRNGSeed + // function shouldn't also be called. + args = resolveTMA(*executor_entry, args); + } else if (has_rng_) { + // Resolving RNG seed requires evaluating and adding those values, but + // doesn't require binding all values as getting RNG seed and offset + // doesn't depend on other values + args = resolveRNGSeed(compiled_kernel_->kernel(), args); + } } + computeArgs(*executor_entry, args); + if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { launch_params_.print(); } @@ -1082,8 +1212,6 @@ KernelArgumentHolder KernelExecutor::run( FUSER_PERF_SCOPE("KernelExecutor::runFusion::execute_kernel"); ensureAvailableDynamicSmemSize(executor_entry->launch_params.smem()); - recomputeArgs(*executor_entry, expr_eval, compiled_kernel_->kernel()); - if (isDebugDumpEnabled(DebugDumpOption::Occupancy) || isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { int blocks_per_sm = -1; @@ -1161,7 +1289,7 @@ KernelArgumentHolder KernelExecutor::run( flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder) const { // See table definition for KernelExecutor in serde/fusion_cache.fbs - using fb_executor_entry = flatbuffers::Offset; + using fb_executor_entry = flatbuffers::Offset; // Separate unordered_map for executor_entry_lookup into key and value // vectors. The key value is the cache_id value in the KernelArgumentHolder. 
@@ -1193,7 +1321,10 @@ flatbuffers::Offset KernelExecutor::serialize( &executor_entry_lookup_keys_fb, &executor_entry_lookup_values_fb, toUnderlying(compiledKernel()->kernel()->indexType()), - serialize(builder, compiledKernel()->cudaExecutable().get())); + serialize(builder, compiledKernel()->cudaExecutable().get()), + has_rng_, + has_tma_, + has_dynamic_alias_); } flatbuffers::Offset KernelExecutor::serialize( @@ -1242,10 +1373,10 @@ flatbuffers::Offset KernelExecutor::serialize( return ckb.Finish(); } -flatbuffers::Offset KernelExecutor::serialize( +flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder, - const ExecutorEntry& data) const { - // See table definition for ExecutorEntry in serde/fusion_cache.fbs + const KernelExecutorEntry& data) const { + // See table definition for KernelExecutorEntry in serde/fusion_cache.fbs // Serialize GlobalBufferInfo for outputs. // We map the output TensorView pointer to its corresponding position in @@ -1262,8 +1393,14 @@ flatbuffers::Offset KernelExecutor::serialize( ? -1 : std::distance( compiledKernel()->kernel()->outputs().cbegin(), tv_iter); - outputs_fb.push_back( - serialize(builder, buffer, tv_position, true /* is_fusion_output */)); + NVF_ERROR( + tv_position != -1, "Output TensorView not found in kernel outputs"); + outputs_fb.push_back(serialize( + builder, + buffer, + tv_position, + true /* is_fusion_output */, + false /* is_fusion_input */)); } // Serialize GlobalBufferInfo for intermediates. 
@@ -1287,34 +1424,66 @@ flatbuffers::Offset KernelExecutor::serialize( : std::distance( compiledKernel()->kernel()->summary().global_allocations.cbegin(), tv_iter); - intermediates_fb.push_back( - serialize(builder, buffer, tv_position, false /* is_fusion_output */)); + NVF_ERROR( + tv_position != -1, + "Intermediate TensorView not found in kernel global allocations"); + intermediates_fb.push_back(serialize( + builder, + buffer, + tv_position, + false /* is_fusion_output */, + false /* is_fusion_input */)); } - return serde::CreateExecutorEntryDirect( + std::vector inputs_fb; + inputs_fb.reserve(data.inputs.size()); + for (const auto& buffer : data.inputs) { + auto tv_iter = std::find( + compiledKernel()->kernel()->inputs().cbegin(), + compiledKernel()->kernel()->inputs().cend(), + buffer.tv); + auto tv_position = (tv_iter == compiledKernel()->kernel()->inputs().cend()) + ? -1 + : std::distance(compiledKernel()->kernel()->inputs().cbegin(), tv_iter); + NVF_ERROR(tv_position != -1, "Input TensorView not found in kernel inputs"); + inputs_fb.push_back(serialize( + builder, + buffer, + tv_position, + false /* is_fusion_output */, + true /* is_fusion_input */)); + } + return serde::CreateKernelExecutorEntryDirect( builder, data.init, data.launch_params.serialize(builder), &outputs_fb, - &intermediates_fb); + &intermediates_fb, + &inputs_fb, + &data.output_aliased_to_input); } flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder, const GlobalBufferInfo& data, int64_t tv_position, - bool is_fusion_output) const { + bool is_fusion_output, + bool is_fusion_input) const { // See table definition for GlobalBufferInfo in serde/fusion_cache.fbs return serde::CreateGlobalBufferInfoDirect( builder, tv_position, - &data.sizes, - &data.strides, + &data.shape_info.logical_sizes, + &data.shape_info.logical_strides, + &data.shape_info.unsharded_logical_sizes, + &data.shape_info.allocation_sizes, + &data.shape_info.allocation_strides, 
nvfuser::toUnderlying(data.type), data.zero_init, data.resets_to_zero, data.is_profile_buffer, - is_fusion_output); + is_fusion_output, + is_fusion_input); } void KernelExecutor::deserialize( @@ -1328,7 +1497,6 @@ void KernelExecutor::deserialize( int64_t runtime_id, int64_t group_id) { // See table definition for KernelExecutor in serde/fusion_cache.fbs - NVF_ERROR(buffer != nullptr, "serde::KernelExecutor is nullptr."); NVF_ERROR(_fusion != nullptr, "Fusion is nullptr."); @@ -1379,15 +1547,19 @@ void KernelExecutor::deserialize( buffer->executor_entry_lookup_keys()->Get(idx), deserialize(buffer->executor_entry_lookup_values()->Get(idx))); } + + has_rng_ = buffer->has_rng(); + has_tma_ = buffer->has_tma(); + has_dynamic_alias_ = buffer->has_dynamic_alias(); } -KernelExecutor::ExecutorEntry KernelExecutor::deserialize( - const serde::ExecutorEntry* buffer) { - // See table definition for ExecutorEntry in serde/fusion_cache.fbs +KernelExecutorEntry KernelExecutor::deserialize( + const serde::KernelExecutorEntry* buffer) { + // See table definition for KernelExecutorEntry in serde/fusion_cache.fbs - NVF_ERROR(buffer != nullptr, "serde::ExecutorEntry is nullptr."); + NVF_ERROR(buffer != nullptr, "serde::KernelExecutorEntry is nullptr."); - ExecutorEntry entry; + KernelExecutorEntry entry; entry.init = buffer->init(); @@ -1401,6 +1573,14 @@ KernelExecutor::ExecutorEntry KernelExecutor::deserialize( entry.intermediates.push_back(deserialize(intermediate_buffer)); } + for (auto input_buffer : *buffer->inputs()) { + entry.inputs.push_back(deserialize(input_buffer)); + } + + for (auto output_aliased_to_input : *buffer->output_aliased_to_input()) { + entry.output_aliased_to_input.push_back(output_aliased_to_input); + } + return entry; } @@ -1411,7 +1591,8 @@ GlobalBufferInfo KernelExecutor::deserialize( NVF_ERROR(buffer != nullptr, "serde::GlobalBufferInfo is nullptr."); NVF_ERROR( - buffer->tv() != -1, "Serialization failed to encode buffer tv position."); + 
buffer->tv_pos() != -1, + "Serialization failed to encode buffer tv position."); NVF_ERROR( compiled_kernel_->lowered() != nullptr, @@ -1419,24 +1600,44 @@ GlobalBufferInfo KernelExecutor::deserialize( GlobalBufferInfo info; if (buffer->is_fusion_output()) { - auto out_val = compiled_kernel_->kernel()->outputs().at(buffer->tv()); + auto out_val = compiled_kernel_->kernel()->outputs().at(buffer->tv_pos()); NVF_ERROR(out_val != nullptr); info.tv = dynamic_cast(out_val); + } else if (buffer->is_fusion_input()) { + auto in_val = compiled_kernel_->kernel()->inputs().at(buffer->tv_pos()); + NVF_ERROR(in_val != nullptr); + info.tv = dynamic_cast(in_val); } else { auto out_val = compiled_kernel_->kernel()->summary().global_allocations.at( - buffer->tv()); + buffer->tv_pos()); NVF_ERROR(out_val != nullptr); info.tv = dynamic_cast(out_val->buffer()); } - for (auto dim_size : *buffer->sizes()) { - info.sizes.emplace_back(dim_size); + TensorShapeInfo shape_info; + + for (auto dim_size : *buffer->logical_sizes()) { + shape_info.logical_sizes.emplace_back(dim_size); + } + + for (auto dim_stride : *buffer->logical_strides()) { + shape_info.logical_strides.emplace_back(dim_stride); + } + + for (auto dim_size : *buffer->unsharded_logical_sizes()) { + shape_info.unsharded_logical_sizes.emplace_back(dim_size); } - for (auto dim_stride : *buffer->strides()) { - info.strides.emplace_back(dim_stride); + for (auto dim_size : *buffer->alloc_sizes()) { + shape_info.allocation_sizes.emplace_back(dim_size); } + for (auto dim_stride : *buffer->alloc_strides()) { + shape_info.allocation_strides.emplace_back(dim_stride); + } + + info.shape_info = shape_info; + info.type = serde::mapToAtenDtype(buffer->dtype()); info.zero_init = buffer->zero_init(); info.resets_to_zero = buffer->resets_to_zero(); diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index 27862c7ecb0..4b8c5e12be5 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -57,6 +57,35 @@ class ExprEvalExecutor 
: public ExecutorAbstract { std::unique_ptr fusion_; };
+// struct used to hold necessary information to launch compiled kernel on a
+// given input set.
+//
+// TODO: strides would also be important when we handle permutations in
+// codegen.
+//
+struct KernelExecutorEntry {
+  bool init = false;
+  LaunchParams launch_params;
+  std::vector outputs;
+  // If an output is aliased to an input, this will hold the index of the
+  // input that it is aliased to. If not aliased, it will hold -1.
+  std::vector output_aliased_to_input;
+  // Temporary work buffers and intermediate global-memory tensors
+  std::vector intermediates;
+  std::vector inputs;
+  // The arguments to the kernel. These are configured in
+  // computeArgs.
+  // For the common case of a tensor argument, these correspond to the
+  // `struct Tensor` data in runtime/tensor.cu. That means each tensor
+  // element in `args` would be a sizeof(void*) + len(shape)*sizeof(int) +
+  // len(shape)*sizeof(int) byte array (here "int" is used in place of the
+  // index type, which varies in practice).
+  std::vector> args;
+  // This is just the data() pointers to the above `args`; cuLaunchKernel
+  // requires an array of this form.
+  std::vector arg_ptrs;
+};
+
 class KernelExecutor : public ExecutorAbstract {
  public:
   // NVF_API was added for nvfuser_extension. See examples/sinh_extension.
@@ -111,31 +140,6 @@ class KernelExecutor : public ExecutorAbstract {
     executor_entry_lookup_.erase(cache_id);
   }

-  // struct used to hold necessary information to launch compiled kernel on a
-  // given input set.
-  //
-  // TODO: strides would also be important when we handle permutations in
-  // codegen.
-  //
-  struct ExecutorEntry {
-    bool init = false;
-    LaunchParams launch_params;
-    std::vector outputs;
-    // Temporary work buffers and intemediate global-memory tensors
-    std::vector intermediates;
-    // The arguments to the kernel. These are configured in computeArgs and
-    // recomputeArgs.
- // For the common case of a tensor argument, these correspond to the - // `struct Tensor` data in runtime/tensor.cu. That means each tensor - // element in `args` would be a sizeof(void*) + len(shape)*sizeof(int) + - // len(shape)*sizeof(int) byte array (here "int" is used in place of the - // index type, which varies in practice). - std::vector> args; - // This is just the data() pointers to the above `args`; cuLaunchKernel - // requires an array of this form. - std::vector arg_ptrs; - }; - using ExecutorCompileTimeInfoCache = executor_utils::caching::ExecutorCompileTimeInfoCache; @@ -212,9 +216,9 @@ class KernelExecutor : public ExecutorAbstract { return &compile_time_info_cache_; } - //! TODO: Consider changing this to a constructor of ExecutorEntry + //! TODO: Consider changing this to a constructor of KernelExecutorEntry void initializeExecutorEntry( - ExecutorEntry& executor_entry, + KernelExecutorEntry& executor_entry, const KernelArgumentHolder& args, const LaunchParams& launch_constraints, const CompileParams& compile_params, @@ -225,30 +229,27 @@ class KernelExecutor : public ExecutorAbstract { // Creates the initial set of arguments to a kernel, based on the arguments // to we have now. - void computeArgs(ExecutorEntry&, ExpressionEvaluator&, const kir::Kernel*) - const; - // Updates an existing set of arguments based on the current arguments. It is - // is an error to call this before `computeArgs` has been invoked. - // recomputeArgs will fail if the arity of the function changes, or the rank - // of any tensor changes (as these are compiled-in to the generated kernel - // and therefore would require us to do a larger recompilation). - void recomputeArgs(ExecutorEntry&, ExpressionEvaluator&, const kir::Kernel*) + void computeArgs(KernelExecutorEntry& entry, const KernelArgumentHolder& args) const; + KernelArgumentHolder resolveTMA( + KernelExecutorEntry& entry, + const KernelArgumentHolder& args) const; + //! 
Serialize CompiledKernel using flatbuffers flatbuffers::Offset serialize( flatbuffers::FlatBufferBuilder& builder, const executor_utils::CudaExecutable* kernel) const; - // ExecutorEntry is an internal POD struct for the KernelExecutor class. - // We define ExecutorEntry's serialize and deserialize as private methods in - // KernelExecutor. - flatbuffers::Offset serialize( + // KernelExecutorEntry is an internal POD struct for the KernelExecutor class. + // We define KernelExecutorEntry's serialize and deserialize as private + // methods in KernelExecutor. + flatbuffers::Offset serialize( flatbuffers::FlatBufferBuilder& builder, - const ExecutorEntry& data) const; + const KernelExecutorEntry& data) const; - //! Deserialize ExecutorEntry using flatbuffers - ExecutorEntry deserialize(const serde::ExecutorEntry* buffer); + //! Deserialize KernelExecutorEntry using flatbuffers + KernelExecutorEntry deserialize(const serde::KernelExecutorEntry* buffer); // GlobalBufferInfo is an internal POD struct for the KernelExecutor class. // We define GlobalBufferInfo's serialize and deserialize as private methods @@ -257,7 +258,8 @@ class KernelExecutor : public ExecutorAbstract { flatbuffers::FlatBufferBuilder& builder, const GlobalBufferInfo& data, int64_t tv_position, - bool is_fusion_output) const; + bool is_fusion_output, + bool is_fusion_input) const; //! 
Deserialize GlobalBufferInfo using flatbuffers GlobalBufferInfo deserialize(const serde::GlobalBufferInfo* buffer); @@ -295,9 +297,21 @@ class KernelExecutor : public ExecutorAbstract { int64_t warp_size_ = 0; + // Has an RNG kernel and therefore needs to infer RNG state through expression + // evaluator + bool has_rng_ = false; + + // Has a TMA kernel and therefore needs to infer TMA inputs through expression + // evaluator + bool has_tma_ = false; + + // Has a dynamic alias and therefore needs to infer what they are through + // expression evaluator + bool has_dynamic_alias_ = false; + // lookup table to take short cut to retrieve recorded information in order to // launch kernels without re-inference parameters. - std::unordered_map executor_entry_lookup_; + std::unordered_map executor_entry_lookup_; // Compile time information caching. This is used for shape inference // support. The cache stores graph information that are available diff --git a/csrc/runtime/executor_kernel_arg.cpp b/csrc/runtime/executor_kernel_arg.cpp index 8a0fc4fe7ce..ccc2f7657f7 100644 --- a/csrc/runtime/executor_kernel_arg.cpp +++ b/csrc/runtime/executor_kernel_arg.cpp @@ -8,6 +8,7 @@ #include // Extract size and strides +#include #include #include @@ -119,15 +120,6 @@ void KernelArgumentHolder::pushTensorProxy( push(meta_tensor); } -std::vector KernelArgumentHolder::toC10Array() const { - std::vector ival_array; - ival_array.reserve(arguments_.size()); - for (const auto& arg : arguments_) { - ival_array.push_back(PolymorphicValue_functions::toIValue(arg)); - } - return ival_array; -} - void KernelArgumentHolder::setDeviceIndex(std::optional index) { if (index.has_value()) { device_index_ = index.value(); @@ -294,51 +286,11 @@ std::vector polymorphicValueToBytes( } else if (argument.is()) { // FUSER_PERF_SCOPE("polymorphicValueToBytes(StructHandle)"); std::vector buffer; - const auto& dtype_ = std::get(dtype.type); - auto& data = argument->*&TensorMetaData::data; - auto& logical_size = 
argument->*&TensorMetaData::logical_size; - auto& alloc_stride = argument->*&TensorMetaData::alloc_stride; if (argument.as().is()) { - // special handle for TensorMetaData so that CPU overhead is minimal. - if (index_type == PrimDataType::Int) { - buffer.reserve( - sizeof(void*) + sizeof(int64_t) * logical_size.size() + - sizeof(int64_t) * alloc_stride.size()); - buffer.insert( - buffer.end(), (std::byte*)&data, (std::byte*)&data + sizeof(void*)); - buffer.insert( - buffer.end(), - (std::byte*)logical_size.data(), - (std::byte*)logical_size.data() + - sizeof(int64_t) * logical_size.size()); - buffer.insert( - buffer.end(), - (std::byte*)alloc_stride.data(), - (std::byte*)alloc_stride.data() + - sizeof(int64_t) * alloc_stride.size()); - } else { - buffer.reserve( - sizeof(void*) + sizeof(int32_t) * logical_size.size() + - sizeof(int32_t) * alloc_stride.size()); - buffer.insert( - buffer.end(), (std::byte*)&data, (std::byte*)&data + sizeof(void*)); - std::vector logical_size32( - logical_size.begin(), logical_size.end()); - buffer.insert( - buffer.end(), - (std::byte*)logical_size32.data(), - (std::byte*)logical_size32.data() + - sizeof(int32_t) * logical_size32.size()); - std::vector alloc_stride32( - alloc_stride.begin(), alloc_stride.end()); - buffer.insert( - buffer.end(), - (std::byte*)alloc_stride32.data(), - (std::byte*)alloc_stride32.data() + - sizeof(int32_t) * alloc_stride32.size()); - } - return buffer; + NVF_THROW( + "Don't send tensor metadata to this function directly, use tensorToBytes."); } else { + const auto& dtype_ = std::get(dtype.type); for (const auto& field : dtype_.fields) { if (!field.used_in_kernel) { continue; @@ -357,24 +309,58 @@ std::vector polymorphicValueToBytes( } } -std::vector getKernelArgument( - ExpressionEvaluator& ee, - Val* parameter, - PrimDataType index_type) { - FUSER_PERF_SCOPE("getKernelArgument"); - NVF_ERROR(parameter != nullptr); - PolymorphicValue pv = ee.evaluate(parameter); - if (auto tv = dynamic_cast(parameter)) { 
- if (tv->isCpuScalar()) { - return polymorphicValueToBytes(pv, tv->dtype(), index_type); - } else { - const Val* metadata_val = IrBuilder::metadataExpr(tv); - const PolymorphicValue& metadata = ee.evaluate(metadata_val); - return polymorphicValueToBytes( - metadata, metadata_val->dtype(), index_type); - } +std::vector tensorToBytes( + const PolymorphicValue& argument, + const std::vector& logical_sizes, + const std::vector& alloc_strides, + PrimDataType idx_type, + const std::vector& unsharded_logical_sizes) { + std::vector bytes; + NVF_ERROR( + argument.is() && argument.as().is_cuda(), + "Argument is not a CUDA tensor."); + const auto& tensor = argument.as(); + auto data = tensor.data_ptr(); + + const auto& size_to_use = + logical_sizes.size() == unsharded_logical_sizes.size() + ? unsharded_logical_sizes + : logical_sizes; + // special handle for TensorMetaData so that CPU overhead is minimal. + if (idx_type == PrimDataType::Int) { + bytes.reserve( + sizeof(void*) + sizeof(int64_t) * size_to_use.size() + + sizeof(int64_t) * alloc_strides.size()); + bytes.insert(bytes.end(), (std::byte*)&data, (std::byte*)(&data + 1)); + bytes.insert( + bytes.end(), + (std::byte*)size_to_use.data(), + (std::byte*)size_to_use.data() + sizeof(int64_t) * size_to_use.size()); + bytes.insert( + bytes.end(), + (std::byte*)alloc_strides.data(), + (std::byte*)alloc_strides.data() + + sizeof(int64_t) * alloc_strides.size()); + } else { + bytes.reserve( + sizeof(void*) + sizeof(int32_t) * size_to_use.size() + + sizeof(int32_t) * alloc_strides.size()); + bytes.insert(bytes.end(), (std::byte*)&data, (std::byte*)(&data + 1)); + std::vector logical_size32(size_to_use.begin(), size_to_use.end()); + bytes.insert( + bytes.end(), + (std::byte*)logical_size32.data(), + (std::byte*)logical_size32.data() + + sizeof(int32_t) * logical_size32.size()); + std::vector alloc_stride32( + alloc_strides.begin(), alloc_strides.end()); + bytes.insert( + bytes.end(), + (std::byte*)alloc_stride32.data(), + 
(std::byte*)alloc_stride32.data() + + sizeof(int32_t) * alloc_stride32.size()); } - return polymorphicValueToBytes(pv, parameter->dtype(), index_type); + return bytes; } int64_t computeBytes(const KernelArgumentHolder& args) { @@ -389,15 +375,4 @@ int64_t computeBytes(const KernelArgumentHolder& args) { return num_bytes; } -int64_t computeBytes(const std::vector& outputs) { - int64_t num_bytes = 0; - for (auto i : c10::irange(outputs.size())) { - const auto& output = outputs.at(i); - // NOTE: this assumes that all output elements correspond to a single - // store - num_bytes += static_cast(output.storage().nbytes()); - } - return num_bytes; -} - } // namespace nvfuser diff --git a/csrc/runtime/executor_kernel_arg.h b/csrc/runtime/executor_kernel_arg.h index e4a8c6b4ea9..77198ae9002 100644 --- a/csrc/runtime/executor_kernel_arg.h +++ b/csrc/runtime/executor_kernel_arg.h @@ -22,6 +22,8 @@ namespace nvfuser { +struct GlobalBufferInfo; + //! KernelArgumentHolder copies meta information from kernel inputs, including //! tensor sizes/shapes/dtype/memory_ptr and copies scalar inputs. It is used //! for both compilation as well as kernel execution. It takes ownership of @@ -72,6 +74,14 @@ class NVF_API KernelArgumentHolder { } } + void reserve(size_t size) { + arguments_.reserve(size); + } + + void resize(size_t size) { + arguments_.resize(size); + } + void push(const std::vector& tensors); void push(const c10::ArrayRef& args); void push(std::initializer_list args) { @@ -97,8 +107,6 @@ class NVF_API KernelArgumentHolder { void erase(const PolymorphicValue& arg_to_delete); - std::vector toC10Array() const; - PolymorphicValue& back() { return arguments_.back(); } @@ -212,18 +220,21 @@ class NVF_API KernelArgumentHolder { std::optional cache_id_ = std::nullopt; }; +// Used to convert a polymorphic value to a byte vector. Do not use for CUDA +// Tensors, use tensorToBytes instead. 
std::vector<std::byte> polymorphicValueToBytes( const PolymorphicValue& argument, const DataType& dtype, PrimDataType index_type); -std::vector<std::byte> getKernelArgument( - ExpressionEvaluator& ee, - Val* parameter, - PrimDataType index_type); +// Used to convert a CUDA tensor to a byte vector. +std::vector<std::byte> tensorToBytes( + const PolymorphicValue& argument, + const std::vector<int64_t>& logical_sizes, + const std::vector<int64_t>& allocation_strides, + PrimDataType idx_type, + const std::vector<int64_t>& unsharded_logical_sizes = {}); int64_t computeBytes(const KernelArgumentHolder& args); -int64_t computeBytes(const std::vector<at::Tensor>& outputs); - } // namespace nvfuser diff --git a/csrc/runtime/executor_utils.cpp b/csrc/runtime/executor_utils.cpp index 6e0c2d769a5..2900a1bb022 100644 --- a/csrc/runtime/executor_utils.cpp +++ b/csrc/runtime/executor_utils.cpp @@ -599,6 +599,44 @@ ExpressionEvaluator bindInputs( return expr_eval; } +std::vector<int> getOutputAliasToInputMap(const Fusion* fusion) { + std::vector<int> output_to_input_map(fusion->outputs().size(), -1); + for (auto output_idx : c10::irange(fusion->outputs().size())) { + auto alias_info = fusion->getOutputAlias(fusion->outputs()[output_idx]); + if (alias_info.type == AllocationType::New) { + continue; + } + NVF_ERROR( + alias_info.aliased_io && alias_info.aliased_io->isA<TensorView>(), + "Alias information is missing the aliased tensor."); + + auto aliased_to = alias_info.aliased_io->as<TensorView>(); + auto aliased_to_idx = std::distance( + fusion->inputs().begin(), + std::find( + fusion->inputs().begin(), fusion->inputs().end(), aliased_to)); + if (aliased_to_idx < (int64_t)fusion->inputs().size()) { + output_to_input_map[output_idx] = (int)aliased_to_idx; + } else { + auto aliased_out = std::find( + fusion->outputs().begin(), fusion->outputs().end(), aliased_to); + NVF_ERROR( + aliased_out != fusion->outputs().end(), + "Could not find the alias tensor of: ", + fusion->outputs()[output_idx]->toString(), + "\nAliased to: ", + aliased_to->toString()); + NVF_THROW( + "Kernel found with
output to output aliasing, this is unsupported at this moment.\n", + "Output: ", + fusion->outputs()[output_idx]->toString(), + "\nAliased to: ", + aliased_to->toString()); + } + } + return output_to_input_map; +} + CudaExecutable::~CudaExecutable() { if (module != nullptr) { NVFUSER_CUDA_SAFE_CALL(cuModuleUnload(module)); diff --git a/csrc/runtime/executor_utils.h b/csrc/runtime/executor_utils.h index 9d631f2548e..211e612c31c 100644 --- a/csrc/runtime/executor_utils.h +++ b/csrc/runtime/executor_utils.h @@ -51,6 +51,11 @@ struct CudaExecutable : public NonCopyable { NVF_API ExpressionEvaluator bindInputs(const KernelArgumentHolder& args, Fusion* fusion); +// Returns a vector where vector[out_idx] == the input index in fusion->inputs() +// that output[out_idx] is aliased to. If output[out_idx] is not aliased to any +// input, then vector[out_idx] is -1. +std::vector<int> getOutputAliasToInputMap(const Fusion* fusion); + // Compile time cache for execution namespace caching { // TODO: Could consider putting some of
table GlobalBufferInfo { - tv : long = -1; - sizes : [long]; - strides : [long]; + tv_pos : long = -1; + logical_sizes : [long]; + logical_strides : [long]; + unsharded_logical_sizes : [long]; + alloc_sizes : [long]; + alloc_strides : [long]; dtype : long; zero_init : bool; resets_to_zero : bool; is_profile_buffer : bool; is_fusion_output : bool; + is_fusion_input : bool; } -// This table describes the cached ExecutorEntry for a kernel. -table ExecutorEntry { +// This table describes the cached KernelExecutorEntry for a kernel. +table KernelExecutorEntry { init : bool; launch_params : LaunchParams; outputs : [GlobalBufferInfo]; intermediates : [GlobalBufferInfo]; + inputs : [GlobalBufferInfo]; + output_aliased_to_input : [int]; } // ===================================================================================== @@ -368,10 +374,13 @@ table KernelExecutor { group_id: long; kernel_code: string; executor_entry_lookup_keys: [ulong]; - executor_entry_lookup_values: [ExecutorEntry]; + executor_entry_lookup_values: [KernelExecutorEntry]; // Is this kernel being compiled with int32 or int64 indexing? index_type : long; compiled_kernel: CudaKernel; + has_rng: bool; + has_tma: bool; + has_dynamic_alias: bool; } // A directed edge on DAG, which wraps a value that connects segmented groups. 
diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp index e82aeef08c0..e5dd17f4aca 100644 --- a/csrc/tensor_metadata.cpp +++ b/csrc/tensor_metadata.cpp @@ -356,7 +356,9 @@ std::vector<PolymorphicValue> GetMetaData::evaluate( } else { metadata->logical_size = input.sizes(); } - metadata->logical_stride = input.strides(); + metadata->logical_stride_data = + std::vector<int64_t>(input.strides().begin(), input.strides().end()); + metadata->logical_stride = c10::makeArrayRef(metadata->logical_stride_data); auto [allocation_sizes, allocation_strides] = inferAndValidateAllocationSizesAndStrides(input, tv, ee); diff --git a/tests/python/test_schedule_ops.py b/tests/python/test_schedule_ops.py index 0e5bc1e864f..26ba00b88f7 100644 --- a/tests/python/test_schedule_ops.py +++ b/tests/python/test_schedule_ops.py @@ -1012,6 +1012,9 @@ def schedule(self): self.assertEqual(nvf_out[1], inputs[4]) self.assertEqual(nvf_out[2], torch_ref) + @pytest.mark.skip( + reason="Disable test, the scheduler is not actually sending to ExprEvalExec but is sending to KernelExecutor which will correctly error." + ) def test_matmul_auto_scheduler(self): """ Implement a simple matmul kernel with a user defined schedule