From 9a0dc9e8d6e0cdc8699384a8bae203591d01ec4b Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 04:29:58 -0700 Subject: [PATCH 01/38] host ir alias and prealloc output support --- csrc/host_ir/container.cpp | 5 + csrc/host_ir/container.h | 12 ++ csrc/host_ir/executor.cpp | 216 ++++++++++++++++++------------------ csrc/host_ir/executor.h | 33 +++++- tests/cpp/test_host_irs.cpp | 112 ++++++++++++++++++- 5 files changed, 268 insertions(+), 110 deletions(-) diff --git a/csrc/host_ir/container.cpp b/csrc/host_ir/container.cpp index ca4c98f2a56..a133c0521ba 100644 --- a/csrc/host_ir/container.cpp +++ b/csrc/host_ir/container.cpp @@ -35,6 +35,11 @@ Stream* HostIrContainer::getDefaultStream() { std::ostream& HostIrContainer::print(std::ostream& os) const { IrMathPrinter op_exprs(os); op_exprs.handle(this); + os << "Aliases:{"; + for (const auto& alias : alias_) { + os << "\n " << alias.first << " -> " << alias.second; + } + os << "\n}\n"; return os; } diff --git a/csrc/host_ir/container.h b/csrc/host_ir/container.h index ab029f63fa2..7dcd66b4436 100644 --- a/csrc/host_ir/container.h +++ b/csrc/host_ir/container.h @@ -55,10 +55,22 @@ class HostIrContainer final : public Fusion { Stream* getDefaultStream(); + void markAlias(TensorView* original, const TensorView* new_alias) { + if (alias_.count(original)) { + original = alias_[original]->as(); + } + alias_[new_alias] = original; + } + + const auto& alias() const { + return alias_; + } + private: std::vector top_level_exprs_; std::vector> kernel_executors_; Stream* default_stream_ = nullptr; + std::unordered_map alias_; }; } // namespace hir diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 4b6c2fe6796..4b631bd0061 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -171,32 +171,6 @@ KernelArgumentHolder HostIrExecutor::run( namespace hir { -namespace { - -at::Tensor getKnownTensorOrUndefined( - Val* val, - const ExpressionEvaluator& expr_evaluator) { - return 
expr_evaluator.isKnown(val) - ? expr_evaluator.evaluate(val).as() - : at::Tensor(); -} - -KernelArgumentHolder getKnownTensorOrUndefined( - const std::vector& vals, - const ExpressionEvaluator& expr_evaluator) { - std::vector tensors(vals.size()); - std::transform( - vals.begin(), - vals.end(), - tensors.begin(), - [&expr_evaluator](Val* val) -> at::Tensor { - return getKnownTensorOrUndefined(val, expr_evaluator); - }); - return KernelArgumentHolder(tensors); -} - -} // namespace - HostIrEvaluator::HostIrEvaluator( std::unique_ptr container, Communicator* communicator, @@ -216,10 +190,23 @@ HostIrEvaluator::HostIrEvaluator( {container_->getDefaultStream(), c10::cuda::getDefaultCUDAStream( static_cast(device_index))}); - expr_evaluator_.bind("numberOfStreams", params_.number_of_streams); + NVF_ERROR( + std::all_of( + container_->inputs().begin(), + container_->inputs().end(), + [this](Val* input) { return !container_->alias().count(input); }), + "Inputs cannot be aliased"); } -KernelArgumentHolder HostIrEvaluator::dispatchAndCollectOutputs() { +KernelArgumentHolder HostIrEvaluator::runWithInput( + const std::unordered_map& val_to_PValue) { + expr_evaluator_ = ExpressionEvaluator(); + expr_evaluator_.bind("numberOfStreams", params_.number_of_streams); + // process input values, converting IValue to PolymorphicValue + for (const auto& [val, pvalue] : val_to_PValue) { + bind(val, pvalue); + } + // Interpret each instruction in an "eager" way by iterate over the Host Ir // Container's top level expression list for (auto expr : container_->topLevelExprs()) { @@ -227,17 +214,15 @@ KernelArgumentHolder HostIrEvaluator::dispatchAndCollectOutputs() { } // Collect global outputs - return getKnownTensorOrUndefined(container_->outputs(), expr_evaluator_); -} - -KernelArgumentHolder HostIrEvaluator::runWithInput( - const std::unordered_map& val_to_PValue) { - // process input values, converting IValue to PolymorphicValue - for (const auto& [val, pvalue] : val_to_PValue) { - 
expr_evaluator_.bind(val, pvalue); - } - - return dispatchAndCollectOutputs(); + std::vector outputs(container_->outputs().size()); + std::transform( + container_->outputs().begin(), + container_->outputs().end(), + outputs.begin(), + [this](Val* val) -> at::Tensor { + return this->getKnownTensorOrUndefined(val); + }); + return KernelArgumentHolder(outputs); } std::string HostIrEvaluator::canRun() const { @@ -320,13 +305,7 @@ void HostIrEvaluator::handle(Synchronize* synchronize) { void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { KernelArgumentHolder args; for (auto& input : launch_kernel->inputs()) { - NVF_ERROR( - expr_evaluator_.isKnown(input), - "No buffer associated with Val ", - input, - " for handling ", - launch_kernel->toString()); - args.push(expr_evaluator_.evaluate(input)); + args.push(getKnownConcreteData(input)); } args.setDeviceIndex(); @@ -341,25 +320,35 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { // Store the outputs in the context for (auto output_idx : c10::irange(outputs.size())) { - expr_evaluator_.bind( - launch_kernel->outputs().at(output_idx), outputs[output_idx]); + bind(launch_kernel->outputs().at(output_idx), outputs[output_idx]); } } void HostIrEvaluator::handle(PostOnStream* post_ir) { KernelArgumentHolder input_args; for (auto& input : post_ir->inputs()) { - NVF_ERROR( - expr_evaluator_.isKnown(input), - "No buffer associated with Val ", - input, - " for handling ", - post_ir->toString()); - input_args.push(expr_evaluator_.evaluate(input)); + input_args.push(getKnownConcreteData(input)); } input_args.setDeviceIndex(); // placeholder for storing the outputs KernelArgumentHolder outputs; + bool use_preallocated_outputs = std::all_of( + post_ir->outputs().begin(), + post_ir->outputs().end(), + [this](Val* output) { return this->isKnown(output); }); + NVF_ERROR( + use_preallocated_outputs || + std::all_of( + post_ir->outputs().begin(), + post_ir->outputs().end(), + [this](Val* output) { return 
!this->isKnown(output); }), + "outputs must be all or none preallocated in expr ", + post_ir); + if (use_preallocated_outputs) { + for (auto output : post_ir->outputs()) { + outputs.push(getKnownConcreteData(output)); + } + } NVF_ERROR( post_ir->hostOpToPost()->isA(), @@ -376,16 +365,23 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) { /*fusion_id=*/0, !params_.skip_auto_scheduling); } - outputs = fec_.at(hu).runFusionWithInputs(input_args); + if (use_preallocated_outputs) { + TORCH_WARN( + "FusionExecutorCache does not support with preallocated outputs, so we are copying the outputs in expr ", + post_ir); + auto tmp_outputs = fec_.at(hu).runFusionWithInputs(input_args); + for (auto output_idx : c10::irange(tmp_outputs.size())) { + outputs[output_idx].as().copy_( + tmp_outputs[output_idx].as()); + } + } else { + outputs = fec_.at(hu).runFusionWithInputs(input_args); + } } else { // This path should generally be avoided as it will likely send the fusion // held in HostUnit directly to KernelExecutor which means it will try to // compile and run a device kernel with a single thread. 
- if (auto it = executors_.find(hu); it != executors_.end()) { - ExecutorAbstract* ea = it->second.get(); - outputs = ExecutorDispatch::run(ea, input_args); - - } else { + if (auto it = executors_.find(hu); it == executors_.end()) { DynamicTransform::concretizeFusion(hu->fusion_to_execute(), input_args); auto it2 = executors_.insert( {hu, @@ -402,14 +398,20 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) { } else { ExecutorDispatch::compile(ea, hu->fusion_to_execute()); } + } + ExecutorAbstract* ea = executors_[hu].get(); + if (use_preallocated_outputs) { + ExecutorDispatch::run(ea, input_args, outputs); + } else { outputs = ExecutorDispatch::run(ea, input_args); } } - // Store the outputs in the context - for (auto output_idx : c10::irange(outputs.size())) { - expr_evaluator_.bind( - post_ir->outputs().at(output_idx), outputs[output_idx]); + if (!use_preallocated_outputs) { + // Store the outputs in the context + for (auto output_idx : c10::irange(outputs.size())) { + bind(post_ir->outputs().at(output_idx), outputs[output_idx]); + } } } @@ -418,10 +420,9 @@ void HostIrEvaluator::handle(Communication* communication) { communicator_ != nullptr && communicator_->is_available(), "A valid communicator must be provided"); - at::Tensor input_tensor = - getKnownTensorOrUndefined(communication->input(0), expr_evaluator_); + at::Tensor input_tensor = getKnownTensorOrUndefined(communication->input(0)); at::Tensor output_tensor = - getKnownTensorOrUndefined(communication->output(0), expr_evaluator_); + getKnownTensorOrUndefined(communication->output(0)); CommunicatorBackend backend_type = communication->backend(); c10d::Backend* backend = @@ -439,8 +440,7 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { communicator_ != nullptr && communicator_->is_available(), "A valid communicator must be provided"); - at::Tensor buffer = - getKnownTensorOrUndefined(communication->buffer(), expr_evaluator_); + at::Tensor buffer = 
getKnownTensorOrUndefined(communication->buffer()); works_[communication] = postSingleCommunication( communication, @@ -495,11 +495,11 @@ void HostIrEvaluator::handle(ForLoop* for_loop) { for (auto i = start; i < stop; i += step) { // invalidate i and its consumers before binding - expr_evaluator_.invalidate(for_loop->index()); + invalidate(for_loop->index()); for (auto consumer : allConsumerValsOf(for_loop->index())) { - expr_evaluator_.invalidate(consumer); + invalidate(consumer); } - expr_evaluator_.bind(for_loop->index(), i); + bind(for_loop->index(), i); for (Expr* expr : for_loop->body().exprs()) { dispatch(expr); } @@ -536,15 +536,11 @@ void HostIrEvaluator::handle(MatmulOp* matmul) { TensorView* a = matmul->inA(); TensorView* b = matmul->inB(); TensorView* out = matmul->out(); - NVF_ERROR( - expr_evaluator_.isKnown(a) && expr_evaluator_.isKnown(b), - "Inputs of the matmul ", - matmul->toString(), - "must be precomputed before being retrieved"); - if (expr_evaluator_.isKnown(out)) { - auto t_a = expr_evaluator_.evaluate(a).as(); - auto t_b = expr_evaluator_.evaluate(b).as(); - auto t_out = expr_evaluator_.evaluate(out).as(); + + if (isKnown(out)) { + auto t_a = getKnownConcreteData(a).as(); + auto t_b = getKnownConcreteData(b).as(); + auto t_out = getKnownConcreteData(out).as(); at::matmul_out(t_out, t_a, t_b); } else { unhandled(matmul); @@ -556,24 +552,18 @@ void HostIrEvaluator::handle(LinearOp* linear) { TensorView* weight = linear->inB()->as(); TensorView* bias = linear->bias()->as(); TensorView* out = linear->out()->as(); - NVF_ERROR( - expr_evaluator_.isKnown(in) && expr_evaluator_.isKnown(weight) && - (!linear->has_bias() || expr_evaluator_.isKnown(bias)), - "Inputs of the Linear Op ", - linear->toString(), - "must be precomputed before being retrieved"); - if (!expr_evaluator_.isKnown(out)) { + if (!isKnown(out)) { unhandled(linear); return; } - auto in_at = expr_evaluator_.evaluate(in).as(); - auto weight_at = expr_evaluator_.evaluate(weight).as(); 
- auto out_at = expr_evaluator_.evaluate(out).as(); + auto in_at = getKnownConcreteData(in).as(); + auto weight_at = getKnownConcreteData(weight).as(); + auto out_at = getKnownConcreteData(out).as(); if (linear->has_bias()) { - auto bias_at = expr_evaluator_.evaluate(bias).as(); + auto bias_at = getKnownConcreteData(bias).as(); at::linear_out(out_at, in_at, weight_at.squeeze(), bias_at.squeeze()); } else { at::linear_out(out_at, in_at, weight_at.squeeze()); @@ -600,25 +590,37 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { c10::nullopt, device, c10::nullopt); - - expr_evaluator_.bind(tv, tensor); + bind(tv, tensor); } void HostIrEvaluator::unhandled(Statement* stmt) { NVF_ERROR(stmt->isA(), stmt, " must be an Expr"); auto* expr = stmt->as(); - for (auto input : ir_utils::filterByType(expr->inputs())) { - NVF_ERROR( - expr_evaluator_.isKnown(input), - "input ", - input->toString(), - " of the expression ", - expr->toString(), - "must be precomputed before being retrieved"); - } - for (auto output : expr->outputs()) { - expr_evaluator_.bind( - output, expr_evaluator_.evaluate(output), /*evaluate_validate=*/true); + std::vector inputs; + for (auto input : expr->inputs()) { + if (input->isA()) { + // Tensor inputs must be already computed at this point + inputs.push_back(getKnownConcreteData(input)); + } else { + inputs.push_back(expr_evaluator_.evaluate(input)); + } + } + + // Check that there is no pre-allocated output + NVF_ERROR( + std::all_of( + expr->outputs().begin(), + expr->outputs().end(), + [this](Val* output) { + return !this->expr_evaluator_.isKnown(output); + }), + "Do not support pre-allocated outputs for the op ", + expr); + // using ExpressionEvaluator::evaluate to evaluate the output is not valid + // here if the output or one of its producer is an alias + auto concrete_outputs = expr->evaluate(expr_evaluator_, inputs); + for (int64_t i : c10::irange(expr->outputs().size())) { + bind(expr->output(i), concrete_outputs.at(i)); } } diff --git 
a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index f1b8ed4ef88..73f52a7bb90 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -133,7 +133,38 @@ class HostIrEvaluator final : public OptOutDispatch { c10::cuda::CUDAStream getCUDAStream(Stream* stream); - KernelArgumentHolder dispatchAndCollectOutputs(); + Val* getAlias(Val* val) const { + const auto& aliases = container_->alias(); + auto it = aliases.find(val); + return it != aliases.end() ? it->second : val; + } + + bool isKnown(Val* value) const { + return expr_evaluator_.isKnown(getAlias(value)); + } + + PolymorphicValue getKnownConcreteData(Val* val) const { + NVF_ERROR( + isKnown(val), + "value ", + val->toString(), + "must be precomputed before being retrieved"); + return expr_evaluator_.evaluate(getAlias(val)); + } + + at::Tensor getKnownTensorOrUndefined(Val* val) const { + return isKnown(val) + ? expr_evaluator_.evaluate(getAlias(val)).as() + : at::Tensor(); + } + + void bind(Val* value, PolymorphicValue concrete_value) { + expr_evaluator_.bind(getAlias(value), concrete_value); + } + + void invalidate(Val* value) { + expr_evaluator_.invalidate(getAlias(value)); + } std::unique_ptr container_; Communicator* communicator_; diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index da85466c8a0..ba296f6f357 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -456,6 +456,62 @@ TEST_P(HostIrTest, ForLoops) { EXPECT_TRUE(expected_result.equal(buffer_at)); } +TEST_P(HostIrTest, PreAllocatedOutputs) { + const std::vector input_sizes = {4, 8, 32}; + const std::vector output_sizes = { + input_sizes.at(1), input_sizes.at(2)}; + + auto get_fusion = [input_sizes]() -> std::unique_ptr { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + auto tv0 = makeConcreteTensor(input_sizes); + auto tv1 = add(tv0, tv0); + auto tv2 = sum(tv1, {0}); + fusion->addInput(tv0); + fusion->addOutput(tv2); + return fusion; + }; + + auto hic = 
std::make_unique(); + FusionGuard fg(hic.get()); + + auto host_unit = IrBuilder::create(get_fusion()); + + IrCloner ir_cloner(hic.get()); + std::vector post_on_stream_inputs = { + ir_cloner.clone(host_unit->fusion_to_execute()->inputs().at(0))}; + std::vector post_on_stream_outputs = { + ir_cloner.clone(host_unit->fusion_to_execute()->outputs().at(0))}; + + auto post_on_stream = IrBuilder::create( + host_unit, post_on_stream_inputs, post_on_stream_outputs); + + hic->pushBackTopLevelExprs(post_on_stream); + + hic->addInput(post_on_stream->inputs().at(0)); + hic->addInput(post_on_stream->outputs().at(0)); + + HostIrEvaluatorParams params; + auto [use_fusion_executor_cache] = GetParam(); + params.use_fusion_executor_cache = use_fusion_executor_cache; + HostIrEvaluator hie(std::move(hic), nullptr, params); + + // define concrete inputs and compute ref output for validation + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto input = at::randn(input_sizes, options); + auto output = at::empty(output_sizes, options); + auto ref_output = at::sum(input * 2, {0}); + + hie.runWithInput( + {{post_on_stream->inputs().at(0), input}, + {post_on_stream->outputs().at(0), output}}); + + // validate the obtained results + GTEST_EXPECT_TRUE(torch::allclose(ref_output, output)) + << "Output: " << output << " Expected: " << ref_output; +} + INSTANTIATE_TEST_SUITE_P( , HostIrTest, @@ -1095,7 +1151,12 @@ TEST_F(IfThenElseTest, HostIr) { hic->addOutput(output_buffer); hic->pushBackTopLevelExprs(if_then_else); - HostIrEvaluator hie(std::move(hic)); + // Need to use FusionExecutorCache, otherwise hitting error + // https://github.com/NVIDIA/Fuser/blob/4d032f74d2347fd68f5be607ef94956500eb917b/csrc/runtime/executor.cpp#L750 + HostIrEvaluator hie( + std::move(hic), + /*Communicator=*/nullptr, + {.use_fusion_executor_cache = true}); for (auto boolean : {true, false}) { auto options = @@ -1155,7 +1216,7 @@ TEST_F(AllocationTest, inHostForLoop) { TensorView* tv0 = 
makeConcreteTensor(sizes); tv0->setMemoryType(MemoryType::Global); auto* allocate = IrBuilder::create(tv0, MemoryType::Global); - TensorView* tv1 = abs(tv0); + TensorView* tv1 = set(tv0); for_loop->body().push_back(allocate); for_loop->body().push_back(tv1->definition()); @@ -1170,6 +1231,53 @@ TEST_F(AllocationTest, inHostForLoop) { EXPECT_EQ(sizes, outputs[0].as().sizes()); } +using HirAlias = NVFuserTest; + +TEST_F(HirAlias, SetAndGet) { + const std::vector sizes = {8, 64}; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + TensorView* tv0 = makeConcreteTensor(sizes); + TensorView* tv1 = set(tv0); + TensorView* tv2 = makeConcreteTensor(sizes); + hic->markAlias(tv1, tv2); + TensorView* tv3 = set(tv2); + TensorView* tv4 = makeConcreteTensor(sizes); + hic->markAlias(tv3, tv4); + hic->addInput(tv0); + hic->addOutput(tv4); + hic->pushBackTopLevelExprs(tv1->definition()); + hic->pushBackTopLevelExprs(tv3->definition()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor tv0_aten = at::randn(sizes, options); + + at::Tensor out_aten = hie.runWithInput({{tv0, tv0_aten}})[0].as(); + + at::Tensor expected_out = tv0_aten; + EXPECT_TRUE(out_aten.equal(expected_out)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + +TEST_F(HirAlias, ThrowOnInputAlias) { + const std::vector sizes = {8, 64}; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + TensorView* tv0 = makeConcreteTensor(sizes); + TensorView* tv1 = set(tv0); + hic->markAlias(tv1, tv0); + hic->addInput(tv0); + + EXPECT_ANY_THROW(HostIrEvaluator hie(std::move(hic))); +} + } // namespace hir } // namespace nvfuser From 9820d5aba5b81a02ee96dbcc5ba651837d6add8c Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 04:48:56 -0700 Subject: [PATCH 02/38] harden and simplify allocation in for loop test --- tests/cpp/test_host_irs.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 
deletions(-) diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index ba296f6f357..654e60c5d31 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -1216,13 +1216,11 @@ TEST_F(AllocationTest, inHostForLoop) { TensorView* tv0 = makeConcreteTensor(sizes); tv0->setMemoryType(MemoryType::Global); auto* allocate = IrBuilder::create(tv0, MemoryType::Global); - TensorView* tv1 = set(tv0); for_loop->body().push_back(allocate); - for_loop->body().push_back(tv1->definition()); hic->pushBackTopLevelExprs(for_loop); - hic->addOutput(tv1); + hic->addOutput(tv0); HostIrEvaluator hie(std::move(hic)); From 2ad510d47138e675e45ae9ff0af4c2726fea7c0a Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 05:16:37 -0700 Subject: [PATCH 03/38] refactor and clean host ir lowering and segmentation --- csrc/fusion_segmenter.cpp | 15 ++---- csrc/fusion_segmenter.h | 3 +- csrc/host_ir/container.h | 4 ++ csrc/host_ir/lower.cpp | 87 ++++++++++++++++++++++++----------- csrc/host_ir/lower.h | 7 +++ tests/cpp/test_resharding.cpp | 2 +- 6 files changed, 79 insertions(+), 39 deletions(-) diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp index d5368c41066..90e5ba20cf8 100644 --- a/csrc/fusion_segmenter.cpp +++ b/csrc/fusion_segmenter.cpp @@ -3919,15 +3919,8 @@ bool SegmentCandidateFinder::codeGenSupportedMerge( NVF_ERROR( areDirectlyConnected(group1, group2), "only support testing immediate producer-consumer groups"); - if (options_.only_segment_resharding_exprs) { - for (auto group : {group1, group2}) { - for (auto expr : group->exprs()) { - if (isResharding(expr)) { - return false; - } - } - } - return true; + if (options_.custom_should_merge_groups != nullptr) { + return (*options_.custom_should_merge_groups)(group1, group2); } return tryMerge(segmented_fusion_.get(), runtimeInfo(), group1, group2) != SchedulerType::None; @@ -3938,7 +3931,7 @@ bool SegmentCandidateFinder::codeGenSupportedMerge( SchedulerType 
SegmentCandidateFinder::deriveSchedulerType( SegmentedGroup* group) { FUSER_PERF_SCOPE("SegmentCandidateFinder::deriveSchedulerType"); - if (options_.only_segment_resharding_exprs) { + if (options_.custom_should_merge_groups != nullptr) { // We don't need to generate a SchedulerType for multidevice segments at // this moment return SchedulerType::None; @@ -3958,7 +3951,7 @@ SegmentCandidateFinder::SegmentCandidateFinder( : options_(options), runtime_inputs_(inputs) { FUSER_PERF_SCOPE("SegmentCandidateFinder::SegmentCandidateFinder"); NVF_ERROR( - !options_.only_segment_resharding_exprs || + options_.custom_should_merge_groups == nullptr || (!options_.run_translate_welford && !options_.run_combine_reductions && options_.run_herrmann_merge && options_.run_final_merge), diff --git a/csrc/fusion_segmenter.h b/csrc/fusion_segmenter.h index 1e32d42bb36..cc721d59301 100644 --- a/csrc/fusion_segmenter.h +++ b/csrc/fusion_segmenter.h @@ -501,7 +501,8 @@ struct SegmentCandidateFinderOptions { bool run_combine_reductions = true; bool run_herrmann_merge = true; bool run_final_merge = true; - bool only_segment_resharding_exprs = false; + bool (*custom_should_merge_groups)(SegmentedGroup*, SegmentedGroup*) = + nullptr; }; //! SegmentCandidateFinder diff --git a/csrc/host_ir/container.h b/csrc/host_ir/container.h index 7dcd66b4436..eb322275422 100644 --- a/csrc/host_ir/container.h +++ b/csrc/host_ir/container.h @@ -41,6 +41,10 @@ class HostIrContainer final : public Fusion { //! 
Print to an output stream std::ostream& print(std::ostream& os) const; + void resetTopLevelExprs(std::vector exprs) { + top_level_exprs_ = std::move(exprs); + } + const std::vector& topLevelExprs() const; void pushBackTopLevelExprs(Expr* expr); diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 32febda37a0..4838961ed85 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -6,8 +6,8 @@ */ // clang-format on #include -#include #include +#include #include #include #include @@ -592,6 +592,29 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( get_current_stream, allocate_tva_allgathered, allocate_tv_out, for_loop}; } +bool HostIrLower::isLoweredAsStandaloneHostOp(Expr* expr) { + return expr->isOneOf< + MatmulOp, + SliceOp, + SelectOp, + LinearOp, + Communication, + P2PCommunication>(); +} + +bool HostIrLower::ShouldMergeSegmentedGroups( + SegmentedGroup* group1, + SegmentedGroup* group2) { + for (auto group : {group1, group2}) { + for (auto expr : group->exprs()) { + if (isLoweredAsStandaloneHostOp(expr)) { + return false; + } + } + } + return true; +} + std::unique_ptr HostIrLower::lower( std::unique_ptr fusion, DeviceIdxType my_device_index) { @@ -615,7 +638,7 @@ std::unique_ptr HostIrLower::lower( .run_combine_reductions = false, .run_herrmann_merge = true, .run_final_merge = true, - .only_segment_resharding_exprs = true}; + .custom_should_merge_groups = &ShouldMergeSegmentedGroups}; std::unique_ptr staged_fusion = SegmentCandidateFinder::segment( std::move(fusion), KernelArgumentHolder(), options, true); @@ -643,32 +666,18 @@ std::unique_ptr HostIrLower::lower( if (involvedDevices(group->exprs().at(0)).count(my_device_index) == 0) { continue; } - const bool is_resharding = std::any_of( - group->exprs().begin(), group->exprs().end(), [](auto expr) { - return isResharding(expr); - }); - if (is_resharding) { + // we decide whether to insert the Expr as a standalone op in the + // HostIRContainer, which will result in 
using ATen Op to evaluate it -- + // or, alternatively, to wrap them into a PostOnStream(HostUnit(.)) which + // will result in a kernel code generation. + if (std::all_of( + group->exprs().begin(), + group->exprs().end(), + isLoweredAsStandaloneHostOp)) { NVF_ERROR( group->exprs().size() == 1, - "Communication segments must contain only one Expr"); - for (auto* expr : HostIrLower::lower( - ir_cloner.clone(group->exprs().at(0)), my_device_index)) { - // Allocate the recv buffers of communications - if (expr->isA()) { - auto* communication = expr->as(); - TensorView* tv = communication->out(); - if (tv->getDeviceMesh().has(my_device_index)) { - auto* allocate = - IrBuilder::create(tv, MemoryType::Global); - hic->pushBackTopLevelExprs(allocate); - } - } - hic->pushBackTopLevelExprs(expr); - if (expr->isA()) { - auto wait = IrBuilder::create(expr->as()); - hic->pushBackTopLevelExprs(wait); - } - } + "Expr executed as a standalone op cannot be fused"); + hic->pushBackTopLevelExprs(ir_cloner.clone(group->exprs().at(0))); } else { auto host_unit = IrBuilder::create( staged_fusion->makeFusion(group).second); @@ -684,6 +693,32 @@ std::unique_ptr HostIrLower::lower( hic->addOutput(ir_cloner.clone(output)); } + std::vector new_top_level_exprs; + for (auto top_level_expr : hic->topLevelExprs()) { + if (!isResharding(top_level_expr)) { + new_top_level_exprs.push_back(top_level_expr); + continue; + } + for (auto* expr : HostIrLower::lower(top_level_expr, my_device_index)) { + // Allocate the recv buffers of communications + if (expr->isA()) { + auto* communication = expr->as(); + TensorView* tv = communication->out(); + if (tv->getDeviceMesh().has(my_device_index)) { + auto* allocate = + IrBuilder::create(tv, MemoryType::Global); + new_top_level_exprs.push_back(allocate); + } + } + new_top_level_exprs.push_back(expr); + if (expr->isA()) { + auto wait = IrBuilder::create(expr->as()); + new_top_level_exprs.push_back(wait); + } + } + } + 
hic->resetTopLevelExprs(new_top_level_exprs); + return hic; } diff --git a/csrc/host_ir/lower.h b/csrc/host_ir/lower.h index 314c880c264..5ce2386a187 100644 --- a/csrc/host_ir/lower.h +++ b/csrc/host_ir/lower.h @@ -7,6 +7,7 @@ // clang-format on #pragma once +#include #include #include #include @@ -35,6 +36,12 @@ class HostIrLower { std::unique_ptr fusion, DeviceIdxType my_device_index); + static bool isLoweredAsStandaloneHostOp(Expr* expr); + + static bool ShouldMergeSegmentedGroups( + SegmentedGroup* group1, + SegmentedGroup* group2); + private: std::vector lowerToCollectiveBasedPipelinedGemmComm(Expr* expr); const HostIrLowerParams params_; diff --git a/tests/cpp/test_resharding.cpp b/tests/cpp/test_resharding.cpp index 3606582cdb1..69e6b307f42 100644 --- a/tests/cpp/test_resharding.cpp +++ b/tests/cpp/test_resharding.cpp @@ -53,7 +53,7 @@ class ReshardingTest : public NVFuserFixtureParamTest { .run_combine_reductions = false, .run_herrmann_merge = true, .run_final_merge = true, - .only_segment_resharding_exprs = true}; + .custom_should_merge_groups = &HostIrLower::ShouldMergeSegmentedGroups}; auto segmented_fusion = SegmentCandidateFinder::segment( std::move(fusion_), KernelArgumentHolder(), options, true); From 46c6717d5bea2a4cdf11bc418ac4e3dc201ac3a5 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 05:55:05 -0700 Subject: [PATCH 04/38] lint --- csrc/host_ir/lower.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 4838961ed85..9741aca1bf0 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -594,12 +594,12 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( bool HostIrLower::isLoweredAsStandaloneHostOp(Expr* expr) { return expr->isOneOf< - MatmulOp, - SliceOp, - SelectOp, - LinearOp, - Communication, - P2PCommunication>(); + MatmulOp, + SliceOp, + SelectOp, + LinearOp, + Communication, + P2PCommunication>(); } bool 
HostIrLower::ShouldMergeSegmentedGroups( From 73d5d7b4db958dc18698ae1edd577f63d12911e9 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 07:26:05 -0700 Subject: [PATCH 05/38] put back isResharding as the condition for lower to a standalone host expr --- csrc/host_ir/lower.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 9741aca1bf0..b2622cccd90 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -593,13 +593,7 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( } bool HostIrLower::isLoweredAsStandaloneHostOp(Expr* expr) { - return expr->isOneOf< - MatmulOp, - SliceOp, - SelectOp, - LinearOp, - Communication, - P2PCommunication>(); + return isResharding(expr); } bool HostIrLower::ShouldMergeSegmentedGroups( From e35ddd0302b26c05464de7efc36de92660d871f3 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 11 Apr 2025 05:01:53 -0700 Subject: [PATCH 06/38] minor comments --- csrc/fusion_segmenter.cpp | 2 +- csrc/fusion_segmenter.h | 4 ++-- csrc/host_ir/lower.cpp | 12 ++++++------ csrc/host_ir/lower.h | 4 ++-- tests/cpp/test_resharding.cpp | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp index 90e5ba20cf8..b522f6d11ef 100644 --- a/csrc/fusion_segmenter.cpp +++ b/csrc/fusion_segmenter.cpp @@ -3920,7 +3920,7 @@ bool SegmentCandidateFinder::codeGenSupportedMerge( areDirectlyConnected(group1, group2), "only support testing immediate producer-consumer groups"); if (options_.custom_should_merge_groups != nullptr) { - return (*options_.custom_should_merge_groups)(group1, group2); + return (options_.custom_should_merge_groups)(group1, group2); } return tryMerge(segmented_fusion_.get(), runtimeInfo(), group1, group2) != SchedulerType::None; diff --git a/csrc/fusion_segmenter.h b/csrc/fusion_segmenter.h index cc721d59301..6cca1f0c727 100644 --- a/csrc/fusion_segmenter.h +++ 
b/csrc/fusion_segmenter.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -501,8 +502,7 @@ struct SegmentCandidateFinderOptions { bool run_combine_reductions = true; bool run_herrmann_merge = true; bool run_final_merge = true; - bool (*custom_should_merge_groups)(SegmentedGroup*, SegmentedGroup*) = - nullptr; + std::function custom_should_merge_groups = nullptr; }; //! SegmentCandidateFinder diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index b2622cccd90..d647f0d442e 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -592,16 +592,16 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( get_current_stream, allocate_tva_allgathered, allocate_tv_out, for_loop}; } -bool HostIrLower::isLoweredAsStandaloneHostOp(Expr* expr) { +bool HostIrLower::isLowerableAsStandaloneHostOp(Expr* expr) { return isResharding(expr); } -bool HostIrLower::ShouldMergeSegmentedGroups( +bool HostIrLower::shouldMergeSegmentedGroups( SegmentedGroup* group1, SegmentedGroup* group2) { for (auto group : {group1, group2}) { - for (auto expr : group->exprs()) { - if (isLoweredAsStandaloneHostOp(expr)) { + for (Expr* expr : group->exprs()) { + if (isLowerableAsStandaloneHostOp(expr)) { return false; } } @@ -632,7 +632,7 @@ std::unique_ptr HostIrLower::lower( .run_combine_reductions = false, .run_herrmann_merge = true, .run_final_merge = true, - .custom_should_merge_groups = &ShouldMergeSegmentedGroups}; + .custom_should_merge_groups = &shouldMergeSegmentedGroups}; std::unique_ptr staged_fusion = SegmentCandidateFinder::segment( std::move(fusion), KernelArgumentHolder(), options, true); @@ -667,7 +667,7 @@ std::unique_ptr HostIrLower::lower( if (std::all_of( group->exprs().begin(), group->exprs().end(), - isLoweredAsStandaloneHostOp)) { + isLowerableAsStandaloneHostOp)) { NVF_ERROR( group->exprs().size() == 1, "Expr executed as a standalone op cannot be fused"); diff --git a/csrc/host_ir/lower.h b/csrc/host_ir/lower.h index 
5ce2386a187..5b1ecadece8 100644 --- a/csrc/host_ir/lower.h +++ b/csrc/host_ir/lower.h @@ -36,9 +36,9 @@ class HostIrLower { std::unique_ptr fusion, DeviceIdxType my_device_index); - static bool isLoweredAsStandaloneHostOp(Expr* expr); + static bool isLowerableAsStandaloneHostOp(Expr* expr); - static bool ShouldMergeSegmentedGroups( + static bool shouldMergeSegmentedGroups( SegmentedGroup* group1, SegmentedGroup* group2); diff --git a/tests/cpp/test_resharding.cpp b/tests/cpp/test_resharding.cpp index 69e6b307f42..5e4bd2b749a 100644 --- a/tests/cpp/test_resharding.cpp +++ b/tests/cpp/test_resharding.cpp @@ -53,7 +53,7 @@ class ReshardingTest : public NVFuserFixtureParamTest { .run_combine_reductions = false, .run_herrmann_merge = true, .run_final_merge = true, - .custom_should_merge_groups = &HostIrLower::ShouldMergeSegmentedGroups}; + .custom_should_merge_groups = &HostIrLower::shouldMergeSegmentedGroups}; auto segmented_fusion = SegmentCandidateFinder::segment( std::move(fusion_), KernelArgumentHolder(), options, true); From 4964680a32b58bb0818a784eff6e758625e49ace Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 11 Apr 2025 05:07:59 -0700 Subject: [PATCH 07/38] lint --- csrc/fusion_segmenter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/csrc/fusion_segmenter.h b/csrc/fusion_segmenter.h index 6cca1f0c727..a975716676d 100644 --- a/csrc/fusion_segmenter.h +++ b/csrc/fusion_segmenter.h @@ -502,7 +502,8 @@ struct SegmentCandidateFinderOptions { bool run_combine_reductions = true; bool run_herrmann_merge = true; bool run_final_merge = true; - std::function custom_should_merge_groups = nullptr; + std::function + custom_should_merge_groups = nullptr; }; //! 
SegmentCandidateFinder From ed8dc7c825a466f6cf9b3e9ebbe8fb8017eda182 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 05:52:08 -0700 Subject: [PATCH 08/38] add host ir support for set reduce and binary op --- csrc/host_ir/executor.cpp | 99 +++++++++++++ csrc/host_ir/executor.h | 3 + csrc/host_ir/lower.cpp | 13 +- tests/cpp/test_host_irs.cpp | 180 ++++++++++++++++++++++++ tests/cpp/test_multidevice_pipeline.cpp | 131 ----------------- 5 files changed, 293 insertions(+), 133 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 30bca9920db..4c0e7f441eb 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -570,6 +570,31 @@ void HostIrEvaluator::handle(LinearOp* linear) { } } +void HostIrEvaluator::handle(LoadStoreOp* load_store_op) { + NVF_ERROR( + load_store_op->out()->isA(), "out must be a TensorView"); + auto* out_tv = load_store_op->out()->as(); + auto in_tensor = getKnownConcreteData(load_store_op->in()).as(); + + // If output has root domain, compute and apply permutation + if (out_tv->hasRoot()) { + auto permutation = ir_utils::computePermutation( + out_tv->getRootDomain(), out_tv->getLogicalDomain()); + NVF_ERROR( + permutation.has_value(), + "The logical domain of a Set.Permute is supposed to be a permutation of the root domain: ", + out_tv->toString()); + in_tensor = in_tensor.permute(*permutation).contiguous(); + } + if (!isKnown(load_store_op->out())) { + bind(load_store_op->out(), in_tensor); + } else { + auto out_tensor = + getKnownConcreteData(load_store_op->out()).as(); + out_tensor.copy_(in_tensor); + } +} + void HostIrEvaluator::handle(kir::Allocate* allocate) { NVF_ERROR( allocate->buffer()->isA(), @@ -593,6 +618,80 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { bind(tv, tensor); } +void HostIrEvaluator::handle(BinaryOp* binary_op) { + if (!isKnown(binary_op->outputs().at(0))) { + return unhandled(binary_op); + } + + auto lhs = 
getKnownConcreteData(binary_op->inputs().at(0)).as(); + auto rhs = getKnownConcreteData(binary_op->inputs().at(1)).as(); + auto output = + getKnownConcreteData(binary_op->outputs().at(0)).as(); + + switch (binary_op->getBinaryOpType()) { + case BinaryOpType::Add: + at::add_out(output, lhs, rhs); + break; + case BinaryOpType::Sub: + at::sub_out(output, lhs, rhs); + break; + case BinaryOpType::Mul: + at::mul_out(output, lhs, rhs); + break; + case BinaryOpType::Div: + at::div_out(output, lhs, rhs); + break; + default: + NVF_CHECK( + false, + "Unexpected operator type: ", + binary_op->getBinaryOpType(), + " in ", + binary_op); + } +} + +void HostIrEvaluator::handle(ReductionOp* reduction_op) { + auto input_tv = reduction_op->in()->as(); + auto output_tv = reduction_op->out()->as(); + if (!isKnown(output_tv)) { + return unhandled(reduction_op); + } + + NVF_ERROR( + !output_tv->hasRoot(), + "Evaluation for rFactored reductions is not supported."); + auto input = getKnownConcreteData(input_tv).as(); + auto output = getKnownConcreteData(output_tv).as(); + + std::vector reduction_axes; + for (const auto i : + c10::irange(int64_t(output_tv->getLogicalDomain().size()))) { + auto ax = output_tv->getLogicalDomain().at(i); + if (ax->isReduction()) { + reduction_axes.push_back(i); + } + } + switch (reduction_op->getReductionOpType()) { + case BinaryOpType::Add: + at::sum_out(output, input, reduction_axes); + return; + case BinaryOpType::Max: + at::amax_out(output, input, reduction_axes); + return; + case BinaryOpType::Min: + at::amin_out(output, input, reduction_axes); + return; + default: + NVF_CHECK( + false, + "Unexpected operator type: ", + reduction_op->getReductionOpType(), + " in ", + reduction_op); + } +} + void HostIrEvaluator::unhandled(Statement* stmt) { NVF_ERROR(stmt->isA(), stmt, " must be an Expr"); auto* expr = stmt->as(); diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index 8f4d425fd09..faf301f0819 100644 --- a/csrc/host_ir/executor.h +++ 
b/csrc/host_ir/executor.h @@ -133,6 +133,9 @@ class HostIrEvaluator final : public OptOutDispatch { void handle(MatmulOp* matmul) override; void handle(LinearOp* linear) override; void handle(kir::Allocate* allocate) override; + void handle(LoadStoreOp* load_store_op) override; + void handle(BinaryOp* binary_op) override; + void handle(ReductionOp* reduction_op) override; void unhandled(Statement* stmt) override; c10::cuda::CUDAStream getCUDAStream(Stream* stream); diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index d647f0d442e..d8302a8a119 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -592,8 +592,17 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( get_current_stream, allocate_tva_allgathered, allocate_tv_out, for_loop}; } -bool HostIrLower::isLowerableAsStandaloneHostOp(Expr* expr) { - return isResharding(expr); +bool HostIrLower::isLoweredAsStandaloneHostOp(Expr* expr) { + return expr->isOneOf< + MatmulOp, + SliceOp, + SelectOp, + LinearOp, + LoadStoreOp, + BinaryOp, + ReductionOp, + Communication, + P2PCommunication>(); } bool HostIrLower::shouldMergeSegmentedGroups( diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index 654e60c5d31..751d5251aab 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -1276,6 +1276,186 @@ TEST_F(HirAlias, ThrowOnInputAlias) { EXPECT_ANY_THROW(HostIrEvaluator hie(std::move(hic))); } +using HirSetTest = NVFuserTest; + +TEST_F(HirSetTest, HostIr) { + const std::vector sizes = {8, 64}; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto* in = makeConcreteTensor(sizes); + auto* out = makeConcreteTensor(sizes); + auto* set = IrBuilder::create(LoadStoreOpType::Set, out, in); + hic->addInput(in); + hic->addInput(out); + hic->pushBackTopLevelExprs(set); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto in_aten = at::randn(sizes, options); + auto out_aten = 
at::empty(sizes, options); + + hie.runWithInput({{in, in_aten}, {out, out_aten}}); + + EXPECT_TRUE(out_aten.equal(in_aten)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << in_aten; +} + +class HirBinaryOpTest : public NVFuserFixtureParamTest { + protected: + at::Tensor executeBinaryOp(at::Tensor lhs, at::Tensor rhs) { + switch (GetParam()) { + case BinaryOpType::Add: + return lhs + rhs; + case BinaryOpType::Sub: + return lhs - rhs; + case BinaryOpType::Mul: + return lhs * rhs; + case BinaryOpType::Div: + return lhs / rhs; + default: + NVF_ERROR("Unsupported binary op type ", GetParam()); + return at::Tensor(); + } + } +}; + +TEST_P(HirBinaryOpTest, PreAllocatedOutputs) { + const std::vector sizes = {8, 64}; + const auto& binary_op_type = GetParam(); + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto* lhs = makeConcreteTensor(sizes); + auto* rhs = makeConcreteTensor(sizes); + auto* out = makeConcreteTensor(sizes); + auto* binary_op = IrBuilder::create(binary_op_type, out, lhs, rhs); + hic->addInput(lhs); + hic->addInput(rhs); + hic->addInput(out); + hic->pushBackTopLevelExprs(binary_op); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto lhs_aten = at::randn(sizes, options); + auto rhs_aten = at::randn(sizes, options); + auto out_aten = at::empty(sizes, options); + + hie.runWithInput({{lhs, lhs_aten}, {rhs, rhs_aten}, {out, out_aten}}); + + at::Tensor expected_out = executeBinaryOp(lhs_aten, rhs_aten); + EXPECT_TRUE(expected_out.equal(out_aten)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + +TEST_P(HirBinaryOpTest, NonPreAllocatedOutputs) { + const std::vector sizes = {8, 64}; + const auto& binary_op_type = GetParam(); + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto* lhs = makeConcreteTensor(sizes); + auto* rhs = makeConcreteTensor(sizes); + auto* out = binaryOp(binary_op_type, lhs, rhs); + 
hic->addInput(lhs); + hic->addInput(rhs); + hic->addOutput(out); + hic->pushBackTopLevelExprs(out->definition()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto lhs_aten = at::randn(sizes, options); + auto rhs_aten = at::randn(sizes, options); + + auto out_aten = + hie.runWithInput({{lhs, lhs_aten}, {rhs, rhs_aten}})[0].as(); + + at::Tensor expected_out = executeBinaryOp(lhs_aten, rhs_aten); + EXPECT_TRUE(expected_out.equal(out_aten)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + +INSTANTIATE_TEST_SUITE_P( + , + HirBinaryOpTest, + testing::Values( + BinaryOpType::Add, + BinaryOpType::Sub, + BinaryOpType::Mul, + BinaryOpType::Div), + [](const testing::TestParamInfo& info) -> std::string { + std::stringstream ss; + ss << "BinaryOpType_" << info.param; + return ss.str(); + }); + +using HirReductionOpTest = NVFuserTest; + +TEST_F(HirReductionOpTest, PreAllocatedOutputs) { + constexpr int64_t size0 = 8, size1 = 64; + constexpr int64_t reduction_axis = 1; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto* in = makeConcreteTensor({size0, size1}); + auto* out = newForReduction(in, {reduction_axis}, in->dtype()); + auto* reduction_op = IrBuilder::create( + BinaryOpType::Add, hic->zeroVal(), out, in); + hic->addInput(in); + hic->addOutput(out); + hic->pushBackTopLevelExprs(reduction_op); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto in_aten = at::randn({size0, size1}, options); + auto out_aten = at::empty({size0}, options); + + hie.runWithInput({{in, in_aten}, {out, out_aten}}); + + at::Tensor expected_out = in_aten.sum(reduction_axis); + EXPECT_TRUE(expected_out.equal(out_aten)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + +TEST_F(HirReductionOpTest, NonPreAllocatedOutputs) { + constexpr int64_t size0 = 8, size1 = 64; + constexpr int64_t 
reduction_axis = 1; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto* in = makeConcreteTensor({size0, size1}); + auto* out = sum(in, {reduction_axis}); + hic->addInput(in); + hic->addOutput(out); + hic->pushBackTopLevelExprs(out->definition()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto in_aten = at::randn({size0, size1}, options); + auto out_aten = at::empty({size0}, options); + + hie.runWithInput({{in, in_aten}, {out, out_aten}}); + + at::Tensor expected_out = in_aten.sum(reduction_axis); + EXPECT_TRUE(expected_out.equal(out_aten)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + } // namespace hir } // namespace nvfuser diff --git a/tests/cpp/test_multidevice_pipeline.cpp b/tests/cpp/test_multidevice_pipeline.cpp index 5985571c57a..12dfed5dd43 100644 --- a/tests/cpp/test_multidevice_pipeline.cpp +++ b/tests/cpp/test_multidevice_pipeline.cpp @@ -457,135 +457,4 @@ INSTANTIATE_TEST_SUITE_P( testing::Values(0, 1), testing::Values(true))); -// Different scheduling modes used in -// PipelineTestStagedReduction.StagedReduction -enum class SchedulingMode { - // Manual interdevice scheduling, no intra-device scheduling - InterDeviceOnly, - // Manual inter-/intra-device scheduling - Manual, - // Manual inter-device scheduling, composed with fully automated intra-device - // scheduling (through FusionExecutorCache) - Automatic, -}; - -std::ostream& operator<<(std::ostream& out, const SchedulingMode& mode) { - switch (mode) { - case SchedulingMode::InterDeviceOnly: - out << "InterDeviceOnly"; - break; - case SchedulingMode::Manual: - out << "Manual"; - break; - case SchedulingMode::Automatic: - out << "Automatic"; - break; - } - return out; -} - -class PipelineTestStagedReduction - : public PipelineTest, - public ::testing::WithParamInterface {}; - -// 1D staged reduction -// Inputs: X[num_devices,B,C] -TEST_P(PipelineTestStagedReduction, 
StagedReduction) { - auto scheduling_mode = GetParam(); - - const int num_devices = communicator_->size(); - constexpr int B = 8; - constexpr int C = 64; - - FusionGuard fg(fusion.get()); - // The first dimension is made symbolic so `tv_out->definition()` won't - // become a squeeze when num_devices == 1. This wouldn't be a problem for - // automatic mode. However, for the manual mode, the scheduling code below - // assumes `tv_out->definition()` can be lowered to communication. A squeeze - // can't. - TensorView* tv0 = TensorViewBuilder() - .dtype(DataType::Float) - .contiguity(true) - .shape({-1, B, C}) - .build(); - auto mesh = DeviceMesh::createForNumDevices(num_devices); - tv0->setDeviceMesh(mesh); - TensorView* tv1 = sum(tv0, {2}); - TensorView* tv_out = sum(tv1, {0}); - fusion->addInput(tv0); - fusion->addOutput(tv_out); - - for (auto* tv : {tv0, tv1}) { - tv->axis(0)->parallelize(ParallelType::DIDx); - } - - // Intra-device reduction scheduling for the first reduction: - switch (scheduling_mode) { - case SchedulingMode::InterDeviceOnly: - break; - case SchedulingMode::Manual: { - // inspired from NVFuserTest.FusionReduction1_CUDA - // tv0[I0{A}, I1{B}, I2{C}] - tv1->split(2, 32); - // tv1[I0{A}, I1{B}, R2o{C/32}, R2i{32}] = tv0[I0{A}, I1{B}, I2{C}] - tv1->split(2, 4); - // clang-format off - // tv1[I0{A}, I1{B}, R2oo{C/32/4)}, R2oi{4}, R2i{32}] = tv0[I0{A}, I1{B}, I2{C}] - // clang-format on - - TensorView* tv2 = tv1->rFactor({2}); - // clang-format off - // tv2[I0{A}, I1{B}, R2oo{C/32/4)}, I2oi{4}, I2i{32}] = tv0[I0{A}, I1{B}, I2{C}] - // tv1[I0{A}, I1{B}, R2oi{4}, R2i{32}] = tv2[I0{A}, I1{B}, R2oo{C/32/4)}, I2oi{4}, I2i{32}] - // clang-format on - - TensorView* tv3 = tv1->rFactor({2}); - // clang-format off - // tv2[I0{A}, I1{B}, R2oo{C/32/4)}, I2oi{4}, I2i{32}] = tv0[I0{A}, I1{B}, I2{C}] - // tv3[I0{A}, I1{B}, R2oi{4}, I2i{32}] = tv2[I0{A}, I1{B}, R2oo{C/32/4)}, I2oi{4}, I2i{32}] - // tv1[I0{A}, I1{B}, R2i{32}] = tv3[I0{A}, I1{B}, R2oi{4}, I2i{32}] - // 
clang-format on - - // tv1 is a segment boundary so must be in global. This wouldn't be - // needed if the fusion were scheduled automatically. - tv1->setMemoryType(MemoryType::Global); - - // Use `tv2` as the reference tensor because it contains the most - // parallel IterDomains. - tv2->axis(1)->parallelize(ParallelType::BIDx); - tv2->axis(3)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike( - tv2, - /*pos=*/-1, - // Don't propagate the parallelization to `tv_out` because that's in - // a different, resharding segment. - /*selected_tv=*/{tv0, tv1, tv2, tv3}); - inlineMost(); - break; - } - case SchedulingMode::Automatic: - host_ir_executor_params.use_fusion_executor_cache = true; - break; - } - - at::Tensor unsharded_input_tensor = - at::randn({num_devices, B, C}, tensor_options); - at::Tensor ref_unsharded_output_tensor = - unsharded_input_tensor.sum(at::IntArrayRef({0, 2})); - unsharded_args = {unsharded_input_tensor}; - ref_unsharded_outputs = {ref_unsharded_output_tensor}; - - executeAndValidate(/* validate_with_prescribed_values */ true); -} - -INSTANTIATE_TEST_SUITE_P( - , - PipelineTestStagedReduction, - testing::Values( - SchedulingMode::InterDeviceOnly, - SchedulingMode::Manual, - SchedulingMode::Automatic), - testing::PrintToStringParamName()); - } // namespace nvfuser From 85b7b751b82c31f5b019773d5adfdec21210ab8b Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 11 Apr 2025 06:26:51 -0700 Subject: [PATCH 09/38] move .contiguous to be in postScatter --- csrc/host_ir/executor.cpp | 2 +- csrc/multidevice/communication.cpp | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 4c0e7f441eb..ab3e08bfb6f 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -584,7 +584,7 @@ void HostIrEvaluator::handle(LoadStoreOp* load_store_op) { permutation.has_value(), "The logical domain of a 
Set.Permute is supposed to be a permutation of the root domain: ", out_tv->toString()); - in_tensor = in_tensor.permute(*permutation).contiguous(); + in_tensor = in_tensor.permute(*permutation); } if (!isKnown(load_store_op->out())) { bind(load_store_op->out(), in_tensor); diff --git a/csrc/multidevice/communication.cpp b/csrc/multidevice/communication.cpp index 13fc3ce36ea..463cffd2879 100644 --- a/csrc/multidevice/communication.cpp +++ b/csrc/multidevice/communication.cpp @@ -352,6 +352,11 @@ c10::intrusive_ptr postScatter( c10d::Backend* backend, at::Tensor input_tensor, at::Tensor output_tensor) { + + if (my_device_index == communication->root()) { + input_tensor = input_tensor.contiguous(); + } + if (my_device_index == communication->root() && !communication->out()->getDeviceMesh().has(communication->root())) { output_tensor = at::empty_like(input_tensor.slice(0, 0, 1)); From 01e94a7bf3dd6ab271338d23853cd37117ece3af Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 11 Apr 2025 06:57:46 -0700 Subject: [PATCH 10/38] lint and build issue --- csrc/host_ir/lower.cpp | 2 +- csrc/multidevice/communication.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index d8302a8a119..93badafceb4 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -592,7 +592,7 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( get_current_stream, allocate_tva_allgathered, allocate_tv_out, for_loop}; } -bool HostIrLower::isLoweredAsStandaloneHostOp(Expr* expr) { +bool HostIrLower::isLowerableAsStandaloneHostOp(Expr* expr) { return expr->isOneOf< MatmulOp, SliceOp, diff --git a/csrc/multidevice/communication.cpp b/csrc/multidevice/communication.cpp index 463cffd2879..b8bcf857fa2 100644 --- a/csrc/multidevice/communication.cpp +++ b/csrc/multidevice/communication.cpp @@ -352,7 +352,6 @@ c10::intrusive_ptr postScatter( c10d::Backend* backend, at::Tensor input_tensor, at::Tensor output_tensor) { 
- if (my_device_index == communication->root()) { input_tensor = input_tensor.contiguous(); } From e1db5183158a3662827a6cb70caae7e63c0e2191 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 14 Apr 2025 06:49:48 -0700 Subject: [PATCH 11/38] reviews --- csrc/host_ir/container.h | 2 +- csrc/host_ir/executor.h | 2 +- tests/cpp/test_host_irs.cpp | 41 ++++++++++++++++++++++++++++++++----- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/csrc/host_ir/container.h b/csrc/host_ir/container.h index 7dcd66b4436..07bd896790b 100644 --- a/csrc/host_ir/container.h +++ b/csrc/host_ir/container.h @@ -56,7 +56,7 @@ class HostIrContainer final : public Fusion { Stream* getDefaultStream(); void markAlias(TensorView* original, const TensorView* new_alias) { - if (alias_.count(original)) { + while (alias_.count(original)) { original = alias_[original]->as(); } alias_[new_alias] = original; diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index 8f4d425fd09..c1486285d19 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -140,7 +140,7 @@ class HostIrEvaluator final : public OptOutDispatch { Val* getAlias(Val* val) const { const auto& aliases = container_->alias(); auto it = aliases.find(val); - return it != aliases.end() ? it->second : val; + return it != aliases.end() ? 
getAlias(it->second) : val; } bool isKnown(Val* value) const { diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index 654e60c5d31..6a41e47c744 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -139,7 +139,7 @@ TEST_P(HostIrTest, SingleFusion) { auto outputs = hie.runWithInput({{post_on_stream->inputs().at(0), t0}}); // validate the obtained results - GTEST_EXPECT_TRUE(torch::allclose(ref_output, outputs[0].as())); + EXPECT_TRUE(torch::allclose(ref_output, outputs[0].as())); } /* @@ -236,7 +236,7 @@ TEST_P(HostIrTest, TwoFusions) { auto outputs = hie.runWithInput({{post_on_stream_0->inputs().at(0), t0}}); // validate the obtained results - GTEST_EXPECT_TRUE(torch::allclose(ref_output, outputs[0].as())); + EXPECT_TRUE(torch::allclose(ref_output, outputs[0].as())); } /* @@ -365,7 +365,7 @@ TEST_P(HostIrTest, ThreeFusions) { auto outputs = hie.runWithInput({{post_on_stream_0->inputs().at(0), t0_0}}); // validate the obtained results - GTEST_EXPECT_TRUE(torch::allclose(t2_2, outputs[0].as())); + EXPECT_TRUE(torch::allclose(t2_2, outputs[0].as())); } // This unit test the for-loop IR by implementing a program that could be @@ -508,7 +508,7 @@ TEST_P(HostIrTest, PreAllocatedOutputs) { {post_on_stream->outputs().at(0), output}}); // validate the obtained results - GTEST_EXPECT_TRUE(torch::allclose(ref_output, output)) + EXPECT_TRUE(torch::allclose(ref_output, output)) << "Output: " << output << " Expected: " << ref_output; } @@ -724,7 +724,7 @@ TEST_P(StreamHostIrTest, SingleFusionMultipleStreams) { // validate the obtained results for (int i = 0; i < n_iterations; i++) { - GTEST_EXPECT_TRUE(torch::allclose(ref_output, outputs[i].as())); + EXPECT_TRUE(torch::allclose(ref_output, outputs[i].as())); } EXPECT_NE( c10::cuda::getDefaultCUDAStream(0), c10::cuda::getCurrentCUDAStream(0)); @@ -1262,6 +1262,37 @@ TEST_F(HirAlias, SetAndGet) { << "Expected output: " << expected_out; } +TEST_F(HirAlias, SetAndGetReversedOrder) { + 
const std::vector sizes = {8, 64}; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + TensorView* tv0 = makeConcreteTensor(sizes); + TensorView* tv1 = set(tv0); + TensorView* tv2 = makeConcreteTensor(sizes); + TensorView* tv3 = set(tv2); + TensorView* tv4 = makeConcreteTensor(sizes); + hic->markAlias(tv3, tv4); + hic->markAlias(tv1, tv2); + hic->addInput(tv0); + hic->addOutput(tv4); + hic->pushBackTopLevelExprs(tv1->definition()); + hic->pushBackTopLevelExprs(tv3->definition()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor tv0_aten = at::randn(sizes, options); + + at::Tensor out_aten = hie.runWithInput({{tv0, tv0_aten}})[0].as(); + + at::Tensor expected_out = tv0_aten; + EXPECT_TRUE(out_aten.equal(expected_out)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + TEST_F(HirAlias, ThrowOnInputAlias) { const std::vector sizes = {8, 64}; From 59622ff48520269c02ed7a6893fd32c39a2a5848 Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 15 Apr 2025 09:33:13 -0700 Subject: [PATCH 12/38] add comment --- csrc/fusion_segmenter.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/csrc/fusion_segmenter.h b/csrc/fusion_segmenter.h index a975716676d..7141085dcdf 100644 --- a/csrc/fusion_segmenter.h +++ b/csrc/fusion_segmenter.h @@ -502,6 +502,10 @@ struct SegmentCandidateFinderOptions { bool run_combine_reductions = true; bool run_herrmann_merge = true; bool run_final_merge = true; + // if provided, this custom function will be used to determine if two groups + // should be merged. If not provided, the tryMerge function will be used. This + // option is used in the context of MultiGpus where we proceed to a first + // segmentation to scoop out communications from compute. 
std::function custom_should_merge_groups = nullptr; }; From 25c618c787ef37cf671cae3d652c9cc778443d52 Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 15 Apr 2025 11:04:14 -0700 Subject: [PATCH 13/38] add comment --- csrc/fusion_segmenter.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp index b522f6d11ef..c98786a086d 100644 --- a/csrc/fusion_segmenter.cpp +++ b/csrc/fusion_segmenter.cpp @@ -3919,6 +3919,10 @@ bool SegmentCandidateFinder::codeGenSupportedMerge( NVF_ERROR( areDirectlyConnected(group1, group2), "only support testing immediate producer-consumer groups"); + // The segmemter should ideally be redesigned to be more flexible and + // decoupled from the schedulers, but for now, we just return + // `SchedulerType::None` as it is not relevant when the segmenter is + // used with a custom should-merge function. if (options_.custom_should_merge_groups != nullptr) { return (options_.custom_should_merge_groups)(group1, group2); } From eb46aef5aa46340e883d858fb4e707f27e6d28d3 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 16 Apr 2025 02:22:24 -0700 Subject: [PATCH 14/38] minor comment --- csrc/host_ir/container.cpp | 2 +- csrc/host_ir/executor.cpp | 22 +++++++++++----------- csrc/host_ir/executor.h | 2 +- tests/cpp/test_host_irs.cpp | 1 + 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/csrc/host_ir/container.cpp b/csrc/host_ir/container.cpp index a133c0521ba..83e668770fc 100644 --- a/csrc/host_ir/container.cpp +++ b/csrc/host_ir/container.cpp @@ -26,7 +26,7 @@ HostIrContainer::HostIrContainer(int64_t num_kernel_executors) HostIrContainer::~HostIrContainer() = default; Stream* HostIrContainer::getDefaultStream() { - if (!default_stream_) { + if (default_stream_ == nullptr) { default_stream_ = IrBuilder::createInContainer(this); } return default_stream_; diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 551f35487c9..89710eaae4b 100644 --- 
a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -327,7 +327,7 @@ void HostIrEvaluator::handle(Synchronize* synchronize) { void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { KernelArgumentHolder args; for (auto& input : launch_kernel->inputs()) { - args.push(getKnownConcreteData(input)); + args.push(getKnownConcreteValue(input)); } args.setDeviceIndex(); @@ -349,7 +349,7 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { void HostIrEvaluator::handle(PostOnStream* post_ir) { KernelArgumentHolder input_args; for (auto& input : post_ir->inputs()) { - input_args.push(getKnownConcreteData(input)); + input_args.push(getKnownConcreteValue(input)); } input_args.setDeviceIndex(); // placeholder for storing the outputs @@ -368,7 +368,7 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) { post_ir); if (use_preallocated_outputs) { for (auto output : post_ir->outputs()) { - outputs.push(getKnownConcreteData(output)); + outputs.push(getKnownConcreteValue(output)); } } @@ -599,9 +599,9 @@ void HostIrEvaluator::handle(MatmulOp* matmul) { TensorView* out = matmul->out(); if (isKnown(out)) { - auto t_a = getKnownConcreteData(a).as(); - auto t_b = getKnownConcreteData(b).as(); - auto t_out = getKnownConcreteData(out).as(); + auto t_a = getKnownConcreteValue(a).as(); + auto t_b = getKnownConcreteValue(b).as(); + auto t_out = getKnownConcreteValue(out).as(); at::matmul_out(t_out, t_a, t_b); } else { unhandled(matmul); @@ -619,12 +619,12 @@ void HostIrEvaluator::handle(LinearOp* linear) { return; } - auto in_at = getKnownConcreteData(in).as(); - auto weight_at = getKnownConcreteData(weight).as(); - auto out_at = getKnownConcreteData(out).as(); + auto in_at = getKnownConcreteValue(in).as(); + auto weight_at = getKnownConcreteValue(weight).as(); + auto out_at = getKnownConcreteValue(out).as(); if (linear->has_bias()) { - auto bias_at = getKnownConcreteData(bias).as(); + auto bias_at = getKnownConcreteValue(bias).as(); at::linear_out(out_at, in_at, 
weight_at.squeeze(), bias_at.squeeze()); } else { at::linear_out(out_at, in_at, weight_at.squeeze()); @@ -661,7 +661,7 @@ void HostIrEvaluator::unhandled(Statement* stmt) { for (auto input : expr->inputs()) { if (input->isA()) { // Tensor inputs must be already computed at this point - inputs.push_back(getKnownConcreteData(input)); + inputs.push_back(getKnownConcreteValue(input)); } else { inputs.push_back(expr_evaluator_.evaluate(input)); } diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index b603359d5b3..d71b74e0dda 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -149,7 +149,7 @@ class HostIrEvaluator final : public OptOutDispatch { return expr_evaluator_.isKnown(getAlias(value)); } - PolymorphicValue getKnownConcreteData(Val* val) const { + PolymorphicValue getKnownConcreteValue(Val* val) const { NVF_ERROR( isKnown(val), "value ", diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index 6a41e47c744..f8660f3bb83 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -6,6 +6,7 @@ */ // clang-format on #include +#include #include #include From 5f161f5357d9994cd8e9122e06262b02a343103b Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 16 Apr 2025 02:44:03 -0700 Subject: [PATCH 15/38] lint --- tests/cpp/test_host_irs.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index f8660f3bb83..6a41e47c744 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -6,7 +6,6 @@ */ // clang-format on #include -#include #include #include From d2655842281d22f3fdc811f0074ea959f12f378f Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 16 Apr 2025 03:29:01 -0700 Subject: [PATCH 16/38] Revert "move .contiguous to be in postScatter" This reverts commit 85b7b751b82c31f5b019773d5adfdec21210ab8b. 
--- csrc/host_ir/executor.cpp | 2 +- csrc/multidevice/communication.cpp | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index e5dd2a86ba1..b973c7f3277 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -645,7 +645,7 @@ void HostIrEvaluator::handle(LoadStoreOp* load_store_op) { permutation.has_value(), "The logical domain of a Set.Permute is supposed to be a permutation of the root domain: ", out_tv->toString()); - in_tensor = in_tensor.permute(*permutation); + in_tensor = in_tensor.permute(*permutation).contiguous(); } if (!isKnown(load_store_op->out())) { bind(load_store_op->out(), in_tensor); diff --git a/csrc/multidevice/communication.cpp b/csrc/multidevice/communication.cpp index c88aa5944e7..041e13eb80c 100644 --- a/csrc/multidevice/communication.cpp +++ b/csrc/multidevice/communication.cpp @@ -378,10 +378,6 @@ c10::intrusive_ptr postScatter( c10d::Backend* backend, at::Tensor input_tensor, at::Tensor output_tensor) { - if (my_device_index == communication->root()) { - input_tensor = input_tensor.contiguous(); - } - if (my_device_index == communication->root() && !communication->out()->getDeviceMesh().has(communication->root())) { output_tensor = at::empty_like(input_tensor.slice(0, 0, 1)); From a177cb4a015c49e7dfe7d376f237ecd643bda422 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 06:08:43 -0700 Subject: [PATCH 17/38] add ParallelType::Stream lowering pass in host Ir for single device fusions --- CMakeLists.txt | 2 + csrc/host_ir/executor.h | 4 + csrc/host_ir/lower.cpp | 8 + csrc/ir/internal_nodes.h | 4 + csrc/multidevice/executor.h | 4 + csrc/ops/indexing.cpp | 10 +- csrc/ops/indexing.h | 6 +- csrc/ops/utils.cpp | 27 +- csrc/ops/utils.h | 14 +- csrc/preseg_passes/stream_parallel_type.cpp | 347 +++++++++ csrc/preseg_passes/stream_parallel_type.h | 26 + tests/cpp/test_host_ir_stream_lowering.cpp | 823 ++++++++++++++++++++ 
tests/cpp/test_multidevice_host_ir.cpp | 10 + 13 files changed, 1271 insertions(+), 14 deletions(-) create mode 100644 csrc/preseg_passes/stream_parallel_type.cpp create mode 100644 csrc/preseg_passes/stream_parallel_type.h create mode 100644 tests/cpp/test_host_ir_stream_lowering.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b9865da34a7..3f2750b59b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -212,6 +212,7 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/preseg_passes/remove_empty.cpp ${NVFUSER_SRCS_DIR}/preseg_passes/reorder_sharded_axis.cpp ${NVFUSER_SRCS_DIR}/preseg_passes/segment_inplace_update.cpp + ${NVFUSER_SRCS_DIR}/preseg_passes/stream_parallel_type.cpp ${NVFUSER_SRCS_DIR}/preseg_passes/translate_no_reduction_matmul_to_mul_squeeze.cpp ${NVFUSER_SRCS_DIR}/preseg_passes/translate_repeat_to_expand.cpp ${NVFUSER_SRCS_DIR}/rng.cpp @@ -731,6 +732,7 @@ if(BUILD_TEST) list(APPEND HOSTIR_TEST_SRCS ${NVFUSER_ROOT}/tests/cpp/test_host_irs.cpp ${NVFUSER_ROOT}/tests/cpp/test_host_ir_integration.cpp + ${NVFUSER_ROOT}/tests/cpp/test_host_ir_stream_lowering.cpp ) add_test(test_host_ir "${HOSTIR_TEST_SRCS}" "") list(APPEND TEST_BINARIES test_host_ir) diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index dfe84fba068..89ac5119681 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -97,6 +97,10 @@ class HostIrEvaluator final : public OptOutDispatch { return container_->outputs(); } + auto* container() const { + return container_.get(); + } + std::ostream& print(std::ostream& os) const { return container_->print(os); }; diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 308e1399872..1a74d9a9f01 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -718,6 +719,10 @@ std::unique_ptr HostIrLower::lower( hic->addOutput(ir_cloner.clone(output)); } + for (auto tv : hic->allTvs()) { + tv->setMemoryType(MemoryType::Global); + } + 
std::vector new_top_level_exprs; for (auto top_level_expr : hic->topLevelExprs()) { if (!isResharding(top_level_expr)) { @@ -744,6 +749,9 @@ std::unique_ptr HostIrLower::lower( } hic->resetTopLevelExprs(new_top_level_exprs); + preseg_passes::OptimizationPass::runPass( + hic.get()); + return hic; } diff --git a/csrc/ir/internal_nodes.h b/csrc/ir/internal_nodes.h index 91d3ca4ec39..1a2bb1634bb 100644 --- a/csrc/ir/internal_nodes.h +++ b/csrc/ir/internal_nodes.h @@ -2477,6 +2477,10 @@ class ForLoop final : public Expr { return input(0); } + IterDomain* iterDomain() const { + return input(1)->as(); + } + Val* indexOrStartIfTrivial() const { return isTrivial() ? start() : index(); } diff --git a/csrc/multidevice/executor.h b/csrc/multidevice/executor.h index c1cc3e31cfe..7dd08a87f0a 100644 --- a/csrc/multidevice/executor.h +++ b/csrc/multidevice/executor.h @@ -103,6 +103,10 @@ class MultiDeviceExecutor { return host_ir_executor_->getFusionExecutorCaches(); }; + auto* hostIrEvaluator() const { + return host_ir_executor_.get(); + } + private: // holds the Communicator to be used for execution Communicator& comm_; diff --git a/csrc/ops/indexing.cpp b/csrc/ops/indexing.cpp index 5ff75065ff2..80c0ff84b85 100644 --- a/csrc/ops/indexing.cpp +++ b/csrc/ops/indexing.cpp @@ -19,8 +19,14 @@ namespace nvfuser { -TensorView* select(TensorView* tv, int64_t dim, Val* index) { - auto dom = TensorDomain::noReductions(tv->getLogicalDomain()); +TensorView* select( + TensorView* tv, + int64_t dim, + Val* index, + bool keep_reduction_axis) { + auto dom = keep_reduction_axis + ? 
tv->getLogicalDomain() + : TensorDomain::noReductions(tv->getLogicalDomain()); NVF_CHECK(!dom.empty(), "select can not be applied to 0d tensor."); std::vector new_root; diff --git a/csrc/ops/indexing.h b/csrc/ops/indexing.h index c8152c33f82..7a219c534a3 100644 --- a/csrc/ops/indexing.h +++ b/csrc/ops/indexing.h @@ -15,7 +15,11 @@ namespace nvfuser { -NVF_API TensorView* select(TensorView* tv, int64_t dim, Val* index); +NVF_API TensorView* select( + TensorView* tv, + int64_t dim, + Val* index, + bool keep_reduction_axis = false); // torch.index_select NVF_API TensorView* indexSelect( diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp index 8d3870d1a84..5d32c22e212 100644 --- a/csrc/ops/utils.cpp +++ b/csrc/ops/utils.cpp @@ -432,7 +432,9 @@ IterDomain* newOutputIterDomain( #pragma GCC diagnostic pop #endif -std::vector newOutputDomain(const std::vector& vals) { +std::vector newOutputDomain( + const std::vector& vals, + bool keep_reduction_axis) { std::vector tvs; for (auto val : vals) { if (auto* tv = dynamic_cast(val)) { @@ -443,14 +445,20 @@ std::vector newOutputDomain(const std::vector& vals) { !tvs.empty(), "Tried to create new output TensorView but received empty list."); - std::vector out_domain( - TensorDomain::noReductions(tvs[0]->getLogicalDomain()).size(), nullptr); + auto getLogicalDomain = + [keep_reduction_axis](TensorView* tv) -> std::vector { + return keep_reduction_axis + ? 
tv->getLogicalDomain() + : TensorDomain::noReductions(tv->getLogicalDomain()); + }; + + std::vector out_domain(getLogicalDomain(tvs[0]).size(), nullptr); for (const auto dim_i : arange(out_domain.size())) { std::vector input_ids; input_ids.reserve(tvs.size()); for (auto* tv : tvs) { - auto dom = TensorDomain::noReductions(tv->getLogicalDomain()); + auto dom = getLogicalDomain(tv); input_ids.emplace_back(dom[dim_i]); } out_domain[dim_i] = newOutputIterDomain(input_ids); @@ -458,8 +466,11 @@ std::vector newOutputDomain(const std::vector& vals) { return out_domain; } -TensorView* newOutputTV(const std::vector& vals, DataType dtype) { - auto out_domain = newOutputDomain(vals); +TensorView* newOutputTV( + const std::vector& vals, + DataType dtype, + bool keep_reduction_axis) { + auto out_domain = newOutputDomain(vals, keep_reduction_axis); auto* new_out = IrBuilder::create( IrBuilder::create( out_domain, TensorDomain::getContiguityFilledWith(out_domain, true)), @@ -502,12 +513,12 @@ std::vector maybeBroadcast(const std::vector& vals) { return out_vals; } -Val* newValLike(Val* val, DataType dtype) { +Val* newValLike(Val* val, DataType dtype, bool keep_reduction_axis) { NVF_CHECK( dtype != DataType::Null, "Invalid datatype provided for new value."); if (val->isA()) { - return newOutputTV({val}, dtype); + return newOutputTV({val}, dtype, keep_reduction_axis); } return newScalar(ValType::Others, dtype); diff --git a/csrc/ops/utils.h b/csrc/ops/utils.h index 94d6391cf45..1a2abda03fc 100644 --- a/csrc/ops/utils.h +++ b/csrc/ops/utils.h @@ -99,13 +99,21 @@ IterDomain* newOutputIterDomain( // output tensorview, e.g., for BinaryOp. `vals` can contain scalars, e.g, when // creating the output TensorView for `tv0+scalar`. This is for convenience and // scalars will be ignored. 
-std::vector newOutputDomain(const std::vector& vals); +std::vector newOutputDomain( + const std::vector& vals, + bool keep_reduction_axis = false); -TensorView* newOutputTV(const std::vector& vals, DataType dtype); +TensorView* newOutputTV( + const std::vector& vals, + DataType dtype, + bool keep_reduction_axis = false); std::vector maybeBroadcast(const std::vector& vals); -NVF_API Val* newValLike(Val* val, DataType dtype); +NVF_API Val* newValLike( + Val* val, + DataType dtype, + bool keep_reduction_axis = false); // returns the minimum init value for reduction: // -inf for floating type; diff --git a/csrc/preseg_passes/stream_parallel_type.cpp b/csrc/preseg_passes/stream_parallel_type.cpp new file mode 100644 index 00000000000..5a814c9a59a --- /dev/null +++ b/csrc/preseg_passes/stream_parallel_type.cpp @@ -0,0 +1,347 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nvfuser::preseg_passes { + +// returns the first stream axis in the domain, or nullptr if there is none. +// Throws if two axis are stream parallelized +IterDomain* getStreamAxis(const std::vector& domain) { + IterDomain* ret = nullptr; + for (auto id : domain) { + if (id->getParallelType() == ParallelType::Stream) { + NVF_CHECK( + ret == nullptr, + "Expected at most one stream axis in the domain, but found ", + id, + " and ", + ret); + ret = id; + } + } + return ret; +} + + + +// TODO: ideally we should look at the dag and use the segmenter. 
Here we take +// advantage of the linear structure of HostIrContainer::topLevelExprs to +// greedily merge the adjacent compatible stream for-loop bodies +void StreamParallelType::runPass(Fusion* fusion) { + // check that there are no stream axes in the inputs + NVF_CHECK( + std::all_of( + fusion->inputs().begin(), + fusion->inputs().end(), + [](Val* input) { + auto input_tv = dynamic_cast(input); + return input_tv == nullptr || + getStreamAxis(input_tv->getLoopDomain()) == nullptr; + }), + "Expected no stream axis in the TensorView inputs."); + + FusionGuard fg(fusion); // set as current container to register the newly + // created for-loops + hir::HostIrContainer* hic = dynamic_cast(fusion); + NVF_CHECK(hic, "Expected HostIrContainer"); + // needed ? + IdModel id_model(fusion); + id_model.buildAlmostExactGraph(); + + std::vector new_top_level_exprs; + // Step 1. Find the segments of expressions that can be merged into a single + // stream for-loop At the end of this step, new_top_level_exprs contains a + // list of expressions including newly created for-loops that will represent + // the stream parallelization, and the relevant expressions grouped inside the + // for-loops bodies. 
+ for (auto expr : hic->topLevelExprs()) { + // we only support exprs having at most 1 output for now + if (expr->outputs().size() == 0) { + new_top_level_exprs.push_back(expr); + continue; + } + NVF_CHECK( + expr->outputs().size() == 1, + "Each expr should have at most one output."); + TensorView* output = expr->output(0)->as(); + // retrieves the Loop IterDomain that is stream parallelized, if any + IterDomain* stream_axis = getStreamAxis(output->getLoopDomain()); + if (stream_axis == nullptr) { + // if the consumer is not stream parallelized, it means the expr need not + // be inside a stream for-loop + new_top_level_exprs.push_back(expr); + continue; + } + NVF_ERROR( + HostIrLower::isLoweredAsStandaloneHostOp(expr), + "Stream parallel type not supported for expr ", + expr); + // find the corresponding stream axis but in the Logical (and not Loop + // Domain) + auto it_logical_stream_axis = std::find( + output->getLogicalDomain().begin(), + output->getLogicalDomain().end(), + stream_axis); + // for now we do not support split/merge stream axis + NVF_ERROR( + it_logical_stream_axis != output->getLogicalDomain().end(), + "Cannot stream parallelize on a split/merge axis ", + stream_axis); + // we don't support reducing or broadcasting a stream axis + NVF_CHECK( + stream_axis->getIterType() == IterType::Iteration, + "Stream axis ", + stream_axis, + " should be an iteration axis."); + // check if the current expr can be merged with the previous stream for-loop + // We consider the previous expression to check whether the expr should + // create a new stream for-loop or be integrated into the previous one + if (!new_top_level_exprs.empty() && + new_top_level_exprs.back()->isA() && + id_model.idGraph(IdMappingMode::ALMOSTEXACT) + .disjointValSets() + .strictAreMapped( + stream_axis, + new_top_level_exprs.back()->as()->iterDomain())) { + // merge with previous for-loop + new_top_level_exprs.back()->as()->body().push_back(expr); + } else { + // create a new for-loop + 
auto* j = IrBuilder::create( + DataType::Index); // running index of the for-loop + auto* start = hic->zeroVal(); + auto* stop = stream_axis->extent(); + auto* step = hic->oneVal(); + auto* for_loop = IrBuilder::create( + stream_axis, + /*index=*/j, + start, + stop, + step, + /*vectorize=*/false, + /*vectorize_shift=*/nullptr, + /*unroll_required=*/false, + CircularBufferLoopStage::NotApplicable, + /*circular_buffer_loop_stage_depth=*/0); + for_loop->body().push_back(expr); + // replace the current expr by the for-loop containing it + new_top_level_exprs.push_back(for_loop); + } + } + + // Step 2. Setup each for loop's body by Slicing the tensors. + std::vector top_level_exprs = std::move(new_top_level_exprs); + new_top_level_exprs.clear(); + for (auto top_level_expr : top_level_exprs) { + // TODO: change in place? consr issue + if (!top_level_expr->isA()) { + new_top_level_exprs.push_back(top_level_expr); + continue; + } + auto* for_loop = top_level_expr->as(); + // this will contain the new body of the current for-loop + std::vector new_loop_body; + + std::vector current_loop_body = for_loop->body().exprs(); + for (auto it_expr = current_loop_body.begin(); + it_expr != current_loop_body.end(); + ++it_expr) { + Expr* expr = *it_expr; + for (auto* input : ir_utils::filterByType(expr->inputs())) { + int64_t input_stream_id_logical_index = -1; + for (auto id : input->getLoopDomain()) { + if (id_model.idGraph(IdMappingMode::ALMOSTEXACT) + .disjointValSets() + .strictAreMapped(for_loop->iterDomain(), id)) { + NVF_CHECK( + input_stream_id_logical_index == -1, + "Expected at most one axis mapping to the stream axis ", + for_loop->iterDomain(), + " in the tensor ", + input, + " loop's domain ", + input->getLoopDomain()); + auto it_input_stream_id_logical = std::find( + input->getLogicalDomain().begin(), + input->getLogicalDomain().end(), + id); + NVF_CHECK( + it_input_stream_id_logical != input->getLogicalDomain().end(), + "Expected to find ", + id, + " in ", + input, + 
"'s logical domain ", + input->getLogicalDomain()); + input_stream_id_logical_index = std::distance( + input->getLogicalDomain().begin(), it_input_stream_id_logical); + } + } + if (input_stream_id_logical_index == -1) { + continue; + } + TensorView* input_j = select( + input, + input_stream_id_logical_index, + for_loop->index(), + /*keep_reduction_axis=*/true); + new_loop_body.push_back(input_j->definition()); + for (auto it_running_expr = current_loop_body.begin(); + it_running_expr != current_loop_body.end(); + ++it_running_expr) { + Expr* running_expr = *it_running_expr; + for (auto* running_input : + ir_utils::filterByType(running_expr->inputs())) { + if (running_input == input) { + *it_running_expr = ir_utils::replaceValInExprInputs( + running_expr, input, input_j); + } + } + } + } + + for (auto* output : ir_utils::filterByType(expr->outputs())) { + int64_t output_stream_id_logical_index = -1; + for (auto id : output->getLoopDomain()) { + if (id_model.idGraph(IdMappingMode::ALMOSTEXACT) + .disjointValSets() + .strictAreMapped(for_loop->iterDomain(), id)) { + NVF_CHECK( + output_stream_id_logical_index == -1, + "Expected at most one axis mapping to the stream axis ", + for_loop->iterDomain(), + " in the tensor ", + output, + " loop's domain ", + output->getLoopDomain()); + auto it_output_stream_id_logical = std::find( + output->getLogicalDomain().begin(), + output->getLogicalDomain().end(), + id); + NVF_CHECK( + it_output_stream_id_logical != output->getLogicalDomain().end(), + "Expected to find ", + id, + " in ", + output, + "'s logical domain ", + output->getLogicalDomain()); + output_stream_id_logical_index = std::distance( + output->getLogicalDomain().begin(), + it_output_stream_id_logical); + } + } + if (output_stream_id_logical_index == -1) { + continue; + } + TensorView* output_j = select( + output, + output_stream_id_logical_index, + for_loop->index(), + /*keep_reduction_axis=*/true); + new_top_level_exprs.push_back( + IrBuilder::create(output, 
MemoryType::Global)); + new_loop_body.push_back(output_j->definition()); + for (auto it_running_expr = current_loop_body.begin(); + it_running_expr != current_loop_body.end(); + ++it_running_expr) { + Expr* running_expr = *it_running_expr; + for (auto* running_output : + ir_utils::filterByType(running_expr->outputs())) { + if (running_output == output) { + TensorView* output_j_alias = + ops::newValLike( + output_j, output_j->dtype(), /*keep_reduction_axis=*/true) + ->as(); + hic->markAlias(output_j, output_j_alias); + *it_running_expr = ir_utils::transferDefinitionToNewOutputs( + running_expr, {output_j_alias}); + if (Communication* comm = dynamic_cast( + output_j_alias->definition()); + comm && comm->type() == CommunicationType::Allgather) { + std::cout << "HERE, with expr:" << *it_running_expr + << std::endl; + } + } + } + } + } + new_loop_body.push_back(*it_expr); + } + // reseting the for-loop body + for_loop->body().clear(); + for (auto* expr : new_loop_body) { + for_loop->body().push_back(expr); + } + new_top_level_exprs.push_back(top_level_expr); + } + + // Step 3. 
Finalize the for-loop bodies by adding the stream setup and + // synchronization + for (auto* top_level_expr : new_top_level_exprs) { + if (!top_level_expr->isA()) { + continue; + } + auto* for_loop = top_level_expr->as(); + std::vector new_loop_body; + + // Get the current stream to later synchronize subsequent new streams + auto* get_current_stream = IrBuilder::create(); + hir::Stream* original_stream = get_current_stream->stream(); + new_loop_body.push_back(get_current_stream); + + // set the stream to the one corresponding to the current for-loop index + auto* j = for_loop->index(); + auto* number_of_streams = + IrBuilder::create("numberOfStreams", DataType::Int); + auto* stream_index = mod(j, number_of_streams); + auto* stream = IrBuilder::create(stream_index); + auto* set_stream = IrBuilder::create(stream); + new_loop_body.push_back(set_stream); + + // sync the new stream with the original stream + auto* initial_sync_stream = + IrBuilder::create(original_stream); + new_loop_body.push_back(initial_sync_stream); + + // add the actual exprs to the for-loop body + for (auto* expr : for_loop->body().exprs()) { + new_loop_body.push_back(expr); + } + + // set back the original stream + auto* set_back_original_stream = + IrBuilder::create(original_stream); + new_loop_body.push_back(set_back_original_stream); + // synchronize original stream with the for-loop's streams + auto* sync_stream = IrBuilder::create(stream); + new_loop_body.push_back(sync_stream); + + // reset the for-loop's body to the one we constructed. 
+ for_loop->body().clear(); + for (auto* expr : new_loop_body) { + for_loop->body().push_back(expr); + } + } + + // reset hic topLevelExprs to new_top_level_exprs + hic->resetTopLevelExprs(new_top_level_exprs); +} + +} // namespace nvfuser::preseg_passes diff --git a/csrc/preseg_passes/stream_parallel_type.h b/csrc/preseg_passes/stream_parallel_type.h new file mode 100644 index 00000000000..a9600809e21 --- /dev/null +++ b/csrc/preseg_passes/stream_parallel_type.h @@ -0,0 +1,26 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once + +#include +#include + +namespace nvfuser::preseg_passes { + +// A pass used in HostIrLower that takes a HostIrContainer as input, reads the TensorView's ParallelType::Stream, and modifies the HostIrContainer's top level expressions with the corresponding Host For Loops, whose bodies contain stream assignment, selecting on the tensor's axis, and the exprs on those sliced tensors. After this pass, the ParallelType::Stream is removed from the TensorView's axis. +class StreamParallelType : public OptimizationPass { + friend class OptimizationPass; + + protected: + static void runPass(Fusion* fusion); + static constexpr std::string_view name() { + return "StreamParallelType"; + } +}; + +} // namespace nvfuser::preseg_passes diff --git a/tests/cpp/test_host_ir_stream_lowering.cpp b/tests/cpp/test_host_ir_stream_lowering.cpp new file mode 100644 index 00000000000..9f3d9f432ed --- /dev/null +++ b/tests/cpp/test_host_ir_stream_lowering.cpp @@ -0,0 +1,823 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace nvfuser { + +namespace hir { + +using HirLowerStreamTest = NVFuserTest; + +TEST_F(HirLowerStreamTest, InputsAreNotStreamParallelized) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv = makeContigTensor(2); + hic->addInput(tv); + tv->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW(preseg_passes::OptimizationPass< + preseg_passes::StreamParallelType>::runPass(hic.get())); +} + +TEST_F(HirLowerStreamTest, Split) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + hic->addInput(tv0); + hic->addOutput(tv1); + hic->pushBackTopLevelExprs(tv1->definition()); + tv1->split(0, 2); + tv1->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW(preseg_passes::OptimizationPass< + preseg_passes::StreamParallelType>::runPass(hic.get())); +} + +TEST_F(HirLowerStreamTest, Merge) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + hic->addInput(tv0); + hic->addOutput(tv1); + hic->pushBackTopLevelExprs(tv1->definition()); + tv1->merge(0, 1); + tv1->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW(preseg_passes::OptimizationPass< + preseg_passes::StreamParallelType>::runPass(hic.get())); +} + +TEST_F(HirLowerStreamTest, SingleSetOp) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + hic->addInput(tv0); + hic->addOutput(tv1); + hic->pushBackTopLevelExprs(tv1->definition()); + tv0->setMemoryType(MemoryType::Global); + tv1->setMemoryType(MemoryType::Global); + tv1->axis(0)->parallelize(ParallelType::Stream); + + 
preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 2); + EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8}, options); + auto output = hie.runWithInput({{tv0, input}})[0].as(); + + torch::cuda::synchronize(); + EXPECT_TRUE(output.equal(input)) + << "Output: " << output << " Expected: " << input; +} + +TEST_F(HirLowerStreamTest, SingleSetOpNonOutermost) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + hic->addInput(tv0); + hic->addOutput(tv1); + hic->pushBackTopLevelExprs(tv1->definition()); + tv0->setMemoryType(MemoryType::Global); + tv1->setMemoryType(MemoryType::Global); + tv1->axis(1)->parallelize(ParallelType::Stream); + + preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 2); + EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8}, options); + auto output = hie.runWithInput({{tv0, input}})[0].as(); + + torch::cuda::synchronize(); + EXPECT_TRUE(output.equal(input)) + << "Output: " << output << " Expected: " << input; +} + +TEST_F(HirLowerStreamTest, SingleBinaryOp) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = makeContigTensor(2); + TensorView* tv2 = add(tv0, tv1); + hic->addInput(tv0); + hic->addInput(tv1); + hic->addOutput(tv2); + hic->pushBackTopLevelExprs(tv2->definition()); + tv0->setMemoryType(MemoryType::Global); + tv1->setMemoryType(MemoryType::Global); + tv2->setMemoryType(MemoryType::Global); + 
tv2->axis(0)->parallelize(ParallelType::Stream); + + preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 2); + EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor tv0_input = at::rand({4, 4}, options); + at::Tensor tv1_input = at::rand({4, 4}, options); + // std::unordered_map inputs = {{tv0, input}}; + auto output = hie.runWithInput({{tv0, tv0_input}, {tv1, tv1_input}})[0] + .as(); + auto expected_output = tv0_input + tv1_input; + EXPECT_TRUE(output.equal(expected_output)) + << "Output: " << output << "Expected: " << expected_output; +} + +TEST_F(HirLowerStreamTest, TwoSetOps) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + TensorView* tv2 = set(tv1); + hic->addInput(tv0); + hic->addOutput(tv2); + hic->pushBackTopLevelExprs(tv1->definition()); + hic->pushBackTopLevelExprs(tv2->definition()); + tv0->setMemoryType(MemoryType::Global); + tv1->setMemoryType(MemoryType::Global); + tv2->setMemoryType(MemoryType::Global); + tv1->axis(0)->parallelize(ParallelType::Stream); + tv2->axis(0)->parallelize(ParallelType::Stream); + + preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 3); + EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(2)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8}, options); + auto output = hie.runWithInput({{tv0, input}})[0].as(); + + torch::cuda::synchronize(); + EXPECT_TRUE(output.equal(input)) + << "Output: " << output << " Expected: " << input; +} + +TEST_F(HirLowerStreamTest, ThreeSetOpsWithDisjointsForLoops) { + auto hic = 
std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + TensorView* tv2 = set(tv1); + TensorView* tv3 = set(tv2); + hic->addInput(tv0); + hic->addOutput(tv3); + hic->pushBackTopLevelExprs(tv1->definition()); + hic->pushBackTopLevelExprs(tv2->definition()); + hic->pushBackTopLevelExprs(tv3->definition()); + tv0->setMemoryType(MemoryType::Global); + tv1->setMemoryType(MemoryType::Global); + tv2->setMemoryType(MemoryType::Global); + tv3->setMemoryType(MemoryType::Global); + tv1->axis(0)->parallelize(ParallelType::Stream); + tv3->axis(0)->parallelize(ParallelType::Stream); + + preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 5); + EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(2)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(3)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(4)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8}, options); + auto output = hie.runWithInput({{tv0, input}})[0].as(); + + torch::cuda::synchronize(); + EXPECT_TRUE(output.equal(input)) + << "Output: " << output << " Expected: " << input; +} + +TEST_F(HirLowerStreamTest, ReductionUnsupported) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = sum(tv0, {0}); + hic->addInput(tv0); + hic->addOutput(tv1); + hic->pushBackTopLevelExprs(tv1->definition()); + tv0->setMemoryType(MemoryType::Global); + tv1->setMemoryType(MemoryType::Global); + tv1->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW(preseg_passes::OptimizationPass< + preseg_passes::StreamParallelType>::runPass(hic.get())); +} + +TEST_F(HirLowerStreamTest, Reduction) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* tv0 = 
makeContigTensor(3); + TensorView* tv1 = sum(tv0, {2}); + hic->addInput(tv0); + hic->addOutput(tv1); + hic->pushBackTopLevelExprs(tv1->definition()); + tv0->setMemoryType(MemoryType::Global); + tv1->setMemoryType(MemoryType::Global); + tv1->axis(0)->parallelize(ParallelType::Stream); + + preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 2); + EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8, 2}, options); + auto output = hie.runWithInput({{tv0, input}})[0].as(); + + torch::cuda::synchronize(); + auto expected_output = input.sum(2); + EXPECT_TRUE(output.equal(expected_output)) + << "Output: " << output << " Expected: " << expected_output; +} + +TEST_F(HirLowerStreamTest, Matmul_M) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* a = makeContigTensor(2); + TensorView* b = makeContigTensor(2); + TensorView* c = matmul(a, b); + hic->addInput(a); + hic->addInput(b); + hic->addOutput(c); + hic->pushBackTopLevelExprs(c->definition()); + a->setMemoryType(MemoryType::Global); + b->setMemoryType(MemoryType::Global); + c->setMemoryType(MemoryType::Global); + c->axis(0)->parallelize(ParallelType::Stream); + + preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 2); + EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + constexpr int64_t M = 8, K = 4, N = 2; + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor a_aten = at::rand({M, K}, options); + at::Tensor b_aten = at::rand({K, N}, options); + auto output = + hie.runWithInput({{a, a_aten}, {b, b_aten}})[0].as(); + + torch::cuda::synchronize(); + auto expected_output = at::matmul(a_aten, b_aten); + 
EXPECT_TRUE(torch::allclose(output, expected_output, 1e-2, 1e-2)) + << "Output: " << output << " Expected: " << expected_output; +} + +TEST_F(HirLowerStreamTest, BatchedMatmul) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* a = makeContigTensor(3); + TensorView* b = makeContigTensor(2); + TensorView* c = matmul(a, b); + hic->addInput(a); + hic->addInput(b); + hic->addOutput(c); + hic->pushBackTopLevelExprs(c->definition()); + a->setMemoryType(MemoryType::Global); + b->setMemoryType(MemoryType::Global); + c->setMemoryType(MemoryType::Global); + c->axis(0)->parallelize(ParallelType::Stream); + + preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 2); + EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + constexpr int64_t B = 16, M = 8, K = 4, N = 2; + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor a_aten = at::rand({B, M, K}, options); + at::Tensor b_aten = at::rand({K, N}, options); + auto output = + hie.runWithInput({{a, a_aten}, {b, b_aten}})[0].as(); + + torch::cuda::synchronize(); + auto expected_output = at::matmul(a_aten, b_aten); + EXPECT_TRUE(torch::allclose(output, expected_output, 1e-2, 1e-2)) + << "Output: " << output << " Expected: " << expected_output; +} + +TEST_F(HirLowerStreamTest, Matmul_N) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* a = makeContigTensor(2); + TensorView* b = makeContigTensor(2); + TensorView* c = matmul(a, b); + hic->addInput(a); + hic->addInput(b); + hic->addOutput(c); + hic->pushBackTopLevelExprs(c->definition()); + a->setMemoryType(MemoryType::Global); + b->setMemoryType(MemoryType::Global); + c->setMemoryType(MemoryType::Global); + c->axis(1)->parallelize(ParallelType::Stream); + + preseg_passes::OptimizationPass::runPass( + hic.get()); + + EXPECT_EQ(hic->topLevelExprs().size(), 2); + 
EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(hic->topLevelExprs().at(1)->isA()); + + HostIrEvaluator hie(std::move(hic)); + + constexpr int64_t M = 8, K = 4, N = 2; + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor a_aten = at::rand({M, K}, options); + at::Tensor b_aten = at::rand({K, N}, options); + auto output = + hie.runWithInput({{a, a_aten}, {b, b_aten}})[0].as(); + + torch::cuda::synchronize(); + auto expected_output = at::matmul(a_aten, b_aten); + EXPECT_TRUE(torch::allclose(output, expected_output, 1e-2, 1e-2)) + << "Output: " << output << " Expected: " << expected_output; +} + +TEST_F(HirLowerStreamTest, Matmul_K) { + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + TensorView* a = makeContigTensor(2); + TensorView* b = makeContigTensor(2); + TensorView* c = matmul(a, b); + hic->addInput(a); + hic->addInput(b); + hic->addOutput(c); + hic->pushBackTopLevelExprs(c->definition()); + a->setMemoryType(MemoryType::Global); + b->setMemoryType(MemoryType::Global); + c->setMemoryType(MemoryType::Global); + c->axis(-1)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW(preseg_passes::OptimizationPass< + preseg_passes::StreamParallelType>::runPass(hic.get())); +} + +// We don's support PostOnStream because it does not support well pre-allocated +// outputs. 
There is no strong motivation to support PostOnStream +TEST_F(HirLowerStreamTest, DoNotSupportPostOnStream) { + const std::vector input_sizes = {4, 8, 32}; + const std::vector output_sizes = { + input_sizes.at(1), input_sizes.at(2)}; + + auto get_fusion = [input_sizes]() -> std::unique_ptr { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + auto tv0 = makeConcreteTensor(input_sizes); + auto tv1 = add(tv0, tv0); + auto tv2 = sum(tv1, {0}); + fusion->addInput(tv0); + fusion->addOutput(tv2); + return fusion; + }; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto host_unit = IrBuilder::create(get_fusion()); + + IrCloner ir_cloner(hic.get()); + TensorView* input = + ir_cloner.clone(host_unit->fusion_to_execute()->inputs().at(0)) + ->as(); + TensorView* output = + ir_cloner.clone(host_unit->fusion_to_execute()->outputs().at(0)) + ->as(); + + std::vector inputs = {input}; + std::vector outputs = {output}; + auto post_on_stream = + IrBuilder::create(host_unit, inputs, outputs); + + hic->pushBackTopLevelExprs(post_on_stream); + + hic->addInput(input); + hic->addOutput(output); + + output->axis(-1)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW(preseg_passes::OptimizationPass< + preseg_passes::StreamParallelType>::runPass(hic.get())); +} + +} // namespace hir + +using MultiDeviceExecutorLowerStreamTest = NVFuserTest; + +TEST_F(MultiDeviceExecutorLowerStreamTest, InputsAreNotStreamParallelized) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv = makeContigTensor(2); + fusion->addInput(tv); + tv->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW( + MultiDeviceExecutor(std::move(fusion), Communicator::getInstance())); +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, Split) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + fusion->addInput(tv0); + fusion->addOutput(tv1); + 
tv1->split(0, 2); + tv1->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW( + MultiDeviceExecutor(std::move(fusion), Communicator::getInstance())); +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, Merge) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + fusion->addInput(tv0); + fusion->addOutput(tv1); + tv1->merge(0, 1); + tv1->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW( + MultiDeviceExecutor(std::move(fusion), Communicator::getInstance())); +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, SingleSetOp) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + fusion->addInput(tv0); + fusion->addOutput(tv1); + tv1->axis(0)->parallelize(ParallelType::Stream); + + MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + EXPECT_EQ(container->topLevelExprs().size(), 2); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8}, options); + auto output = + executor.runWithInput(KernelArgumentHolder({input}))[0].as(); + + torch::cuda::synchronize(); + EXPECT_TRUE(output.equal(input)) + << "Output: " << output << " Expected: " << input; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, SingleSetOpNonOutermost) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + fusion->addInput(tv0); + fusion->addOutput(tv1); + tv1->axis(1)->parallelize(ParallelType::Stream); + + MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + 
EXPECT_EQ(container->topLevelExprs().size(), 2); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8}, options); + auto output = + executor.runWithInput(KernelArgumentHolder({input}))[0].as(); + + torch::cuda::synchronize(); + EXPECT_TRUE(output.equal(input)) + << "Output: " << output << " Expected: " << input; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, SingleBinaryOp) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = makeContigTensor(2); + TensorView* tv2 = add(tv0, tv1); + fusion->addInput(tv0); + fusion->addInput(tv1); + fusion->addOutput(tv2); + tv2->axis(0)->parallelize(ParallelType::Stream); + + MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + EXPECT_EQ(container->topLevelExprs().size(), 2); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + + at::Tensor tv0_input = at::rand({4, 4}, options); + at::Tensor tv1_input = at::rand({4, 4}, options); + auto output = + executor.runWithInput(KernelArgumentHolder({tv0_input, tv1_input}))[0] + .as(); + auto expected_output = tv0_input + tv1_input; + EXPECT_TRUE(output.equal(expected_output)) + << "Output: " << output << "Expected: " << expected_output; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, TwoSetOps) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + TensorView* tv2 = set(tv1); + fusion->addInput(tv0); + fusion->addOutput(tv2); + tv1->axis(0)->parallelize(ParallelType::Stream); + tv2->axis(0)->parallelize(ParallelType::Stream); + + 
MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + EXPECT_EQ(container->topLevelExprs().size(), 3); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(2)->isA()); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8}, options); + auto output = + executor.runWithInput(KernelArgumentHolder({input}))[0].as(); + + torch::cuda::synchronize(); + EXPECT_TRUE(output.equal(input)) + << "Output: " << output << " Expected: " << input; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, ThreeSetOpsWithDisjointsForLoops) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = set(tv0); + TensorView* tv2 = set(tv1); + TensorView* tv3 = set(tv2); + fusion->addInput(tv0); + fusion->addOutput(tv3); + tv1->axis(0)->parallelize(ParallelType::Stream); + tv3->axis(0)->parallelize(ParallelType::Stream); + + MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + EXPECT_EQ(container->topLevelExprs().size(), 5); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(2)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(3)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(4)->isA()); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8}, options); + auto output = + executor.runWithInput(KernelArgumentHolder({input}))[0].as(); + + torch::cuda::synchronize(); + EXPECT_TRUE(output.equal(input)) + << "Output: " << output << " Expected: " << input; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, ReductionUnsupported) { + 
auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = sum(tv0, {0}); + fusion->addInput(tv0); + fusion->addOutput(tv1); + tv1->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW( + MultiDeviceExecutor(std::move(fusion), Communicator::getInstance())); +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, Reduction) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(3); + TensorView* tv1 = sum(tv0, {2}); + fusion->addInput(tv0); + fusion->addOutput(tv1); + tv1->axis(0)->parallelize(ParallelType::Stream); + + MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + EXPECT_EQ(container->topLevelExprs().size(), 2); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor input = at::rand({4, 8, 2}, options); + auto output = + executor.runWithInput(KernelArgumentHolder({input}))[0].as(); + + torch::cuda::synchronize(); + auto expected_output = input.sum(2); + EXPECT_TRUE(output.equal(expected_output)) + << "Output: " << output << " Expected: " << expected_output; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, Matmul_M) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* a = makeContigTensor(2); + TensorView* b = makeContigTensor(2); + TensorView* c = matmul(a, b); + fusion->addInput(a); + fusion->addInput(b); + fusion->addOutput(c); + c->axis(0)->parallelize(ParallelType::Stream); + + MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + EXPECT_EQ(container->topLevelExprs().size(), 2); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + 
EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + + constexpr int64_t M = 8, K = 4, N = 2; + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor a_aten = at::rand({M, K}, options); + at::Tensor b_aten = at::rand({K, N}, options); + auto output = executor.runWithInput(KernelArgumentHolder({a_aten, b_aten}))[0] + .as(); + + torch::cuda::synchronize(); + auto expected_output = at::matmul(a_aten, b_aten); + EXPECT_TRUE(torch::allclose(output, expected_output, 1e-2, 1e-2)) + << "Output: " << output << " Expected: " << expected_output; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, BatchedMatmul) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* a = makeContigTensor(3); + TensorView* b = makeContigTensor(2); + TensorView* c = matmul(a, b); + fusion->addInput(a); + fusion->addInput(b); + fusion->addOutput(c); + c->axis(0)->parallelize(ParallelType::Stream); + + MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + EXPECT_EQ(container->topLevelExprs().size(), 2); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + + constexpr int64_t B = 16, M = 8, K = 4, N = 2; + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor a_aten = at::rand({B, M, K}, options); + at::Tensor b_aten = at::rand({K, N}, options); + auto output = executor.runWithInput(KernelArgumentHolder({a_aten, b_aten}))[0] + .as(); + + torch::cuda::synchronize(); + auto expected_output = at::matmul(a_aten, b_aten); + EXPECT_TRUE(torch::allclose(output, expected_output, 1e-2, 1e-2)) + << "Output: " << output << " Expected: " << expected_output; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, Matmul_N) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* a = makeContigTensor(2); + TensorView* b = makeContigTensor(2); + TensorView* c = matmul(a, 
b); + fusion->addInput(a); + fusion->addInput(b); + fusion->addOutput(c); + c->axis(1)->parallelize(ParallelType::Stream); + + MultiDeviceExecutor executor(std::move(fusion), Communicator::getInstance()); + + hir::HostIrContainer* container = executor.hostIrEvaluator()->container(); + EXPECT_EQ(container->topLevelExprs().size(), 2); + EXPECT_TRUE(container->topLevelExprs().at(0)->isA()); + EXPECT_TRUE(container->topLevelExprs().at(1)->isA()); + + constexpr int64_t M = 8, K = 4, N = 2; + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor a_aten = at::rand({M, K}, options); + at::Tensor b_aten = at::rand({K, N}, options); + auto output = executor.runWithInput(KernelArgumentHolder({a_aten, b_aten}))[0] + .as(); + + torch::cuda::synchronize(); + auto expected_output = at::matmul(a_aten, b_aten); + EXPECT_TRUE(torch::allclose(output, expected_output, 1e-2, 1e-2)) + << "Output: " << output << " Expected: " << expected_output; +} + +TEST_F(MultiDeviceExecutorLowerStreamTest, Matmul_K) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* a = makeContigTensor(2); + TensorView* b = makeContigTensor(2); + TensorView* c = matmul(a, b); + fusion->addInput(a); + fusion->addInput(b); + fusion->addOutput(c); + c->axis(-1)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW( + MultiDeviceExecutor(std::move(fusion), Communicator::getInstance())); +} + +// We only support Stream parallel type on ops that support pre-allocated +// output, which means they need a special handle in HostIrEvaluator and they +// need to be lowered as a Host Ir Op in the TopLevelExpression, no a +// PostOnStream(HostUnit(.)) See HostIrLower::isLoweredAsStandaloneHostOp and +// the test HirLowerStreamTest.DoNotSupportPostOnStream +TEST_F(MultiDeviceExecutorLowerStreamTest, DoNotSupportPostOnStream) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = + abs(tv0); // arbitrary 
example of an unsupported op. There is no deep + // reason why we not support it -- if needed we could widen the + // support. But I wanna make sure that an unsupported op do not + // silently fails + fusion->addInput(tv0); + fusion->addOutput(tv1); + tv1->axis(0)->parallelize(ParallelType::Stream); + + EXPECT_ANY_THROW( + MultiDeviceExecutor(std::move(fusion), Communicator::getInstance())); +} + +} // namespace nvfuser diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp index 88286d6e4c0..e9db27cfd7f 100644 --- a/tests/cpp/test_multidevice_host_ir.cpp +++ b/tests/cpp/test_multidevice_host_ir.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include namespace nvfuser { @@ -362,6 +363,11 @@ TEST_F(P2PCommHostIrTest, CoalescedRingPairwiseExchange) { using OverlapDistributedMatmulTest = MultiDeviceTest; TEST_F(OverlapDistributedMatmulTest, AG_matmul) { + // Disable StreamParallelType pass temporarily as proper stream lowering gets + // implemented + preseg_passes::OptimizationPassGuard guard( + false); + constexpr int64_t M = 32768; constexpr int64_t K = 32768; constexpr int64_t N = 1024; @@ -417,6 +423,10 @@ TEST_F(OverlapDistributedMatmulTest, AG_matmul) { } TEST_F(OverlapDistributedMatmulTest, AG_linear) { + // Disable StreamParallelType pass tempor + preseg_passes::OptimizationPassGuard guard( + false); + constexpr int64_t M = 32768; constexpr int64_t K = 32768; constexpr int64_t N = 1024; From e8869419dd152f5d6f71f505830e35b97cb9f274 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 06:29:12 -0700 Subject: [PATCH 18/38] improve comments --- csrc/preseg_passes/stream_parallel_type.cpp | 159 ++++++++++++-------- csrc/preseg_passes/stream_parallel_type.h | 11 +- 2 files changed, 109 insertions(+), 61 deletions(-) diff --git a/csrc/preseg_passes/stream_parallel_type.cpp b/csrc/preseg_passes/stream_parallel_type.cpp index 5a814c9a59a..12fe2f6a285 100644 --- a/csrc/preseg_passes/stream_parallel_type.cpp 
+++ b/csrc/preseg_passes/stream_parallel_type.cpp @@ -20,8 +20,9 @@ namespace nvfuser::preseg_passes { -// returns the first stream axis in the domain, or nullptr if there is none. -// Throws if two axis are stream parallelized +// Helper function to find the first stream-parallelized axis in a domain. +// This function throws if multiple stream-parallelized axes are found (only one +// is allowed) IterDomain* getStreamAxis(const std::vector& domain) { IterDomain* ret = nullptr; for (auto id : domain) { @@ -38,13 +39,27 @@ IterDomain* getStreamAxis(const std::vector& domain) { return ret; } - - -// TODO: ideally we should look at the dag and use the segmenter. Here we take -// advantage of the linear structure of HostIrContainer::topLevelExprs to -// greedily merge the adjacent compatible stream for-loop bodies +// StreamParallelType pass implementation. +// This pass handles stream parallelization of operations in a fusion. +// It works by: +// 1. Identifying stream-parallelized axes in tensor operations +// 2. Grouping compatible operations into stream-parallel for-loops +// 3. Setting up proper stream synchronization and management +// +// The pass ensures that: +// - Input tensors don't have stream axes +// - Only one stream axis exists per tensor +// - Stream axes are properly synchronized +// - Operations are correctly grouped into stream-parallel regions +// - The resulting HostIrContainer's top level expression is valid for execution +// and does not contain any stream axes +// +// TODO: Here, we assume that the fusion input is a HostIrContainer and use the +// linear structure of the HostIrContainer::topLevelExpr to greedily merge the +// adjacent compatible stream for-loop bodies. Ideally we should look at the dag +// and use the segmenter. 
void StreamParallelType::runPass(Fusion* fusion) { - // check that there are no stream axes in the inputs + // Verify that input tensors don't have stream axes NVF_CHECK( std::all_of( fusion->inputs().begin(), @@ -56,62 +71,71 @@ void StreamParallelType::runPass(Fusion* fusion) { }), "Expected no stream axis in the TensorView inputs."); - FusionGuard fg(fusion); // set as current container to register the newly - // created for-loops + // Set up the fusion environment and build the ID model + FusionGuard fg(fusion); hir::HostIrContainer* hic = dynamic_cast(fusion); NVF_CHECK(hic, "Expected HostIrContainer"); - // needed ? + IdModel id_model(fusion); id_model.buildAlmostExactGraph(); std::vector new_top_level_exprs; - // Step 1. Find the segments of expressions that can be merged into a single - // stream for-loop At the end of this step, new_top_level_exprs contains a - // list of expressions including newly created for-loops that will represent - // the stream parallelization, and the relevant expressions grouped inside the - // for-loops bodies. 
+ + // Step 1: Group expressions into stream-parallel regions + // This step identifies which expressions can be merged into single stream + // for-loops + // + // After this step, new_top_level_exprs contains a + // list of expressions including newly created for-loops representing + // the stream parallelization containing and the relevant expressions for (auto expr : hic->topLevelExprs()) { - // we only support exprs having at most 1 output for now + // Skip expressions with no outputs if (expr->outputs().size() == 0) { new_top_level_exprs.push_back(expr); continue; } + + // Verify single output constraint NVF_CHECK( expr->outputs().size() == 1, "Each expr should have at most one output."); + + // Get the output tensor and check for stream parallelization TensorView* output = expr->output(0)->as(); - // retrieves the Loop IterDomain that is stream parallelized, if any IterDomain* stream_axis = getStreamAxis(output->getLoopDomain()); + + // If no stream axis, keep expression as is if (stream_axis == nullptr) { - // if the consumer is not stream parallelized, it means the expr need not - // be inside a stream for-loop new_top_level_exprs.push_back(expr); continue; } + + // Verify expression can be handled as a standalone host operation NVF_ERROR( HostIrLower::isLoweredAsStandaloneHostOp(expr), "Stream parallel type not supported for expr ", expr); - // find the corresponding stream axis but in the Logical (and not Loop - // Domain) + + // Find the stream axis in the logical (and not loop) domain auto it_logical_stream_axis = std::find( output->getLogicalDomain().begin(), output->getLogicalDomain().end(), stream_axis); - // for now we do not support split/merge stream axis + + // Verify stream axis is not split/merged NVF_ERROR( it_logical_stream_axis != output->getLogicalDomain().end(), "Cannot stream parallelize on a split/merge axis ", stream_axis); - // we don't support reducing or broadcasting a stream axis + + // Verify stream axis is an iteration axis (not 
reduction/broadcast) NVF_CHECK( stream_axis->getIterType() == IterType::Iteration, "Stream axis ", stream_axis, " should be an iteration axis."); - // check if the current expr can be merged with the previous stream for-loop - // We consider the previous expression to check whether the expr should - // create a new stream for-loop or be integrated into the previous one + + // Check if expression can be merged with previous stream for-loop if (!new_top_level_exprs.empty() && new_top_level_exprs.back()->isA() && id_model.idGraph(IdMappingMode::ALMOSTEXACT) @@ -119,21 +143,16 @@ void StreamParallelType::runPass(Fusion* fusion) { .strictAreMapped( stream_axis, new_top_level_exprs.back()->as()->iterDomain())) { - // merge with previous for-loop + // Merge with existing for-loop new_top_level_exprs.back()->as()->body().push_back(expr); } else { - // create a new for-loop - auto* j = IrBuilder::create( - DataType::Index); // running index of the for-loop - auto* start = hic->zeroVal(); - auto* stop = stream_axis->extent(); - auto* step = hic->oneVal(); + // Create new for-loop for stream parallelization auto* for_loop = IrBuilder::create( stream_axis, - /*index=*/j, - start, - stop, - step, + /*index=*/IrBuilder::create(DataType::Index), + /*start=*/hic->zeroVal(), + /*stop=*/stream_axis->extent(), + /*step=*/hic->oneVal(), /*vectorize=*/false, /*vectorize_shift=*/nullptr, /*unroll_required=*/false, @@ -145,30 +164,36 @@ void StreamParallelType::runPass(Fusion* fusion) { } } - // Step 2. Setup each for loop's body by Slicing the tensors. + // Step 2: Process each for-loop's body by slicing tensors + // This step handles the actual tensor slicing for stream parallelization std::vector top_level_exprs = std::move(new_top_level_exprs); new_top_level_exprs.clear(); + for (auto top_level_expr : top_level_exprs) { - // TODO: change in place? 
consr issue if (!top_level_expr->isA()) { new_top_level_exprs.push_back(top_level_expr); continue; } + auto* for_loop = top_level_expr->as(); - // this will contain the new body of the current for-loop std::vector new_loop_body; + // Process each expression in the loop body std::vector current_loop_body = for_loop->body().exprs(); for (auto it_expr = current_loop_body.begin(); it_expr != current_loop_body.end(); ++it_expr) { Expr* expr = *it_expr; + + // Process input tensors for (auto* input : ir_utils::filterByType(expr->inputs())) { + // Find stream axis index in input tensor int64_t input_stream_id_logical_index = -1; for (auto id : input->getLoopDomain()) { if (id_model.idGraph(IdMappingMode::ALMOSTEXACT) .disjointValSets() .strictAreMapped(for_loop->iterDomain(), id)) { + // Verify only one stream axis exists NVF_CHECK( input_stream_id_logical_index == -1, "Expected at most one axis mapping to the stream axis ", @@ -177,6 +202,8 @@ void StreamParallelType::runPass(Fusion* fusion) { input, " loop's domain ", input->getLoopDomain()); + + // Find stream axis in logical domain auto it_input_stream_id_logical = std::find( input->getLogicalDomain().begin(), input->getLogicalDomain().end(), @@ -193,15 +220,21 @@ void StreamParallelType::runPass(Fusion* fusion) { input->getLogicalDomain().begin(), it_input_stream_id_logical); } } + + // Skip if no stream axis found if (input_stream_id_logical_index == -1) { continue; } + + // Create sliced tensor for current stream iteration TensorView* input_j = select( input, input_stream_id_logical_index, for_loop->index(), /*keep_reduction_axis=*/true); new_loop_body.push_back(input_j->definition()); + + // Update all expressions using this input for (auto it_running_expr = current_loop_body.begin(); it_running_expr != current_loop_body.end(); ++it_running_expr) { @@ -216,12 +249,15 @@ void StreamParallelType::runPass(Fusion* fusion) { } } + // Process output tensors for (auto* output : ir_utils::filterByType(expr->outputs())) { 
+ // Find stream axis index in output tensor int64_t output_stream_id_logical_index = -1; for (auto id : output->getLoopDomain()) { if (id_model.idGraph(IdMappingMode::ALMOSTEXACT) .disjointValSets() .strictAreMapped(for_loop->iterDomain(), id)) { + // Verify only one stream axis exists NVF_CHECK( output_stream_id_logical_index == -1, "Expected at most one axis mapping to the stream axis ", @@ -230,6 +266,8 @@ void StreamParallelType::runPass(Fusion* fusion) { output, " loop's domain ", output->getLoopDomain()); + + // Find stream axis in logical domain auto it_output_stream_id_logical = std::find( output->getLogicalDomain().begin(), output->getLogicalDomain().end(), @@ -247,17 +285,25 @@ void StreamParallelType::runPass(Fusion* fusion) { it_output_stream_id_logical); } } + + // Skip if no stream axis found if (output_stream_id_logical_index == -1) { continue; } + + // Create sliced tensor for current stream iteration TensorView* output_j = select( output, output_stream_id_logical_index, for_loop->index(), /*keep_reduction_axis=*/true); + + // Allocate memory for the output tensor new_top_level_exprs.push_back( IrBuilder::create(output, MemoryType::Global)); new_loop_body.push_back(output_j->definition()); + + // Update all expressions using this output for (auto it_running_expr = current_loop_body.begin(); it_running_expr != current_loop_body.end(); ++it_running_expr) { @@ -265,6 +311,7 @@ void StreamParallelType::runPass(Fusion* fusion) { for (auto* running_output : ir_utils::filterByType(running_expr->outputs())) { if (running_output == output) { + // Create alias for the sliced output TensorView* output_j_alias = ops::newValLike( output_j, output_j->dtype(), /*keep_reduction_axis=*/true) @@ -272,19 +319,14 @@ void StreamParallelType::runPass(Fusion* fusion) { hic->markAlias(output_j, output_j_alias); *it_running_expr = ir_utils::transferDefinitionToNewOutputs( running_expr, {output_j_alias}); - if (Communication* comm = dynamic_cast( - 
output_j_alias->definition()); - comm && comm->type() == CommunicationType::Allgather) { - std::cout << "HERE, with expr:" << *it_running_expr - << std::endl; - } } } } } new_loop_body.push_back(*it_expr); } - // reseting the for-loop body + + // Update for-loop body with processed expressions for_loop->body().clear(); for (auto* expr : new_loop_body) { for_loop->body().push_back(expr); @@ -292,8 +334,7 @@ void StreamParallelType::runPass(Fusion* fusion) { new_top_level_exprs.push_back(top_level_expr); } - // Step 3. Finalize the for-loop bodies by adding the stream setup and - // synchronization + // Step 3: Add stream management and synchronization for (auto* top_level_expr : new_top_level_exprs) { if (!top_level_expr->isA()) { continue; @@ -301,46 +342,44 @@ void StreamParallelType::runPass(Fusion* fusion) { auto* for_loop = top_level_expr->as(); std::vector new_loop_body; - // Get the current stream to later synchronize subsequent new streams + // Get current stream for later synchronization auto* get_current_stream = IrBuilder::create(); hir::Stream* original_stream = get_current_stream->stream(); new_loop_body.push_back(get_current_stream); - // set the stream to the one corresponding to the current for-loop index - auto* j = for_loop->index(); + // Set up stream for current iteration auto* number_of_streams = IrBuilder::create("numberOfStreams", DataType::Int); - auto* stream_index = mod(j, number_of_streams); + auto* stream_index = mod(for_loop->index(), number_of_streams); auto* stream = IrBuilder::create(stream_index); auto* set_stream = IrBuilder::create(stream); new_loop_body.push_back(set_stream); - // sync the new stream with the original stream + // Synchronize with original stream auto* initial_sync_stream = IrBuilder::create(original_stream); new_loop_body.push_back(initial_sync_stream); - // add the actual exprs to the for-loop body + // Add the actual computation expressions for (auto* expr : for_loop->body().exprs()) { 
new_loop_body.push_back(expr); } - // set back the original stream + // Restore original stream and synchronize auto* set_back_original_stream = IrBuilder::create(original_stream); new_loop_body.push_back(set_back_original_stream); - // synchronize original stream with the for-loop's streams auto* sync_stream = IrBuilder::create(stream); new_loop_body.push_back(sync_stream); - // reset the for-loop's body to the one we constructed. + // Update for-loop body with stream management for_loop->body().clear(); for (auto* expr : new_loop_body) { for_loop->body().push_back(expr); } } - // reset hic topLevelExprs to new_top_level_exprs + // Update the container's top-level expressions hic->resetTopLevelExprs(new_top_level_exprs); } diff --git a/csrc/preseg_passes/stream_parallel_type.h b/csrc/preseg_passes/stream_parallel_type.h index a9600809e21..9c0c39efe87 100644 --- a/csrc/preseg_passes/stream_parallel_type.h +++ b/csrc/preseg_passes/stream_parallel_type.h @@ -12,7 +12,16 @@ namespace nvfuser::preseg_passes { -// A pass used in HostIrLower that takes a HostIrContainer as input, reads the TensorView's ParallelType::Stream, and modify the the HostIrContainer's top level expressions with the corresponding Host For Loops, which bodies contain stream assignement, selecting on tensor's axis, and the exprs on those sliced tensors. After this pass, the ParallelType::Stream is removed from the TensorView's axis. +// A pass used in HostIrLower that takes a HostIrContainer as input, reads the +// TensorView's ParallelType::Stream, and modify the the HostIrContainer's top +// level expressions with the corresponding Host For Loops, which bodies contain +// stream assignement, selecting on tensor's axis, and the exprs on those sliced +// tensors. After this pass, the ParallelType::Stream is removed from the +// TensorView's axis. +// +// An illustration of the pass can be found in the tests +// `test_host_ir_stream_lowering.cpp` +// with the option `NVFUSER_DUMP=host_ir`. 
class StreamParallelType : public OptimizationPass { friend class OptimizationPass; From b6c54f2e47246535f6b2ea915bf236dabe54f087 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 16 Apr 2025 03:48:24 -0700 Subject: [PATCH 19/38] fix rebase --- csrc/preseg_passes/stream_parallel_type.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/preseg_passes/stream_parallel_type.cpp b/csrc/preseg_passes/stream_parallel_type.cpp index 12fe2f6a285..82f1b3d0e67 100644 --- a/csrc/preseg_passes/stream_parallel_type.cpp +++ b/csrc/preseg_passes/stream_parallel_type.cpp @@ -112,7 +112,7 @@ void StreamParallelType::runPass(Fusion* fusion) { // Verify expression can be handled as a standalone host operation NVF_ERROR( - HostIrLower::isLoweredAsStandaloneHostOp(expr), + HostIrLower::isLowerableAsStandaloneHostOp(expr), "Stream parallel type not supported for expr ", expr); From 32a8d552befc8deb4f4fff1deb3d88558bb88074 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 16 Apr 2025 08:23:11 -0700 Subject: [PATCH 20/38] temporarily disable stream pass also in the python test --- csrc/python_frontend/fusion_definition.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/csrc/python_frontend/fusion_definition.cpp b/csrc/python_frontend/fusion_definition.cpp index c48abc9dbdc..ad7b8baf2d6 100644 --- a/csrc/python_frontend/fusion_definition.cpp +++ b/csrc/python_frontend/fusion_definition.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -452,6 +453,10 @@ std::pair> FusionDefinition:: if (scheds->multi_device_executor == nullptr) { MultiDeviceExecutorParams params; params.lower.communicator_backend = backend_type_; + // Disable StreamParallelType pass temporarily as proper stream lowering gets + // implemented + preseg_passes::OptimizationPassGuard guard( + false); scheds->multi_device_executor = std::make_unique( std::make_unique(*scheds->preschedFusion()), Communicator::getInstance(), From 
afbd020b9e2e6b9461bb335532d44bb884ebc533 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 16 Apr 2025 08:38:34 -0700 Subject: [PATCH 21/38] lint --- csrc/python_frontend/fusion_definition.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/csrc/python_frontend/fusion_definition.cpp b/csrc/python_frontend/fusion_definition.cpp index ad7b8baf2d6..950b2bd148a 100644 --- a/csrc/python_frontend/fusion_definition.cpp +++ b/csrc/python_frontend/fusion_definition.cpp @@ -453,10 +453,10 @@ std::pair> FusionDefinition:: if (scheds->multi_device_executor == nullptr) { MultiDeviceExecutorParams params; params.lower.communicator_backend = backend_type_; - // Disable StreamParallelType pass temporarily as proper stream lowering gets - // implemented - preseg_passes::OptimizationPassGuard guard( - false); + // Disable StreamParallelType pass temporarily as proper stream lowering + // gets implemented + preseg_passes::OptimizationPassGuard + guard(false); scheds->multi_device_executor = std::make_unique( std::make_unique(*scheds->preschedFusion()), Communicator::getInstance(), From 165bd1bab236119a85e9e4dd5843424887960470 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 16 Apr 2025 08:57:55 -0700 Subject: [PATCH 22/38] move stream_parallel_type to host_ir/pass folder --- CMakeLists.txt | 2 +- csrc/host_ir/lower.cpp | 2 +- csrc/{preseg_passes => host_ir/pass}/stream_parallel_type.cpp | 2 +- csrc/{preseg_passes => host_ir/pass}/stream_parallel_type.h | 0 csrc/python_frontend/fusion_definition.cpp | 2 +- tests/cpp/test_host_ir_stream_lowering.cpp | 2 +- tests/cpp/test_multidevice_host_ir.cpp | 2 +- 7 files changed, 6 insertions(+), 6 deletions(-) rename csrc/{preseg_passes => host_ir/pass}/stream_parallel_type.cpp (99%) rename csrc/{preseg_passes => host_ir/pass}/stream_parallel_type.h (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 96a447055ea..dcf94d4a3a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -212,7 +212,7 @@ list(APPEND 
NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/preseg_passes/remove_empty.cpp ${NVFUSER_SRCS_DIR}/preseg_passes/reorder_sharded_axis.cpp ${NVFUSER_SRCS_DIR}/preseg_passes/segment_inplace_update.cpp - ${NVFUSER_SRCS_DIR}/preseg_passes/stream_parallel_type.cpp + ${NVFUSER_SRCS_DIR}/host_ir/pass/stream_parallel_type.cpp ${NVFUSER_SRCS_DIR}/preseg_passes/translate_no_reduction_matmul_to_mul_squeeze.cpp ${NVFUSER_SRCS_DIR}/preseg_passes/translate_repeat_to_expand.cpp ${NVFUSER_SRCS_DIR}/rng.cpp diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 1a74d9a9f01..c36fae09e0a 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -7,6 +7,7 @@ // clang-format on #include #include +#include #include #include #include @@ -19,7 +20,6 @@ #include #include #include -#include #include #include diff --git a/csrc/preseg_passes/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp similarity index 99% rename from csrc/preseg_passes/stream_parallel_type.cpp rename to csrc/host_ir/pass/stream_parallel_type.cpp index 82f1b3d0e67..f1419e3c626 100644 --- a/csrc/preseg_passes/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -16,7 +17,6 @@ #include #include #include -#include namespace nvfuser::preseg_passes { diff --git a/csrc/preseg_passes/stream_parallel_type.h b/csrc/host_ir/pass/stream_parallel_type.h similarity index 100% rename from csrc/preseg_passes/stream_parallel_type.h rename to csrc/host_ir/pass/stream_parallel_type.h diff --git a/csrc/python_frontend/fusion_definition.cpp b/csrc/python_frontend/fusion_definition.cpp index 950b2bd148a..d6e552032b1 100644 --- a/csrc/python_frontend/fusion_definition.cpp +++ b/csrc/python_frontend/fusion_definition.cpp @@ -7,11 +7,11 @@ // clang-format on #include #include +#include #include #include #include #include -#include #include #include #include diff --git a/tests/cpp/test_host_ir_stream_lowering.cpp 
b/tests/cpp/test_host_ir_stream_lowering.cpp index 9f3d9f432ed..f6d74caea87 100644 --- a/tests/cpp/test_host_ir_stream_lowering.cpp +++ b/tests/cpp/test_host_ir_stream_lowering.cpp @@ -11,12 +11,12 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp index e9db27cfd7f..7b233bc47db 100644 --- a/tests/cpp/test_multidevice_host_ir.cpp +++ b/tests/cpp/test_multidevice_host_ir.cpp @@ -9,9 +9,9 @@ #include #include #include +#include #include #include -#include #include namespace nvfuser { From b55d4e73787f3ca3b841ea2baf769021a85e1199 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 18 Apr 2025 02:35:58 -0700 Subject: [PATCH 23/38] minor comment --- csrc/host_ir/executor.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index b973c7f3277..12cf344e549 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -703,8 +703,7 @@ void HostIrEvaluator::handle(BinaryOp* binary_op) { at::div_out(output, lhs, rhs); break; default: - NVF_CHECK( - false, + NVF_THROW( "Unexpected operator type: ", binary_op->getBinaryOpType(), " in ", @@ -744,8 +743,7 @@ void HostIrEvaluator::handle(ReductionOp* reduction_op) { at::amin_out(output, input, reduction_axes); return; default: - NVF_CHECK( - false, + NVF_THROW( "Unexpected operator type: ", reduction_op->getReductionOpType(), " in ", From 1181eac5f93fdc1ff91a9f6182246950e2825008 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 18 Apr 2025 04:41:13 -0700 Subject: [PATCH 24/38] minor improvements and cleanup --- csrc/host_ir/pass/stream_parallel_type.cpp | 143 ++++++++++++--------- csrc/preseg_passes/optimization_pass.h | 2 - csrc/type.cpp | 2 +- 3 files changed, 84 insertions(+), 63 deletions(-) diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp 
b/csrc/host_ir/pass/stream_parallel_type.cpp index f1419e3c626..e72fb2ac5ee 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -39,55 +39,12 @@ IterDomain* getStreamAxis(const std::vector& domain) { return ret; } -// StreamParallelType pass implementation. -// This pass handles stream parallelization of operations in a fusion. -// It works by: -// 1. Identifying stream-parallelized axes in tensor operations -// 2. Grouping compatible operations into stream-parallel for-loops -// 3. Setting up proper stream synchronization and management -// -// The pass ensures that: -// - Input tensors don't have stream axes -// - Only one stream axis exists per tensor -// - Stream axes are properly synchronized -// - Operations are correctly grouped into stream-parallel regions -// - The resulting HostIrContainer's top level expression is valid for execution -// and does not contain any stream axes -// -// TODO: Here, we assume that the fusion input is a HostIrContainer and use the -// linear structure of the HostIrContainer::topLevelExpr to greedily merge the -// adjacent compatible stream for-loop bodies. Ideally we should look at the dag -// and use the segmenter. 
-void StreamParallelType::runPass(Fusion* fusion) { - // Verify that input tensors don't have stream axes - NVF_CHECK( - std::all_of( - fusion->inputs().begin(), - fusion->inputs().end(), - [](Val* input) { - auto input_tv = dynamic_cast(input); - return input_tv == nullptr || - getStreamAxis(input_tv->getLoopDomain()) == nullptr; - }), - "Expected no stream axis in the TensorView inputs."); - - // Set up the fusion environment and build the ID model - FusionGuard fg(fusion); - hir::HostIrContainer* hic = dynamic_cast(fusion); - NVF_CHECK(hic, "Expected HostIrContainer"); - - IdModel id_model(fusion); - id_model.buildAlmostExactGraph(); - +// Step 1: Group expressions into stream-parallel regions +std::vector groupStreamParallelRegions( + hir::HostIrContainer* hic, + const IdModel& id_model) { std::vector new_top_level_exprs; - // Step 1: Group expressions into stream-parallel regions - // This step identifies which expressions can be merged into single stream - // for-loops - // - // After this step, new_top_level_exprs contains a - // list of expressions including newly created for-loops representing - // the stream parallelization containing and the relevant expressions for (auto expr : hic->topLevelExprs()) { // Skip expressions with no outputs if (expr->outputs().size() == 0) { @@ -130,15 +87,16 @@ void StreamParallelType::runPass(Fusion* fusion) { // Verify stream axis is an iteration axis (not reduction/broadcast) NVF_CHECK( - stream_axis->getIterType() == IterType::Iteration, + stream_axis->getIterType() == IterType::Iteration || + stream_axis->getIterType() == IterType::Broadcast, "Stream axis ", stream_axis, - " should be an iteration axis."); + " should be an iteration or broadcast axis."); // Check if expression can be merged with previous stream for-loop if (!new_top_level_exprs.empty() && new_top_level_exprs.back()->isA() && - id_model.idGraph(IdMappingMode::ALMOSTEXACT) + id_model.idGraph(IdMappingMode::BROADCAST) .disjointValSets() .strictAreMapped( 
stream_axis, @@ -149,7 +107,7 @@ void StreamParallelType::runPass(Fusion* fusion) { // Create new for-loop for stream parallelization auto* for_loop = IrBuilder::create( stream_axis, - /*index=*/IrBuilder::create(DataType::Index), + /*index=*/NamedScalar::getParallelIndex(ParallelType::Stream), /*start=*/hic->zeroVal(), /*stop=*/stream_axis->extent(), /*step=*/hic->oneVal(), @@ -164,10 +122,15 @@ void StreamParallelType::runPass(Fusion* fusion) { } } - // Step 2: Process each for-loop's body by slicing tensors - // This step handles the actual tensor slicing for stream parallelization - std::vector top_level_exprs = std::move(new_top_level_exprs); - new_top_level_exprs.clear(); + return new_top_level_exprs; +} + +// Step 2: Process for-loop bodies by slicing tensors +std::vector processForLoopBodies( + hir::HostIrContainer* hic, + const IdModel& id_model, + std::vector top_level_exprs) { + std::vector new_top_level_exprs; for (auto top_level_expr : top_level_exprs) { if (!top_level_expr->isA()) { @@ -190,7 +153,7 @@ void StreamParallelType::runPass(Fusion* fusion) { // Find stream axis index in input tensor int64_t input_stream_id_logical_index = -1; for (auto id : input->getLoopDomain()) { - if (id_model.idGraph(IdMappingMode::ALMOSTEXACT) + if (id_model.idGraph(IdMappingMode::BROADCAST) .disjointValSets() .strictAreMapped(for_loop->iterDomain(), id)) { // Verify only one stream axis exists @@ -254,7 +217,7 @@ void StreamParallelType::runPass(Fusion* fusion) { // Find stream axis index in output tensor int64_t output_stream_id_logical_index = -1; for (auto id : output->getLoopDomain()) { - if (id_model.idGraph(IdMappingMode::ALMOSTEXACT) + if (id_model.idGraph(IdMappingMode::BROADCAST) .disjointValSets() .strictAreMapped(for_loop->iterDomain(), id)) { // Verify only one stream axis exists @@ -334,9 +297,16 @@ void StreamParallelType::runPass(Fusion* fusion) { new_top_level_exprs.push_back(top_level_expr); } - // Step 3: Add stream management and synchronization - 
for (auto* top_level_expr : new_top_level_exprs) { + return new_top_level_exprs; +} + +// Step 3: Add stream management and synchronization +std::vector addStreamManagement(std::vector top_level_exprs) { + std::vector new_top_level_exprs; + + for (auto* top_level_expr : top_level_exprs) { if (!top_level_expr->isA()) { + new_top_level_exprs.push_back(top_level_expr); continue; } auto* for_loop = top_level_expr->as(); @@ -377,10 +347,63 @@ void StreamParallelType::runPass(Fusion* fusion) { for (auto* expr : new_loop_body) { for_loop->body().push_back(expr); } + new_top_level_exprs.push_back(top_level_expr); } + return new_top_level_exprs; +} + +// StreamParallelType pass implementation. +// This pass handles stream parallelization of operations in a fusion. +// It works by: +// 1. Identifying stream-parallelized axes in tensor operations +// 2. Grouping compatible operations into stream-parallel for-loops +// 3. Setting up proper stream synchronization and management +// +// The pass ensures that: +// - Input tensors don't have stream axes +// - Only one stream axis exists per tensor +// - Stream axes are properly synchronized +// - Operations are correctly grouped into stream-parallel regions +// - The resulting HostIrContainer's top level expression is valid for execution +// and does not contain any stream axes +// +// TODO: Here, we assume that the fusion input is a HostIrContainer and use the +// linear structure of the HostIrContainer::topLevelExpr to greedily merge the +// adjacent compatible stream for-loop bodies. Ideally we should look at the dag +// and use the segmenter. 
+void StreamParallelType::runPass(Fusion* fusion) { + // Verify that input tensors don't have stream axes + NVF_CHECK( + std::all_of( + fusion->inputs().begin(), + fusion->inputs().end(), + [](Val* input) { + auto input_tv = dynamic_cast(input); + return input_tv == nullptr || + getStreamAxis(input_tv->getLoopDomain()) == nullptr; + }), + "Expected no stream axis in the TensorView inputs."); + + // Set up the fusion environment and build the ID model + FusionGuard fg(fusion); + hir::HostIrContainer* hic = dynamic_cast(fusion); + NVF_CHECK(hic, "Expected HostIrContainer"); + + IdModel id_model(fusion); + id_model.buildBroadcastGraph(); + + // Step 1: Group expressions into stream-parallel regions + std::vector top_level_exprs = groupStreamParallelRegions(hic, id_model); + + // Step 2: Process for-loop bodies by slicing tensors + top_level_exprs = processForLoopBodies(hic, id_model, std::move(top_level_exprs)); + + // Step 3: Add stream management and synchronization + top_level_exprs = addStreamManagement(std::move(top_level_exprs)); + // Update the container's top-level expressions - hic->resetTopLevelExprs(new_top_level_exprs); + hic->resetTopLevelExprs(top_level_exprs); } } // namespace nvfuser::preseg_passes diff --git a/csrc/preseg_passes/optimization_pass.h b/csrc/preseg_passes/optimization_pass.h index 53d8a8acd3c..359a4a42742 100644 --- a/csrc/preseg_passes/optimization_pass.h +++ b/csrc/preseg_passes/optimization_pass.h @@ -18,8 +18,6 @@ namespace nvfuser::preseg_passes { -using FusionPass = std::function; - //! [experimental API] //! Base class to unify optimization pass APIs. //! 
OptimizationPass can be turned on/off programmatically with the `setEnabled` diff --git a/csrc/type.cpp b/csrc/type.cpp index d1a5b2abd80..e4a89372d52 100644 --- a/csrc/type.cpp +++ b/csrc/type.cpp @@ -729,7 +729,7 @@ static const char* parallel_type2string(ParallelType t) { case ParallelType::TIDx: return "threadIdx.x"; case ParallelType::Stream: - return "Stream"; + return "StreamIdx"; case ParallelType::Vectorize: return "V"; case ParallelType::Unroll: From cad9bce67e39678d3a972bdaa8098e16d84f0206 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 18 Apr 2025 05:17:31 -0700 Subject: [PATCH 25/38] further refactor of stream pass --- csrc/host_ir/pass/stream_parallel_type.cpp | 192 ++++++++++----------- 1 file changed, 91 insertions(+), 101 deletions(-) diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp index e72fb2ac5ee..d40c9cff147 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -20,9 +20,8 @@ namespace nvfuser::preseg_passes { -// Helper function to find the first stream-parallelized axis in a domain. 
-// This function throws if multiple stream-parallelized axes are found (only one -// is allowed) +namespace { + IterDomain* getStreamAxis(const std::vector& domain) { IterDomain* ret = nullptr; for (auto id : domain) { @@ -39,6 +38,83 @@ IterDomain* getStreamAxis(const std::vector& domain) { return ret; } +void validateStreamAxis(IterDomain* stream_axis, const TensorView* tv) { + // Find the stream axis in the logical domain + auto it_logical_stream_axis = std::find( + tv->getLogicalDomain().begin(), + tv->getLogicalDomain().end(), + stream_axis); + + // Verify stream axis is not split/merged + NVF_ERROR( + it_logical_stream_axis != tv->getLogicalDomain().end(), + "Cannot stream parallelize on a split/merge axis ", + stream_axis); + + // Verify stream axis is an iteration or broadcast axis + NVF_CHECK( + stream_axis->getIterType() == IterType::Iteration || + stream_axis->getIterType() == IterType::Broadcast, + "Stream axis ", + stream_axis, + " should be an iteration or broadcast axis."); +} + +bool areIdsMapped(const IdModel& id_model, IterDomain* id1, IterDomain* id2) { + return id_model.idGraph(IdMappingMode::BROADCAST) + .disjointValSets() + .strictAreMapped(id1, id2); +} + +bool canMergeWithPreviousForLoop( + const std::vector& new_top_level_exprs, + IterDomain* stream_axis, + const IdModel& id_model) { + return !new_top_level_exprs.empty() && + new_top_level_exprs.back()->isA() && + areIdsMapped( + id_model, + stream_axis, + new_top_level_exprs.back()->as()->iterDomain()); +} + +int64_t findStreamAxisIndex( + const TensorView* tv, + IterDomain* stream_axis, + const IdModel& id_model) { + int64_t stream_id_logical_index = -1; + for (auto id : tv->getLoopDomain()) { + if (areIdsMapped(id_model, stream_axis, id)) { + // Verify only one stream axis exists + NVF_CHECK( + stream_id_logical_index == -1, + "Expected at most one axis mapping to the stream axis ", + stream_axis, + " in the tensor ", + tv, + " loop's domain ", + tv->getLoopDomain()); + + // Find stream 
axis in logical domain + auto it_stream_id_logical = std::find( + tv->getLogicalDomain().begin(), + tv->getLogicalDomain().end(), + id); + NVF_CHECK( + it_stream_id_logical != tv->getLogicalDomain().end(), + "Expected to find ", + id, + " in ", + tv, + "'s logical domain ", + tv->getLogicalDomain()); + stream_id_logical_index = std::distance( + tv->getLogicalDomain().begin(), it_stream_id_logical); + } + } + return stream_id_logical_index; +} + // Step 1: Group expressions into stream-parallel regions std::vector groupStreamParallelRegions( hir::HostIrContainer* hic, @@ -73,34 +149,11 @@ std::vector groupStreamParallelRegions( "Stream parallel type not supported for expr ", expr); - // Find the stream axis in the logical (and not loop) domain - auto it_logical_stream_axis = std::find( - output->getLogicalDomain().begin(), - output->getLogicalDomain().end(), - stream_axis); - - // Verify stream axis is not split/merged - NVF_ERROR( - it_logical_stream_axis != output->getLogicalDomain().end(), - "Cannot stream parallelize on a split/merge axis ", - stream_axis); - - // Verify stream axis is an iteration axis (not reduction/broadcast) - NVF_CHECK( - stream_axis->getIterType() == IterType::Iteration || - stream_axis->getIterType() == IterType::Broadcast, - "Stream axis ", - stream_axis, - " should be an iteration or broadcast axis."); + // Validate stream axis + validateStreamAxis(stream_axis, output); // Check if expression can be merged with previous stream for-loop - if (!new_top_level_exprs.empty() && - new_top_level_exprs.back()->isA() && - id_model.idGraph(IdMappingMode::BROADCAST) - .disjointValSets() - .strictAreMapped( - stream_axis, - new_top_level_exprs.back()->as()->iterDomain())) { + if (canMergeWithPreviousForLoop(new_top_level_exprs, stream_axis, id_model)) { // Merge with existing for-loop new_top_level_exprs.back()->as()->body().push_back(expr); } else { @@ -117,7 +170,6 @@ std::vector groupStreamParallelRegions( CircularBufferLoopStage::NotApplicable, 
/*circular_buffer_loop_stage_depth=*/0); for_loop->body().push_back(expr); - // replace the current expr by the for-loop containing it new_top_level_exprs.push_back(for_loop); } } @@ -150,41 +202,9 @@ std::vector processForLoopBodies( // Process input tensors for (auto* input : ir_utils::filterByType(expr->inputs())) { - // Find stream axis index in input tensor - int64_t input_stream_id_logical_index = -1; - for (auto id : input->getLoopDomain()) { - if (id_model.idGraph(IdMappingMode::BROADCAST) - .disjointValSets() - .strictAreMapped(for_loop->iterDomain(), id)) { - // Verify only one stream axis exists - NVF_CHECK( - input_stream_id_logical_index == -1, - "Expected at most one axis mapping to the stream axis ", - for_loop->iterDomain(), - " in the tensor ", - input, - " loop's domain ", - input->getLoopDomain()); - - // Find stream axis in logical domain - auto it_input_stream_id_logical = std::find( - input->getLogicalDomain().begin(), - input->getLogicalDomain().end(), - id); - NVF_CHECK( - it_input_stream_id_logical != input->getLogicalDomain().end(), - "Expected to find ", - id, - " in ", - input, - "'s logical domain ", - input->getLogicalDomain()); - input_stream_id_logical_index = std::distance( - input->getLogicalDomain().begin(), it_input_stream_id_logical); - } - } + int64_t input_stream_id_logical_index = findStreamAxisIndex( + input, for_loop->iterDomain(), id_model); - // Skip if no stream axis found if (input_stream_id_logical_index == -1) { continue; } @@ -214,42 +234,9 @@ std::vector processForLoopBodies( // Process output tensors for (auto* output : ir_utils::filterByType(expr->outputs())) { - // Find stream axis index in output tensor - int64_t output_stream_id_logical_index = -1; - for (auto id : output->getLoopDomain()) { - if (id_model.idGraph(IdMappingMode::BROADCAST) - .disjointValSets() - .strictAreMapped(for_loop->iterDomain(), id)) { - // Verify only one stream axis exists - NVF_CHECK( - output_stream_id_logical_index == -1, - 
"Expected at most one axis mapping to the stream axis ", - for_loop->iterDomain(), - " in the tensor ", - output, - " loop's domain ", - output->getLoopDomain()); - - // Find stream axis in logical domain - auto it_output_stream_id_logical = std::find( - output->getLogicalDomain().begin(), - output->getLogicalDomain().end(), - id); - NVF_CHECK( - it_output_stream_id_logical != output->getLogicalDomain().end(), - "Expected to find ", - id, - " in ", - output, - "'s logical domain ", - output->getLogicalDomain()); - output_stream_id_logical_index = std::distance( - output->getLogicalDomain().begin(), - it_output_stream_id_logical); - } - } + int64_t output_stream_id_logical_index = findStreamAxisIndex( + output, for_loop->iterDomain(), id_model); - // Skip if no stream axis found if (output_stream_id_logical_index == -1) { continue; } @@ -276,14 +263,14 @@ std::vector processForLoopBodies( if (running_output == output) { // Create alias for the sliced output TensorView* output_j_alias = - ops::newValLike( - output_j, output_j->dtype(), /*keep_reduction_axis=*/true) + ops::newValLike(output_j, output_j->dtype(), true) ->as(); hic->markAlias(output_j, output_j_alias); *it_running_expr = ir_utils::transferDefinitionToNewOutputs( running_expr, {output_j_alias}); } } + } } new_loop_body.push_back(*it_expr); @@ -353,6 +340,8 @@ std::vector addStreamManagement(std::vector top_level_exprs) { return new_top_level_exprs; } +} // anonymous namespace + // StreamParallelType pass implementation. // This pass handles stream parallelization of operations in a fusion. 
// It works by: @@ -407,3 +396,4 @@ void StreamParallelType::runPass(Fusion* fusion) { } } // namespace nvfuser::preseg_passes + From 7ae7c52f7340c85458b8b48d3d971532e183e36b Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 18 Apr 2025 05:38:36 -0700 Subject: [PATCH 26/38] improve comments clarity --- csrc/host_ir/pass/stream_parallel_type.cpp | 65 ++++++++++++++-------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp index d40c9cff147..dd19f4f6b5d 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -22,6 +22,7 @@ namespace nvfuser::preseg_passes { namespace { +// Finds the stream axis in a tensor's domain. There should be at most one stream axis. IterDomain* getStreamAxis(const std::vector& domain) { IterDomain* ret = nullptr; for (auto id : domain) { @@ -38,6 +39,7 @@ IterDomain* getStreamAxis(const std::vector& domain) { return ret; } +// Validates that a stream axis is valid in a tensor void validateStreamAxis(IterDomain* stream_axis, const TensorView* tv) { // Find the stream axis in the logical domain auto it_logical_stream_axis = std::find( @@ -60,12 +62,14 @@ void validateStreamAxis(IterDomain* stream_axis, const TensorView* tv) { " should be an iteration or broadcast axis."); } +// Checks if two iteration domains are mapped in the ID model bool areIdsMapped(const IdModel& id_model, IterDomain* id1, IterDomain* id2) { return id_model.idGraph(IdMappingMode::BROADCAST) .disjointValSets() .strictAreMapped(id1, id2); } +// Determines if a stream-parallel for-loop can be merged with the previous one bool canMergeWithPreviousForLoop( const std::vector& new_top_level_exprs, IterDomain* stream_axis, @@ -78,6 +82,7 @@ bool canMergeWithPreviousForLoop( new_top_level_exprs.back()->as()->iterDomain()); } +// Finds where a stream axis appears in a tensor's logical domain int64_t findStreamAxisIndex( const 
TensorView* tv, IterDomain* stream_axis, @@ -121,6 +126,7 @@ std::vector groupStreamParallelRegions( const IdModel& id_model) { std::vector new_top_level_exprs; + // Process each top-level expression for (auto expr : hic->topLevelExprs()) { // Skip expressions with no outputs if (expr->outputs().size() == 0) { @@ -128,7 +134,7 @@ std::vector groupStreamParallelRegions( continue; } - // Verify single output constraint + // Each expression should have exactly one output NVF_CHECK( expr->outputs().size() == 1, "Each expr should have at most one output."); @@ -137,13 +143,13 @@ std::vector groupStreamParallelRegions( TensorView* output = expr->output(0)->as(); IterDomain* stream_axis = getStreamAxis(output->getLoopDomain()); - // If no stream axis, keep expression as is + // If no stream axis found, keep the expression as is if (stream_axis == nullptr) { new_top_level_exprs.push_back(expr); continue; } - // Verify expression can be handled as a standalone host operation + // Verify that the expression can be handled as a standalone host operation NVF_ERROR( HostIrLower::isLowerableAsStandaloneHostOp(expr), "Stream parallel type not supported for expr ", @@ -152,12 +158,12 @@ std::vector groupStreamParallelRegions( // Validate stream axis validateStreamAxis(stream_axis, output); - // Check if expression can be merged with previous stream for-loop + // Check if we can merge this expression with the previous for-loop if (canMergeWithPreviousForLoop(new_top_level_exprs, stream_axis, id_model)) { - // Merge with existing for-loop + // Merge with existing for-loop by adding the expression to its body new_top_level_exprs.back()->as()->body().push_back(expr); } else { - // Create new for-loop for stream parallelization + // Create a new for-loop for stream parallelization auto* for_loop = IrBuilder::create( stream_axis, /*index=*/NamedScalar::getParallelIndex(ParallelType::Stream), @@ -169,6 +175,7 @@ std::vector groupStreamParallelRegions( /*unroll_required=*/false, 
CircularBufferLoopStage::NotApplicable, /*circular_buffer_loop_stage_depth=*/0); + // Add the expression to the new for-loop's body for_loop->body().push_back(expr); new_top_level_exprs.push_back(for_loop); } @@ -184,7 +191,9 @@ std::vector processForLoopBodies( std::vector top_level_exprs) { std::vector new_top_level_exprs; + // Process each top-level expression for (auto top_level_expr : top_level_exprs) { + // Skip non-for-loop expressions if (!top_level_expr->isA()) { new_top_level_exprs.push_back(top_level_expr); continue; @@ -192,24 +201,26 @@ std::vector processForLoopBodies( auto* for_loop = top_level_expr->as(); std::vector new_loop_body; + std::vector current_loop_body = for_loop->body().exprs(); // Process each expression in the loop body - std::vector current_loop_body = for_loop->body().exprs(); for (auto it_expr = current_loop_body.begin(); it_expr != current_loop_body.end(); ++it_expr) { Expr* expr = *it_expr; - // Process input tensors + // Process input tensors that might have stream axes for (auto* input : ir_utils::filterByType(expr->inputs())) { + // Find if this input has a stream axis int64_t input_stream_id_logical_index = findStreamAxisIndex( input, for_loop->iterDomain(), id_model); + // Skip if no stream axis found if (input_stream_id_logical_index == -1) { continue; } - // Create sliced tensor for current stream iteration + // Create a sliced version of the input tensor for this stream iterdomain TensorView* input_j = select( input, input_stream_id_logical_index, @@ -217,7 +228,7 @@ std::vector processForLoopBodies( /*keep_reduction_axis=*/true); new_loop_body.push_back(input_j->definition()); - // Update all expressions using this input + // Update all expressions that use this input to use the sliced version for (auto it_running_expr = current_loop_body.begin(); it_running_expr != current_loop_body.end(); ++it_running_expr) { @@ -232,28 +243,30 @@ std::vector processForLoopBodies( } } - // Process output tensors + // Process output 
tensors that might have stream axes for (auto* output : ir_utils::filterByType(expr->outputs())) { + // Find if this output has a stream axis int64_t output_stream_id_logical_index = findStreamAxisIndex( output, for_loop->iterDomain(), id_model); + // Skip if no stream axis found if (output_stream_id_logical_index == -1) { continue; } - // Create sliced tensor for current stream iteration + // Create a sliced version of the output tensor for this stream axis TensorView* output_j = select( output, output_stream_id_logical_index, for_loop->index(), /*keep_reduction_axis=*/true); - // Allocate memory for the output tensor + // Allocate memory for the output tensor, and place the allocation IR before the for-loop, at the top level new_top_level_exprs.push_back( IrBuilder::create(output, MemoryType::Global)); new_loop_body.push_back(output_j->definition()); - // Update all expressions using this output + // Update all expressions that use this output to use the sliced version for (auto it_running_expr = current_loop_body.begin(); it_running_expr != current_loop_body.end(); ++it_running_expr) { @@ -261,7 +274,8 @@ std::vector processForLoopBodies( for (auto* running_output : ir_utils::filterByType(running_expr->outputs())) { if (running_output == output) { - // Create alias for the sliced output + // Create an alias for the sliced output to maintain the original tensor's properties + // Alias is needed here to avoid that transferDefinitionToNewOutputs throws. Indeed, HIC does not make the SSA assumption, but the util functions we use (such as transferDefinitionToNewOutputs) do, therefore we need to create an alias for the sliced output to not create loops in the dag. 
TensorView* output_j_alias = ops::newValLike(output_j, output_j->dtype(), true) ->as(); @@ -270,13 +284,14 @@ std::vector processForLoopBodies( running_expr, {output_j_alias}); } } - } } + + // Add the original expression to the new loop body new_loop_body.push_back(*it_expr); } - // Update for-loop body with processed expressions + // Update the for-loop body with all the processed expressions for_loop->body().clear(); for (auto* expr : new_loop_body) { for_loop->body().push_back(expr); @@ -291,20 +306,23 @@ std::vector processForLoopBodies( std::vector addStreamManagement(std::vector top_level_exprs) { std::vector new_top_level_exprs; + // Process each top-level expression for (auto* top_level_expr : top_level_exprs) { + // Skip non-for-loop expressions if (!top_level_expr->isA()) { new_top_level_exprs.push_back(top_level_expr); continue; } + auto* for_loop = top_level_expr->as(); std::vector new_loop_body; - // Get current stream for later synchronization + // Get the current stream before entering the loop auto* get_current_stream = IrBuilder::create(); hir::Stream* original_stream = get_current_stream->stream(); new_loop_body.push_back(get_current_stream); - // Set up stream for current iteration + // Set up a new stream for this iteration based on the loop index auto* number_of_streams = IrBuilder::create("numberOfStreams", DataType::Int); auto* stream_index = mod(for_loop->index(), number_of_streams); @@ -312,24 +330,24 @@ std::vector addStreamManagement(std::vector top_level_exprs) { auto* set_stream = IrBuilder::create(stream); new_loop_body.push_back(set_stream); - // Synchronize with original stream + // Synchronize with the original stream before starting computation auto* initial_sync_stream = IrBuilder::create(original_stream); new_loop_body.push_back(initial_sync_stream); - // Add the actual computation expressions + // Add all the expressions to the loop body for (auto* expr : for_loop->body().exprs()) { new_loop_body.push_back(expr); } - // Restore 
original stream and synchronize + // Restore the original stream and synchronize with the iteration's stream auto* set_back_original_stream = IrBuilder::create(original_stream); new_loop_body.push_back(set_back_original_stream); auto* sync_stream = IrBuilder::create(stream); new_loop_body.push_back(sync_stream); - // Update for-loop body with stream management + // Update the for-loop body with the new expressions for_loop->body().clear(); for (auto* expr : new_loop_body) { for_loop->body().push_back(expr); @@ -396,4 +414,3 @@ void StreamParallelType::runPass(Fusion* fusion) { } } // namespace nvfuser::preseg_passes - From 6dd673f4b39f0b9db7809be9110d6f998da1bd8e Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 18 Apr 2025 05:49:22 -0700 Subject: [PATCH 27/38] more comments --- csrc/host_ir/pass/stream_parallel_type.cpp | 42 +++++++++++++--------- csrc/ops/indexing.h | 3 ++ 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp index dd19f4f6b5d..63ebc9fc42c 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -22,7 +22,8 @@ namespace nvfuser::preseg_passes { namespace { -// Finds the stream axis in a tensor's domain. There should be at most one stream axis. +// Finds the stream axis in a tensor's domain. There should be at most one +// stream axis. 
IterDomain* getStreamAxis(const std::vector& domain) { IterDomain* ret = nullptr; for (auto id : domain) { @@ -102,9 +103,7 @@ int64_t findStreamAxisIndex( // Find stream axis in logical domain auto it_stream_id_logical = std::find( - tv->getLogicalDomain().begin(), - tv->getLogicalDomain().end(), - id); + tv->getLogicalDomain().begin(), tv->getLogicalDomain().end(), id); NVF_CHECK( it_stream_id_logical != tv->getLogicalDomain().end(), "Expected to find ", @@ -113,8 +112,8 @@ int64_t findStreamAxisIndex( tv, "'s logical domain ", tv->getLogicalDomain()); - stream_id_logical_index = std::distance( - tv->getLogicalDomain().begin(), it_stream_id_logical); + stream_id_logical_index = + std::distance(tv->getLogicalDomain().begin(), it_stream_id_logical); } } return stream_id_logical_index; @@ -159,7 +158,8 @@ std::vector groupStreamParallelRegions( validateStreamAxis(stream_axis, output); // Check if we can merge this expression with the previous for-loop - if (canMergeWithPreviousForLoop(new_top_level_exprs, stream_axis, id_model)) { + if (canMergeWithPreviousForLoop( + new_top_level_exprs, stream_axis, id_model)) { // Merge with existing for-loop by adding the expression to its body new_top_level_exprs.back()->as()->body().push_back(expr); } else { @@ -212,15 +212,16 @@ std::vector processForLoopBodies( // Process input tensors that might have stream axes for (auto* input : ir_utils::filterByType(expr->inputs())) { // Find if this input has a stream axis - int64_t input_stream_id_logical_index = findStreamAxisIndex( - input, for_loop->iterDomain(), id_model); + int64_t input_stream_id_logical_index = + findStreamAxisIndex(input, for_loop->iterDomain(), id_model); // Skip if no stream axis found if (input_stream_id_logical_index == -1) { continue; } - // Create a sliced version of the input tensor for this stream iterdomain + // Create a sliced version of the input tensor for this stream + // iterdomain TensorView* input_j = select( input, 
input_stream_id_logical_index, @@ -246,8 +247,8 @@ std::vector processForLoopBodies( // Process output tensors that might have stream axes for (auto* output : ir_utils::filterByType(expr->outputs())) { // Find if this output has a stream axis - int64_t output_stream_id_logical_index = findStreamAxisIndex( - output, for_loop->iterDomain(), id_model); + int64_t output_stream_id_logical_index = + findStreamAxisIndex(output, for_loop->iterDomain(), id_model); // Skip if no stream axis found if (output_stream_id_logical_index == -1) { @@ -261,7 +262,8 @@ std::vector processForLoopBodies( for_loop->index(), /*keep_reduction_axis=*/true); - // Allocate memory for the output tensor, and place the allocation IR before the for-loop, at the top level + // Allocate memory for the output tensor, and place the allocation IR + // before the for-loop, at the top level new_top_level_exprs.push_back( IrBuilder::create(output, MemoryType::Global)); new_loop_body.push_back(output_j->definition()); @@ -274,8 +276,12 @@ std::vector processForLoopBodies( for (auto* running_output : ir_utils::filterByType(running_expr->outputs())) { if (running_output == output) { - // Create an alias for the sliced output to maintain the original tensor's properties - // Alias is needed here to avoid that transferDefinitionToNewOutputs throws. Indeed, HIC does not make the SSA assumption, but the util functions we use (such as transferDefinitionToNewOutputs) do, therefore we need to create an alias for the sliced output to not create loops in the dag. + // Create an alias for the sliced output to maintain the original + // tensor's properties Alias is needed here to avoid that + // transferDefinitionToNewOutputs throws. Indeed, HIC does not + // make the SSA assumption, but the util functions we use (such as + // transferDefinitionToNewOutputs) do, therefore we need to create + // an alias for the sliced output to not create loops in the dag. 
TensorView* output_j_alias = ops::newValLike(output_j, output_j->dtype(), true) ->as(); @@ -401,10 +407,12 @@ void StreamParallelType::runPass(Fusion* fusion) { id_model.buildBroadcastGraph(); // Step 1: Group expressions into stream-parallel regions - std::vector top_level_exprs = groupStreamParallelRegions(hic, id_model); + std::vector top_level_exprs = + groupStreamParallelRegions(hic, id_model); // Step 2: Process for-loop bodies by slicing tensors - top_level_exprs = processForLoopBodies(hic, id_model, std::move(top_level_exprs)); + top_level_exprs = + processForLoopBodies(hic, id_model, std::move(top_level_exprs)); // Step 3: Add stream management and synchronization top_level_exprs = addStreamManagement(std::move(top_level_exprs)); diff --git a/csrc/ops/indexing.h b/csrc/ops/indexing.h index 7a219c534a3..5e0410d95d5 100644 --- a/csrc/ops/indexing.h +++ b/csrc/ops/indexing.h @@ -15,6 +15,9 @@ namespace nvfuser { +// When keep_reduction_axis is true, all reduction axis are kept in the +// SelectOp's consumer. This is used in the context of HostIr where SelectOp is +// used to index into Stream-parallelized axes. 
NVF_API TensorView* select( TensorView* tv, int64_t dim, From db90ef0749725656a4ec3d668668aed3288b2a3d Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 23 Apr 2025 14:17:26 -0700 Subject: [PATCH 28/38] add HirAliasSelect --- csrc/dispatch.h | 3 ++- csrc/host_ir/executor.cpp | 9 ++++++++ csrc/host_ir/executor.h | 1 + csrc/host_ir/host_ir.cpp | 45 +++++++++++++++++++++++++++++++++++++ csrc/host_ir/host_ir.h | 43 +++++++++++++++++++++++++++++++++++ tests/cpp/test_host_irs.cpp | 38 +++++++++++++++++++++++++++++++ 6 files changed, 138 insertions(+), 1 deletion(-) diff --git a/csrc/dispatch.h b/csrc/dispatch.h index 218ccd8267a..b9874860bd6 100644 --- a/csrc/dispatch.h +++ b/csrc/dispatch.h @@ -158,7 +158,8 @@ class Val; f(Synchronize); \ f(StartCoalescing); \ f(EndCoalescing); \ - f(ShareMemHandles); + f(ShareMemHandles); \ + f(HirAliasSelect); // Forward declarations for all Val and Expr types diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 12cf344e549..66ac0ef1d64 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -751,6 +751,15 @@ void HostIrEvaluator::handle(ReductionOp* reduction_op) { } } +void HostIrEvaluator::handle(HirAliasSelect* hir_alias_select) { + auto index = + expr_evaluator_.evaluate(hir_alias_select->index()).as(); + auto input = getKnownConcreteValue(hir_alias_select->in()->as()) + .as(); + int64_t axis = hir_alias_select->axis(); + bind(hir_alias_select->out(), input.select(axis, index)); +} + void HostIrEvaluator::unhandled(Statement* stmt) { NVF_ERROR(stmt->isA(), stmt, " must be an Expr"); auto* expr = stmt->as(); diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index 89ac5119681..3f147b7801b 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -142,6 +142,7 @@ class HostIrEvaluator final : public OptOutDispatch { void handle(BinaryOp* binary_op) override; void handle(ReductionOp* reduction_op) override; void handle(ShareMemHandles* share_mem_handles) override; + 
void handle(HirAliasSelect* hir_alias_select) override; void unhandled(Statement* stmt) override; c10::cuda::CUDAStream getCUDAStream(Stream* stream); diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp index 9e1386d0d3d..bf3d5cef9eb 100644 --- a/csrc/host_ir/host_ir.cpp +++ b/csrc/host_ir/host_ir.cpp @@ -355,6 +355,51 @@ std::string ShareMemHandles::toInlineString(int indent_size) const { NVF_THROW("Cannot be printed inline"); } +HirAliasSelect::HirAliasSelect( + IrBuilderPasskey passkey, + TensorView* in, + TensorView* out, + int64_t axis, + Val* index) + : Expr(passkey, {in, index}, {}, {}) { + NVF_ERROR(passkey.ir_container_ != nullptr); + NVF_ERROR( + passkey.ir_container_->isA(), + this, + "must be registered in a HostIrContainer"); + NVF_ERROR( + static_cast(in->getLogicalDomain().size()) > axis, + "Select axis ", + axis, + " is out of bounds for tensor ", + in->toString(), + " with ", + in->getLogicalDomain().size(), + " dimensions"); + // "out" is not added as an output because the current op doesn't "define" it, + // but rather sets its allocation. Since "out" will be used in another + // producing expression, this avoids unnecessary cyclic dependencies. This + // ressembles how kir::Allocate treats its allocated TensorView. 
+ addAttribute(out); + addDataAttribute(axis); +} + +NVFUSER_DEFINE_CLONE_AND_CREATE(HirAliasSelect) + +std::string HirAliasSelect::toString(int indent_size) const { + std::stringstream ss; + indent(ss, indent_size) << out()->toString() << "\n"; + indent_size++; + indent(ss, indent_size) << " = HirAliasSelect( " << in()->toString() + << ", axis = " << in()->getLogicalDomain().at(axis()) + << ", index = " << index()->toString() << " )\n"; + return ss.str(); +} + +std::string HirAliasSelect::toInlineString(int indent_size) const { + NVF_THROW("Cannot be printed inline"); +} + } // namespace hir } // namespace nvfuser diff --git a/csrc/host_ir/host_ir.h b/csrc/host_ir/host_ir.h index bad3a6ef722..d267d23ab1f 100644 --- a/csrc/host_ir/host_ir.h +++ b/csrc/host_ir/host_ir.h @@ -351,6 +351,49 @@ class ShareMemHandles : public Expr { } }; +// This op mimicks the semantics of SelectOp but is used in HIR non-SSA context +// to index into a TensorView, returning an alias "slice" of the original +// TensorView. 
+class HirAliasSelect : public Expr { + public: + using Expr::Expr; + HirAliasSelect( + IrBuilderPasskey passkey, + TensorView* in, + TensorView* out, + int64_t axis, + Val* index); + + HirAliasSelect(const HirAliasSelect& other) = delete; + HirAliasSelect& operator=(const HirAliasSelect& other) = delete; + HirAliasSelect(HirAliasSelect&& other) = delete; + HirAliasSelect& operator=(HirAliasSelect&& other) = delete; + + NVFUSER_DECLARE_CLONE_AND_CREATE + + std::string toString(int indent_size = 0) const override; + std::string toInlineString(int indent_size = 0) const override; + const char* getOpString() const override { + return "hir::HirAliasSelect"; + } + + TensorView* in() const { + return inputs().at(0)->as(); + } + + TensorView* out() const { + return attributeVal(0)->as(); + } + + int64_t axis() const { + return attribute(1); + } + + Val* index() const { + return inputs().at(1); + } +}; + } // namespace hir } // namespace nvfuser diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index 633ebc83504..eb1291de57d 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -1487,6 +1487,44 @@ TEST_F(HirReductionOpTest, NonPreAllocatedOutputs) { << "Expected output: " << expected_out; } +using HirAliasSelectHostIrTest = NVFuserTest; + +TEST_F(HirAliasSelectHostIrTest, SelectingTensor) { + constexpr int64_t ndims = 2; + constexpr int64_t dim = 1; + constexpr int64_t index = 3; + const std::vector input_sizes = {32, 32}; + + ASSERT_LT(dim, ndims); + ASSERT_EQ(input_sizes.size(), ndims); + ASSERT_LT(index, input_sizes.at(dim)); + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + TensorView* in = makeContigTensor(ndims); + TensorView* out = makeContigTensor(ndims - 1); + auto* index_val = IrBuilder::create(index, DataType::Index); + auto* select_op = IrBuilder::create(in, out, dim, index_val); + + hic->addInput(in); + hic->addOutput(out); + hic->pushBackTopLevelExprs(select_op); + + HostIrEvaluator 
hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0).dtype(torch::kFloat); + auto in_aten = at::randn(input_sizes, options); + std::unordered_map concrete_input_buffers = { + {in, in_aten}}; + + auto out_aten = hie.runWithInput(concrete_input_buffers)[0].as(); + + // validate + auto ref_out = in_aten.select(dim, index); + EXPECT_TRUE(ref_out.equal(out_aten)); +} + } // namespace hir } // namespace nvfuser From e32653a383c8e1689d8a1a8d5dbf5fd1d409ea92 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 23 Apr 2025 14:18:05 -0700 Subject: [PATCH 29/38] replace SelectOp by HirAliasSelect in stream lowering --- csrc/host_ir/container.cpp | 10 ++-- csrc/host_ir/pass/stream_parallel_type.cpp | 56 ++++++++++++---------- csrc/ops/indexing.cpp | 10 +--- csrc/ops/indexing.h | 9 +--- csrc/ops/utils.cpp | 27 ++++------- csrc/ops/utils.h | 14 ++---- 6 files changed, 52 insertions(+), 74 deletions(-) diff --git a/csrc/host_ir/container.cpp b/csrc/host_ir/container.cpp index 83e668770fc..9fdcfa376a6 100644 --- a/csrc/host_ir/container.cpp +++ b/csrc/host_ir/container.cpp @@ -35,11 +35,13 @@ Stream* HostIrContainer::getDefaultStream() { std::ostream& HostIrContainer::print(std::ostream& os) const { IrMathPrinter op_exprs(os); op_exprs.handle(this); - os << "Aliases:{"; - for (const auto& alias : alias_) { - os << "\n " << alias.first << " -> " << alias.second; + if (alias_.size() > 0) { + os << "Aliases:{"; + for (const auto& alias : alias_) { + os << "\n " << alias.first << " -> " << alias.second; + } + os << "\n}\n"; } - os << "\n}\n"; return os; } diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp index 63ebc9fc42c..3d63290ef17 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -119,6 +119,30 @@ int64_t findStreamAxisIndex( return stream_id_logical_index; } +// Helper function to create a sliced version of a tensor for stream +// parallelization 
+hir::HirAliasSelect* createSlicedTensor( + TensorView* tensor, + int64_t stream_axis_index, + Val* index) { + auto dom = tensor->getLogicalDomain(); + + std::vector new_root; + new_root.reserve(dom.size() - 1); + + for (auto i : arange((int64_t)dom.size())) { + if (i != stream_axis_index) { + new_root.emplace_back(dom[i]->cloneWithoutRFactor()); + } + } + + auto td = IrBuilder::create( + new_root, TensorDomain::getContiguityFilledWith(new_root, true)); + auto out = IrBuilder::create(td, *tensor->getDataType()); + return IrBuilder::create( + tensor, out, stream_axis_index, index); +} + // Step 1: Group expressions into stream-parallel regions std::vector groupStreamParallelRegions( hir::HostIrContainer* hic, @@ -222,12 +246,9 @@ std::vector processForLoopBodies( // Create a sliced version of the input tensor for this stream // iterdomain - TensorView* input_j = select( - input, - input_stream_id_logical_index, - for_loop->index(), - /*keep_reduction_axis=*/true); - new_loop_body.push_back(input_j->definition()); + hir::HirAliasSelect* input_slicing = createSlicedTensor( + input, input_stream_id_logical_index, for_loop->index()); + new_loop_body.push_back(input_slicing); // Update all expressions that use this input to use the sliced version for (auto it_running_expr = current_loop_body.begin(); @@ -238,7 +259,7 @@ std::vector processForLoopBodies( ir_utils::filterByType(running_expr->inputs())) { if (running_input == input) { *it_running_expr = ir_utils::replaceValInExprInputs( - running_expr, input, input_j); + running_expr, input, input_slicing->out()); } } } @@ -256,17 +277,14 @@ std::vector processForLoopBodies( } // Create a sliced version of the output tensor for this stream axis - TensorView* output_j = select( - output, - output_stream_id_logical_index, - for_loop->index(), - /*keep_reduction_axis=*/true); + hir::HirAliasSelect* output_slicing = createSlicedTensor( + output, output_stream_id_logical_index, for_loop->index()); // Allocate memory for the 
output tensor, and place the allocation IR // before the for-loop, at the top level new_top_level_exprs.push_back( IrBuilder::create(output, MemoryType::Global)); - new_loop_body.push_back(output_j->definition()); + new_loop_body.push_back(output_slicing); // Update all expressions that use this output to use the sliced version for (auto it_running_expr = current_loop_body.begin(); @@ -276,18 +294,8 @@ std::vector processForLoopBodies( for (auto* running_output : ir_utils::filterByType(running_expr->outputs())) { if (running_output == output) { - // Create an alias for the sliced output to maintain the original - // tensor's properties Alias is needed here to avoid that - // transferDefinitionToNewOutputs throws. Indeed, HIC does not - // make the SSA assumption, but the util functions we use (such as - // transferDefinitionToNewOutputs) do, therefore we need to create - // an alias for the sliced output to not create loops in the dag. - TensorView* output_j_alias = - ops::newValLike(output_j, output_j->dtype(), true) - ->as(); - hic->markAlias(output_j, output_j_alias); *it_running_expr = ir_utils::transferDefinitionToNewOutputs( - running_expr, {output_j_alias}); + running_expr, {output_slicing->out()}); } } } diff --git a/csrc/ops/indexing.cpp b/csrc/ops/indexing.cpp index 80c0ff84b85..5ff75065ff2 100644 --- a/csrc/ops/indexing.cpp +++ b/csrc/ops/indexing.cpp @@ -19,14 +19,8 @@ namespace nvfuser { -TensorView* select( - TensorView* tv, - int64_t dim, - Val* index, - bool keep_reduction_axis) { - auto dom = keep_reduction_axis - ? 
tv->getLogicalDomain() - : TensorDomain::noReductions(tv->getLogicalDomain()); +TensorView* select(TensorView* tv, int64_t dim, Val* index) { + auto dom = TensorDomain::noReductions(tv->getLogicalDomain()); NVF_CHECK(!dom.empty(), "select can not be applied to 0d tensor."); std::vector new_root; diff --git a/csrc/ops/indexing.h b/csrc/ops/indexing.h index 5e0410d95d5..c8152c33f82 100644 --- a/csrc/ops/indexing.h +++ b/csrc/ops/indexing.h @@ -15,14 +15,7 @@ namespace nvfuser { -// When keep_reduction_axis is true, all reduction axis are kept in the -// SelectOp's consumer. This is used in the context of HostIr where SelectOp is -// used to index into Stream-parallelized axes. -NVF_API TensorView* select( - TensorView* tv, - int64_t dim, - Val* index, - bool keep_reduction_axis = false); +NVF_API TensorView* select(TensorView* tv, int64_t dim, Val* index); // torch.index_select NVF_API TensorView* indexSelect( diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp index 5d32c22e212..8d3870d1a84 100644 --- a/csrc/ops/utils.cpp +++ b/csrc/ops/utils.cpp @@ -432,9 +432,7 @@ IterDomain* newOutputIterDomain( #pragma GCC diagnostic pop #endif -std::vector newOutputDomain( - const std::vector& vals, - bool keep_reduction_axis) { +std::vector newOutputDomain(const std::vector& vals) { std::vector tvs; for (auto val : vals) { if (auto* tv = dynamic_cast(val)) { @@ -445,20 +443,14 @@ std::vector newOutputDomain( !tvs.empty(), "Tried to create new output TensorView but received empty list."); - auto getLogicalDomain = - [keep_reduction_axis](TensorView* tv) -> std::vector { - return keep_reduction_axis - ? 
tv->getLogicalDomain() - : TensorDomain::noReductions(tv->getLogicalDomain()); - }; - - std::vector out_domain(getLogicalDomain(tvs[0]).size(), nullptr); + std::vector out_domain( + TensorDomain::noReductions(tvs[0]->getLogicalDomain()).size(), nullptr); for (const auto dim_i : arange(out_domain.size())) { std::vector input_ids; input_ids.reserve(tvs.size()); for (auto* tv : tvs) { - auto dom = getLogicalDomain(tv); + auto dom = TensorDomain::noReductions(tv->getLogicalDomain()); input_ids.emplace_back(dom[dim_i]); } out_domain[dim_i] = newOutputIterDomain(input_ids); @@ -466,11 +458,8 @@ std::vector newOutputDomain( return out_domain; } -TensorView* newOutputTV( - const std::vector& vals, - DataType dtype, - bool keep_reduction_axis) { - auto out_domain = newOutputDomain(vals, keep_reduction_axis); +TensorView* newOutputTV(const std::vector& vals, DataType dtype) { + auto out_domain = newOutputDomain(vals); auto* new_out = IrBuilder::create( IrBuilder::create( out_domain, TensorDomain::getContiguityFilledWith(out_domain, true)), @@ -513,12 +502,12 @@ std::vector maybeBroadcast(const std::vector& vals) { return out_vals; } -Val* newValLike(Val* val, DataType dtype, bool keep_reduction_axis) { +Val* newValLike(Val* val, DataType dtype) { NVF_CHECK( dtype != DataType::Null, "Invalid datatype provided for new value."); if (val->isA()) { - return newOutputTV({val}, dtype, keep_reduction_axis); + return newOutputTV({val}, dtype); } return newScalar(ValType::Others, dtype); diff --git a/csrc/ops/utils.h b/csrc/ops/utils.h index 1a2abda03fc..94d6391cf45 100644 --- a/csrc/ops/utils.h +++ b/csrc/ops/utils.h @@ -99,21 +99,13 @@ IterDomain* newOutputIterDomain( // output tensorview, e.g., for BinaryOp. `vals` can contain scalars, e.g, when // creating the output TensorView for `tv0+scalar`. This is for convenience and // scalars will be ignored. 
-std::vector newOutputDomain( - const std::vector& vals, - bool keep_reduction_axis = false); +std::vector newOutputDomain(const std::vector& vals); -TensorView* newOutputTV( - const std::vector& vals, - DataType dtype, - bool keep_reduction_axis = false); +TensorView* newOutputTV(const std::vector& vals, DataType dtype); std::vector maybeBroadcast(const std::vector& vals); -NVF_API Val* newValLike( - Val* val, - DataType dtype, - bool keep_reduction_axis = false); +NVF_API Val* newValLike(Val* val, DataType dtype); // returns the minimum init value for reduction: // -inf for floating type; From a50b53c90e744cf469779dcccdc613c6af68958f Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 23 Apr 2025 14:48:36 -0700 Subject: [PATCH 30/38] add cache for tensor slicing --- csrc/host_ir/pass/stream_parallel_type.cpp | 92 ++++++++++++++++------ 1 file changed, 68 insertions(+), 24 deletions(-) diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp index 3d63290ef17..6999b0c4ca5 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -119,29 +119,67 @@ int64_t findStreamAxisIndex( return stream_id_logical_index; } -// Helper function to create a sliced version of a tensor for stream -// parallelization -hir::HirAliasSelect* createSlicedTensor( - TensorView* tensor, - int64_t stream_axis_index, - Val* index) { - auto dom = tensor->getLogicalDomain(); - - std::vector new_root; - new_root.reserve(dom.size() - 1); - - for (auto i : arange((int64_t)dom.size())) { - if (i != stream_axis_index) { - new_root.emplace_back(dom[i]->cloneWithoutRFactor()); +// Cache for tensor slicing operations in stream parallelization. +// This cache stores previously created sliced versions of tensors to avoid +// redundant slicing operations. 
A sliced tensor is created by removing a +// specific axis (stream axis) from the tensor's domain and creating a new +// tensor that represents a slice of the original tensor at a given index. +// The cache key is a tuple of (original tensor, axis index to remove, slice +// index). +struct TensorSlicingCache { + // Type aliases + using Key = std::tuple; + + // Custom hash function for the tuple used as cache key + struct Hash { + size_t operator()(const Key& t) const { + auto [tv, idx, val] = t; + return std::hash{}(tv) ^ std::hash{}(idx) ^ + std::hash{}(val); } + }; + + // Map type for storing cached sliced tensors + using Map = std::unordered_map; + + // Get the expr producing the indexed version of a tensor. If the expr already + // exists in the cache, returns the cached version. Otherwise, creates a new + // expr, producing a tensor "selected" on its dimension `stream_axis_index` at + // index `index`. Returns a pair of (expr, is_new) where is_new indicates + // whether the expr was newly created. 
+ std::pair get( + TensorView* tensor, + int64_t stream_axis_index, + Val* index) { + auto key = std::make_tuple(tensor, stream_axis_index, index); + auto it = cache_.find(key); + if (it != cache_.end()) { + return {it->second, false}; + } + + auto dom = tensor->getLogicalDomain(); + std::vector new_root; + new_root.reserve(dom.size() - 1); + + for (auto i : arange((int64_t)dom.size())) { + if (i != stream_axis_index) { + new_root.emplace_back(dom[i]->cloneWithoutRFactor()); + } + } + + auto td = IrBuilder::create( + new_root, TensorDomain::getContiguityFilledWith(new_root, true)); + auto out = IrBuilder::create(td, *tensor->getDataType()); + auto result = IrBuilder::create( + tensor, out, stream_axis_index, index); + + cache_[key] = result; + return {result, true}; } - auto td = IrBuilder::create( - new_root, TensorDomain::getContiguityFilledWith(new_root, true)); - auto out = IrBuilder::create(td, *tensor->getDataType()); - return IrBuilder::create( - tensor, out, stream_axis_index, index); -} + private: + Map cache_; // Storage for cached sliced tensors +}; // Step 1: Group expressions into stream-parallel regions std::vector groupStreamParallelRegions( @@ -214,6 +252,8 @@ std::vector processForLoopBodies( const IdModel& id_model, std::vector top_level_exprs) { std::vector new_top_level_exprs; + // Create a cache for tensor indexing + TensorSlicingCache tensor_slicing_cache; // Process each top-level expression for (auto top_level_expr : top_level_exprs) { @@ -246,9 +286,11 @@ std::vector processForLoopBodies( // Create a sliced version of the input tensor for this stream // iterdomain - hir::HirAliasSelect* input_slicing = createSlicedTensor( + auto [input_slicing, is_new] = tensor_slicing_cache.get( input, input_stream_id_logical_index, for_loop->index()); - new_loop_body.push_back(input_slicing); + if (is_new) { + new_loop_body.push_back(input_slicing); + } // Update all expressions that use this input to use the sliced version for (auto it_running_expr = 
current_loop_body.begin(); @@ -277,14 +319,16 @@ std::vector processForLoopBodies( } // Create a sliced version of the output tensor for this stream axis - hir::HirAliasSelect* output_slicing = createSlicedTensor( + auto [output_slicing, is_new] = tensor_slicing_cache.get( output, output_stream_id_logical_index, for_loop->index()); + if (is_new) { + new_loop_body.push_back(output_slicing); + } // Allocate memory for the output tensor, and place the allocation IR // before the for-loop, at the top level new_top_level_exprs.push_back( IrBuilder::create(output, MemoryType::Global)); - new_loop_body.push_back(output_slicing); // Update all expressions that use this output to use the sliced version for (auto it_running_expr = current_loop_body.begin(); From d01c5a27f64db79e568827e25730d9ab6b84cfa4 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 23 Apr 2025 15:58:29 -0700 Subject: [PATCH 31/38] separate out tensor allocation logic --- csrc/host_ir/pass/stream_parallel_type.cpp | 45 +++++++++++++++++----- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp index 6999b0c4ca5..7532c145ba1 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -246,7 +246,36 @@ std::vector groupStreamParallelRegions( return new_top_level_exprs; } -// Step 2: Process for-loop bodies by slicing tensors +// Helper function to add allocations for tensors that need them +std::vector addTensorAllocations( + std::vector top_level_exprs, + const IdModel& id_model) { + std::vector new_top_level_exprs; + + for (auto* expr : top_level_exprs) { + if (expr->isA()) { + // add allocations for tensors produced in the loop that have a stream axes + auto* for_loop = expr->as(); + for (auto* body_expr : for_loop->body().exprs()) { + for (auto* output : ir_utils::filterByType(body_expr->outputs())) { + if (findStreamAxisIndex(output, for_loop->iterDomain(), 
id_model) != -1) { + new_top_level_exprs.push_back( + IrBuilder::create(output, MemoryType::Global)); + } + } + } + } + new_top_level_exprs.push_back(expr); + } + + // Add all original expressions + new_top_level_exprs.insert( + new_top_level_exprs.end(), top_level_exprs.begin(), top_level_exprs.end()); + + return new_top_level_exprs; +} + +// Step 3: Process for-loop bodies by slicing tensors std::vector processForLoopBodies( hir::HostIrContainer* hic, const IdModel& id_model, @@ -325,11 +354,6 @@ std::vector processForLoopBodies( new_loop_body.push_back(output_slicing); } - // Allocate memory for the output tensor, and place the allocation IR - // before the for-loop, at the top level - new_top_level_exprs.push_back( - IrBuilder::create(output, MemoryType::Global)); - // Update all expressions that use this output to use the sliced version for (auto it_running_expr = current_loop_body.begin(); it_running_expr != current_loop_body.end(); @@ -424,7 +448,7 @@ std::vector addStreamManagement(std::vector top_level_exprs) { // 1. Identifying stream-parallelized axes in tensor operations // 2. Grouping compatible operations into stream-parallel for-loops // 3. Setting up proper stream synchronization and management -// +// 4. 
Adding allocations for tensors that need them // The pass ensures that: // - Input tensors don't have stream axes // - Only one stream axis exists per tensor @@ -462,11 +486,14 @@ void StreamParallelType::runPass(Fusion* fusion) { std::vector top_level_exprs = groupStreamParallelRegions(hic, id_model); - // Step 2: Process for-loop bodies by slicing tensors + // Step 2: Add allocations for tensors that need them + top_level_exprs = addTensorAllocations(std::move(top_level_exprs), id_model); + + // Step 3: Process for-loop bodies by slicing tensors top_level_exprs = processForLoopBodies(hic, id_model, std::move(top_level_exprs)); - // Step 3: Add stream management and synchronization + // Step 4: Add stream management and synchronization top_level_exprs = addStreamManagement(std::move(top_level_exprs)); // Update the container's top-level expressions From 25b7695b2db13c85020845fdca4bb1e91f7e8359 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 23 Apr 2025 16:12:23 -0700 Subject: [PATCH 32/38] minor cleanup --- csrc/host_ir/pass/stream_parallel_type.cpp | 147 ++++++--------------- 1 file changed, 42 insertions(+), 105 deletions(-) diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp index 7532c145ba1..8bea4c3430f 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -183,12 +183,11 @@ struct TensorSlicingCache { // Step 1: Group expressions into stream-parallel regions std::vector groupStreamParallelRegions( - hir::HostIrContainer* hic, + const std::vector& top_level_exprs, const IdModel& id_model) { std::vector new_top_level_exprs; - // Process each top-level expression - for (auto expr : hic->topLevelExprs()) { + for (auto* expr : top_level_exprs) { // Skip expressions with no outputs if (expr->outputs().size() == 0) { new_top_level_exprs.push_back(expr); @@ -229,9 +228,9 @@ std::vector groupStreamParallelRegions( auto* for_loop = IrBuilder::create( stream_axis, 
/*index=*/NamedScalar::getParallelIndex(ParallelType::Stream), - /*start=*/hic->zeroVal(), + /*start=*/FusionGuard::getCurFusion()->zeroVal(), /*stop=*/stream_axis->extent(), - /*step=*/hic->oneVal(), + /*step=*/FusionGuard::getCurFusion()->oneVal(), /*vectorize=*/false, /*vectorize_shift=*/nullptr, /*unroll_required=*/false, @@ -254,11 +253,14 @@ std::vector addTensorAllocations( for (auto* expr : top_level_exprs) { if (expr->isA()) { - // add allocations for tensors produced in the loop that have a stream axes + // add allocations for tensors produced in the loop that have a stream + // axis auto* for_loop = expr->as(); for (auto* body_expr : for_loop->body().exprs()) { - for (auto* output : ir_utils::filterByType(body_expr->outputs())) { - if (findStreamAxisIndex(output, for_loop->iterDomain(), id_model) != -1) { + for (auto* output : + ir_utils::filterByType(body_expr->outputs())) { + if (findStreamAxisIndex(output, for_loop->iterDomain(), id_model) != + -1) { new_top_level_exprs.push_back( IrBuilder::create(output, MemoryType::Global)); } @@ -268,131 +270,68 @@ std::vector groupStreamParallelRegions( new_top_level_exprs.push_back(expr); } - // Add all original expressions - new_top_level_exprs.insert( - new_top_level_exprs.end(), top_level_exprs.begin(), top_level_exprs.end()); - return new_top_level_exprs; } // Step 3: Process for-loop bodies by slicing tensors std::vector processForLoopBodies( - hir::HostIrContainer* hic, - const IdModel& id_model, - std::vector top_level_exprs) { - std::vector new_top_level_exprs; - // Create a cache for tensor indexing + std::vector top_level_exprs, + const IdModel& id_model) { TensorSlicingCache tensor_slicing_cache; - // Process each top-level expression - for (auto top_level_expr : top_level_exprs) { - // Skip non-for-loop expressions - if (!top_level_expr->isA()) { - new_top_level_exprs.push_back(top_level_expr); + for (auto* expr : top_level_exprs) { + if (!expr->isA()) { continue; } - auto* for_loop = top_level_expr->as(); 
+ auto* for_loop = expr->as(); std::vector new_loop_body; - std::vector current_loop_body = for_loop->body().exprs(); - - // Process each expression in the loop body - for (auto it_expr = current_loop_body.begin(); - it_expr != current_loop_body.end(); - ++it_expr) { - Expr* expr = *it_expr; - - // Process input tensors that might have stream axes - for (auto* input : ir_utils::filterByType(expr->inputs())) { - // Find if this input has a stream axis - int64_t input_stream_id_logical_index = - findStreamAxisIndex(input, for_loop->iterDomain(), id_model); - - // Skip if no stream axis found - if (input_stream_id_logical_index == -1) { - continue; - } - // Create a sliced version of the input tensor for this stream - // iterdomain - auto [input_slicing, is_new] = tensor_slicing_cache.get( - input, input_stream_id_logical_index, for_loop->index()); + // Lambda to process a tensor in a for-loop body + auto processTensor = [&](Expr*& expr, TensorView* tensor) { + if (auto stream_idx = + findStreamAxisIndex(tensor, for_loop->iterDomain(), id_model); + stream_idx != -1) { + auto [slicing, is_new] = + tensor_slicing_cache.get(tensor, stream_idx, for_loop->index()); if (is_new) { - new_loop_body.push_back(input_slicing); + new_loop_body.push_back(slicing); } - - // Update all expressions that use this input to use the sliced version - for (auto it_running_expr = current_loop_body.begin(); - it_running_expr != current_loop_body.end(); - ++it_running_expr) { - Expr* running_expr = *it_running_expr; - for (auto* running_input : - ir_utils::filterByType(running_expr->inputs())) { - if (running_input == input) { - *it_running_expr = ir_utils::replaceValInExprInputs( - running_expr, input, input_slicing->out()); - } - } + expr = ir_utils::replaceValInExprInputs(expr, tensor, slicing->out()); + if (expr->outputs().size() > 0 && expr->outputs()[0] == tensor) { + expr = + ir_utils::transferDefinitionToNewOutputs(expr, {slicing->out()}); } } + }; - // Process output tensors that 
might have stream axes - for (auto* output : ir_utils::filterByType(expr->outputs())) { - // Find if this output has a stream axis - int64_t output_stream_id_logical_index = - findStreamAxisIndex(output, for_loop->iterDomain(), id_model); - - // Skip if no stream axis found - if (output_stream_id_logical_index == -1) { - continue; - } - - // Create a sliced version of the output tensor for this stream axis - auto [output_slicing, is_new] = tensor_slicing_cache.get( - output, output_stream_id_logical_index, for_loop->index()); - if (is_new) { - new_loop_body.push_back(output_slicing); - } - - // Update all expressions that use this output to use the sliced version - for (auto it_running_expr = current_loop_body.begin(); - it_running_expr != current_loop_body.end(); - ++it_running_expr) { - Expr* running_expr = *it_running_expr; - for (auto* running_output : - ir_utils::filterByType(running_expr->outputs())) { - if (running_output == output) { - *it_running_expr = ir_utils::transferDefinitionToNewOutputs( - running_expr, {output_slicing->out()}); - } - } - } + for (auto* body_expr : for_loop->body().exprs()) { + for (auto* input : + ir_utils::filterByType(body_expr->inputs())) { + processTensor(body_expr, input); } - - // Add the original expression to the new loop body - new_loop_body.push_back(*it_expr); + for (auto* output : + ir_utils::filterByType(body_expr->outputs())) { + processTensor(body_expr, output); + } + new_loop_body.push_back(body_expr); } - // Update the for-loop body with all the processed expressions for_loop->body().clear(); for (auto* expr : new_loop_body) { for_loop->body().push_back(expr); } - new_top_level_exprs.push_back(top_level_expr); } - return new_top_level_exprs; + return top_level_exprs; } -// Step 3: Add stream management and synchronization +// Step 4: Add stream management and synchronization std::vector addStreamManagement(std::vector top_level_exprs) { - std::vector new_top_level_exprs; - // Process each top-level expression for 
(auto* top_level_expr : top_level_exprs) { // Skip non-for-loop expressions if (!top_level_expr->isA()) { - new_top_level_exprs.push_back(top_level_expr); continue; } @@ -434,10 +373,9 @@ std::vector addStreamManagement(std::vector top_level_exprs) { for (auto* expr : new_loop_body) { for_loop->body().push_back(expr); } - new_top_level_exprs.push_back(top_level_expr); } - return new_top_level_exprs; + return top_level_exprs; } } // anonymous namespace @@ -484,14 +422,13 @@ void StreamParallelType::runPass(Fusion* fusion) { // Step 1: Group expressions into stream-parallel regions std::vector top_level_exprs = - groupStreamParallelRegions(hic, id_model); + groupStreamParallelRegions(hic->topLevelExprs(), id_model); // Step 2: Add allocations for tensors that need them top_level_exprs = addTensorAllocations(std::move(top_level_exprs), id_model); // Step 3: Process for-loop bodies by slicing tensors - top_level_exprs = - processForLoopBodies(hic, id_model, std::move(top_level_exprs)); + top_level_exprs = processForLoopBodies(std::move(top_level_exprs), id_model); // Step 4: Add stream management and synchronization top_level_exprs = addStreamManagement(std::move(top_level_exprs)); From 6b479deca59efed1cc9afae42447756bb260c019 Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 24 Apr 2025 06:19:59 -0700 Subject: [PATCH 33/38] lower as HIR only set without permute --- csrc/host_ir/executor.cpp | 18 ++++++++---------- csrc/host_ir/lower.cpp | 37 +++++++++++++++++++++++++++---------- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 12cf344e549..3a3c0921d2a 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -632,21 +632,19 @@ void HostIrEvaluator::handle(LinearOp* linear) { } void HostIrEvaluator::handle(LoadStoreOp* load_store_op) { + NVF_ERROR( + load_store_op->opType() == LoadStoreOpType::Set, + "LoadStoreOp must be a Set"); NVF_ERROR( load_store_op->out()->isA(), "out 
must be a TensorView"); auto* out_tv = load_store_op->out()->as(); auto in_tensor = getKnownConcreteValue(load_store_op->in()).as(); - // If output has root domain, compute and apply permutation - if (out_tv->hasRoot()) { - auto permutation = ir_utils::computePermutation( - out_tv->getRootDomain(), out_tv->getLogicalDomain()); - NVF_ERROR( - permutation.has_value(), - "The logical domain of a Set.Permute is supposed to be a permutation of the root domain: ", - out_tv->toString()); - in_tensor = in_tensor.permute(*permutation).contiguous(); - } + // If output has root domain, it means that the set op is a permute, which we + // don't support currently + NVF_ERROR( + !out_tv->hasRoot(), "the set op ", load_store_op, " must not be a permute"); + if (!isKnown(load_store_op->out())) { bind(load_store_op->out(), in_tensor); } else { diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 308e1399872..fd14096b190 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -615,16 +615,33 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( } bool HostIrLower::isLowerableAsStandaloneHostOp(Expr* expr) { - return expr->isOneOf< - MatmulOp, - SliceOp, - SelectOp, - LinearOp, - LoadStoreOp, - BinaryOp, - ReductionOp, - Communication, - P2PCommunication>(); + if (expr->isOneOf< + MatmulOp, + SliceOp, + SelectOp, + LinearOp, + BinaryOp, + ReductionOp, + Communication, + P2PCommunication>()) { + return true; + } + + // Lower as standalone op "set" ops, i.e., LoadStoreOp of "Set" type with no + // permute + if (expr->isA()) { + auto* load_store = expr->as(); + if (load_store->opType() == LoadStoreOpType::Set && + load_store->out()->isA()) { + auto* tv = load_store->out()->as(); + // If the output tensor has no root, it means it has no permute + if (!tv->hasRoot()) { + return true; + } + } + } + + return false; } bool HostIrLower::shouldMergeSegmentedGroups( From b2a76e94c68c738043ac3251611299b720e85c02 Mon Sep 17 00:00:00 2001 From: snordmann 
Date: Thu, 24 Apr 2025 06:33:09 -0700 Subject: [PATCH 34/38] add comment --- csrc/host_ir/lower.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index c36fae09e0a..626a7e67e28 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -720,6 +720,9 @@ std::unique_ptr HostIrLower::lower( } for (auto tv : hic->allTvs()) { + // set all host tensors to global memory type. This must be the case by + // definition of a host tensor, and setting the memory type to global is + // also required to avoid Allocate HIR nodes to throw tv->setMemoryType(MemoryType::Global); } From 7f7caf5177936b7387d4e443768788584de2702a Mon Sep 17 00:00:00 2001 From: snordmann Date: Sun, 27 Apr 2025 04:17:41 -0700 Subject: [PATCH 35/38] change namespace of the optimization pass to hir --- csrc/host_ir/lower.cpp | 2 +- csrc/host_ir/pass/stream_parallel_type.cpp | 4 +-- csrc/host_ir/pass/stream_parallel_type.h | 8 +++--- csrc/python_frontend/fusion_definition.cpp | 2 +- tests/cpp/test_host_ir_stream_lowering.cpp | 30 +++++++++++----------- tests/cpp/test_multidevice_host_ir.cpp | 4 +-- 6 files changed, 25 insertions(+), 25 deletions(-) diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 227b76eb597..313970f703c 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -769,7 +769,7 @@ std::unique_ptr HostIrLower::lower( } hic->resetTopLevelExprs(new_top_level_exprs); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); return hic; diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp index 8bea4c3430f..d7bfa0f090a 100644 --- a/csrc/host_ir/pass/stream_parallel_type.cpp +++ b/csrc/host_ir/pass/stream_parallel_type.cpp @@ -18,7 +18,7 @@ #include #include -namespace nvfuser::preseg_passes { +namespace nvfuser::hir { namespace { @@ -437,4 +437,4 @@ void StreamParallelType::runPass(Fusion* fusion) { 
hic->resetTopLevelExprs(top_level_exprs); } -} // namespace nvfuser::preseg_passes +} // namespace nvfuser::hir diff --git a/csrc/host_ir/pass/stream_parallel_type.h b/csrc/host_ir/pass/stream_parallel_type.h index 9c0c39efe87..f389dbe1ff7 100644 --- a/csrc/host_ir/pass/stream_parallel_type.h +++ b/csrc/host_ir/pass/stream_parallel_type.h @@ -10,7 +10,7 @@ #include #include -namespace nvfuser::preseg_passes { +namespace nvfuser::hir { // A pass used in HostIrLower that takes a HostIrContainer as input, reads the // TensorView's ParallelType::Stream, and modify the the HostIrContainer's top @@ -22,8 +22,8 @@ namespace nvfuser::preseg_passes { // An illustration of the pass can be found in the tests // `test_host_ir_stream_lowering.cpp` // with the option `NVFUSER_DUMP=host_ir`. -class StreamParallelType : public OptimizationPass { - friend class OptimizationPass; +class StreamParallelType : public preseg_passes::OptimizationPass { + friend class preseg_passes::OptimizationPass; protected: static void runPass(Fusion* fusion); @@ -32,4 +32,4 @@ class StreamParallelType : public OptimizationPass { } }; -} // namespace nvfuser::preseg_passes +} // namespace nvfuser::hir diff --git a/csrc/python_frontend/fusion_definition.cpp b/csrc/python_frontend/fusion_definition.cpp index d6e552032b1..fd3e714f7cb 100644 --- a/csrc/python_frontend/fusion_definition.cpp +++ b/csrc/python_frontend/fusion_definition.cpp @@ -455,7 +455,7 @@ std::pair> FusionDefinition:: params.lower.communicator_backend = backend_type_; // Disable StreamParallelType pass temporarily as proper stream lowering // gets implemented - preseg_passes::OptimizationPassGuard + preseg_passes::OptimizationPassGuard guard(false); scheds->multi_device_executor = std::make_unique( std::make_unique(*scheds->preschedFusion()), diff --git a/tests/cpp/test_host_ir_stream_lowering.cpp b/tests/cpp/test_host_ir_stream_lowering.cpp index f6d74caea87..e03fccb34e0 100644 --- a/tests/cpp/test_host_ir_stream_lowering.cpp +++ 
b/tests/cpp/test_host_ir_stream_lowering.cpp @@ -36,7 +36,7 @@ TEST_F(HirLowerStreamTest, InputsAreNotStreamParallelized) { tv->axis(0)->parallelize(ParallelType::Stream); EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - preseg_passes::StreamParallelType>::runPass(hic.get())); + StreamParallelType>::runPass(hic.get())); } TEST_F(HirLowerStreamTest, Split) { @@ -51,7 +51,7 @@ TEST_F(HirLowerStreamTest, Split) { tv1->axis(0)->parallelize(ParallelType::Stream); EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - preseg_passes::StreamParallelType>::runPass(hic.get())); + StreamParallelType>::runPass(hic.get())); } TEST_F(HirLowerStreamTest, Merge) { @@ -66,7 +66,7 @@ TEST_F(HirLowerStreamTest, Merge) { tv1->axis(0)->parallelize(ParallelType::Stream); EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - preseg_passes::StreamParallelType>::runPass(hic.get())); + StreamParallelType>::runPass(hic.get())); } TEST_F(HirLowerStreamTest, SingleSetOp) { @@ -81,7 +81,7 @@ TEST_F(HirLowerStreamTest, SingleSetOp) { tv1->setMemoryType(MemoryType::Global); tv1->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); @@ -111,7 +111,7 @@ TEST_F(HirLowerStreamTest, SingleSetOpNonOutermost) { tv1->setMemoryType(MemoryType::Global); tv1->axis(1)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); @@ -144,7 +144,7 @@ TEST_F(HirLowerStreamTest, SingleBinaryOp) { tv2->setMemoryType(MemoryType::Global); tv2->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); @@ -180,7 +180,7 @@ TEST_F(HirLowerStreamTest, TwoSetOps) { tv1->axis(0)->parallelize(ParallelType::Stream); 
tv2->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 3); @@ -218,7 +218,7 @@ TEST_F(HirLowerStreamTest, ThreeSetOpsWithDisjointsForLoops) { tv1->axis(0)->parallelize(ParallelType::Stream); tv3->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 5); @@ -252,7 +252,7 @@ TEST_F(HirLowerStreamTest, ReductionUnsupported) { tv1->axis(0)->parallelize(ParallelType::Stream); EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - preseg_passes::StreamParallelType>::runPass(hic.get())); + StreamParallelType>::runPass(hic.get())); } TEST_F(HirLowerStreamTest, Reduction) { @@ -267,7 +267,7 @@ TEST_F(HirLowerStreamTest, Reduction) { tv1->setMemoryType(MemoryType::Global); tv1->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); @@ -301,7 +301,7 @@ TEST_F(HirLowerStreamTest, Matmul_M) { c->setMemoryType(MemoryType::Global); c->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); @@ -338,7 +338,7 @@ TEST_F(HirLowerStreamTest, BatchedMatmul) { c->setMemoryType(MemoryType::Global); c->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); @@ -375,7 +375,7 @@ TEST_F(HirLowerStreamTest, Matmul_N) { c->setMemoryType(MemoryType::Global); c->axis(1)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( + preseg_passes::OptimizationPass::runPass( hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); @@ -413,7 
+413,7 @@ TEST_F(HirLowerStreamTest, Matmul_K) { c->axis(-1)->parallelize(ParallelType::Stream); EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - preseg_passes::StreamParallelType>::runPass(hic.get())); + StreamParallelType>::runPass(hic.get())); } // We don's support PostOnStream because it does not support well pre-allocated @@ -461,7 +461,7 @@ TEST_F(HirLowerStreamTest, DoNotSupportPostOnStream) { output->axis(-1)->parallelize(ParallelType::Stream); EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - preseg_passes::StreamParallelType>::runPass(hic.get())); + StreamParallelType>::runPass(hic.get())); } } // namespace hir diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp index 7b233bc47db..6932a40fe5c 100644 --- a/tests/cpp/test_multidevice_host_ir.cpp +++ b/tests/cpp/test_multidevice_host_ir.cpp @@ -365,7 +365,7 @@ using OverlapDistributedMatmulTest = MultiDeviceTest; TEST_F(OverlapDistributedMatmulTest, AG_matmul) { // Disable StreamParallelType pass temporarily as proper stream lowering gets // implemented - preseg_passes::OptimizationPassGuard guard( + preseg_passes::OptimizationPassGuard guard( false); constexpr int64_t M = 32768; @@ -424,7 +424,7 @@ TEST_F(OverlapDistributedMatmulTest, AG_matmul) { TEST_F(OverlapDistributedMatmulTest, AG_linear) { // Disable StreamParallelType pass tempor - preseg_passes::OptimizationPassGuard guard( + preseg_passes::OptimizationPassGuard guard( false); constexpr int64_t M = 32768; From 7777fe0038c0c1568f5f8abc91b20618bb360e63 Mon Sep 17 00:00:00 2001 From: snordmann Date: Sun, 27 Apr 2025 07:27:05 -0700 Subject: [PATCH 36/38] lint --- csrc/host_ir/executor.cpp | 2 +- csrc/host_ir/lower.cpp | 3 +- csrc/host_ir/pass/stream_parallel_type.h | 3 +- python/python_frontend/fusion_definition.cpp | 4 +- tests/cpp/test_host_ir_stream_lowering.cpp | 51 ++++++++------------ tests/cpp/test_multidevice_host_ir.cpp | 6 +-- 6 files changed, 29 insertions(+), 40 deletions(-) diff --git 
a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 96a462a9444..e089fec32cf 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -771,7 +771,7 @@ void HostIrEvaluator::handle(ReductionOp* reduction_op) { } } switch (reduction_op->getReductionOpType()) { - case BinaryOpType::Add: + case BinaryOpType::Add: at::sum_out(output, input, reduction_axes); return; case BinaryOpType::Max: diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 313970f703c..ca9bb80ae4e 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -769,8 +769,7 @@ std::unique_ptr HostIrLower::lower( } hic->resetTopLevelExprs(new_top_level_exprs); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); return hic; } diff --git a/csrc/host_ir/pass/stream_parallel_type.h b/csrc/host_ir/pass/stream_parallel_type.h index f389dbe1ff7..8b5f138ad7e 100644 --- a/csrc/host_ir/pass/stream_parallel_type.h +++ b/csrc/host_ir/pass/stream_parallel_type.h @@ -22,7 +22,8 @@ namespace nvfuser::hir { // An illustration of the pass can be found in the tests // `test_host_ir_stream_lowering.cpp` // with the option `NVFUSER_DUMP=host_ir`. 
-class StreamParallelType : public preseg_passes::OptimizationPass { +class StreamParallelType + : public preseg_passes::OptimizationPass { friend class preseg_passes::OptimizationPass; protected: diff --git a/python/python_frontend/fusion_definition.cpp b/python/python_frontend/fusion_definition.cpp index fd3e714f7cb..b77947f1415 100644 --- a/python/python_frontend/fusion_definition.cpp +++ b/python/python_frontend/fusion_definition.cpp @@ -455,8 +455,8 @@ std::pair> FusionDefinition:: params.lower.communicator_backend = backend_type_; // Disable StreamParallelType pass temporarily as proper stream lowering // gets implemented - preseg_passes::OptimizationPassGuard - guard(false); + preseg_passes::OptimizationPassGuard guard( + false); scheds->multi_device_executor = std::make_unique( std::make_unique(*scheds->preschedFusion()), Communicator::getInstance(), diff --git a/tests/cpp/test_host_ir_stream_lowering.cpp b/tests/cpp/test_host_ir_stream_lowering.cpp index e03fccb34e0..b77df002bc6 100644 --- a/tests/cpp/test_host_ir_stream_lowering.cpp +++ b/tests/cpp/test_host_ir_stream_lowering.cpp @@ -35,8 +35,8 @@ TEST_F(HirLowerStreamTest, InputsAreNotStreamParallelized) { hic->addInput(tv); tv->axis(0)->parallelize(ParallelType::Stream); - EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - StreamParallelType>::runPass(hic.get())); + EXPECT_ANY_THROW( + preseg_passes::OptimizationPass::runPass(hic.get())); } TEST_F(HirLowerStreamTest, Split) { @@ -50,8 +50,8 @@ TEST_F(HirLowerStreamTest, Split) { tv1->split(0, 2); tv1->axis(0)->parallelize(ParallelType::Stream); - EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - StreamParallelType>::runPass(hic.get())); + EXPECT_ANY_THROW( + preseg_passes::OptimizationPass::runPass(hic.get())); } TEST_F(HirLowerStreamTest, Merge) { @@ -65,8 +65,8 @@ TEST_F(HirLowerStreamTest, Merge) { tv1->merge(0, 1); tv1->axis(0)->parallelize(ParallelType::Stream); - EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - 
StreamParallelType>::runPass(hic.get())); + EXPECT_ANY_THROW( + preseg_passes::OptimizationPass::runPass(hic.get())); } TEST_F(HirLowerStreamTest, SingleSetOp) { @@ -81,8 +81,7 @@ TEST_F(HirLowerStreamTest, SingleSetOp) { tv1->setMemoryType(MemoryType::Global); tv1->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -111,8 +110,7 @@ TEST_F(HirLowerStreamTest, SingleSetOpNonOutermost) { tv1->setMemoryType(MemoryType::Global); tv1->axis(1)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -144,8 +142,7 @@ TEST_F(HirLowerStreamTest, SingleBinaryOp) { tv2->setMemoryType(MemoryType::Global); tv2->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -180,8 +177,7 @@ TEST_F(HirLowerStreamTest, TwoSetOps) { tv1->axis(0)->parallelize(ParallelType::Stream); tv2->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 3); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -218,8 +214,7 @@ TEST_F(HirLowerStreamTest, ThreeSetOpsWithDisjointsForLoops) { tv1->axis(0)->parallelize(ParallelType::Stream); tv3->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 5); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -251,8 
+246,8 @@ TEST_F(HirLowerStreamTest, ReductionUnsupported) { tv1->setMemoryType(MemoryType::Global); tv1->axis(0)->parallelize(ParallelType::Stream); - EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - StreamParallelType>::runPass(hic.get())); + EXPECT_ANY_THROW( + preseg_passes::OptimizationPass::runPass(hic.get())); } TEST_F(HirLowerStreamTest, Reduction) { @@ -267,8 +262,7 @@ TEST_F(HirLowerStreamTest, Reduction) { tv1->setMemoryType(MemoryType::Global); tv1->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -301,8 +295,7 @@ TEST_F(HirLowerStreamTest, Matmul_M) { c->setMemoryType(MemoryType::Global); c->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -338,8 +331,7 @@ TEST_F(HirLowerStreamTest, BatchedMatmul) { c->setMemoryType(MemoryType::Global); c->axis(0)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -375,8 +367,7 @@ TEST_F(HirLowerStreamTest, Matmul_N) { c->setMemoryType(MemoryType::Global); c->axis(1)->parallelize(ParallelType::Stream); - preseg_passes::OptimizationPass::runPass( - hic.get()); + preseg_passes::OptimizationPass::runPass(hic.get()); EXPECT_EQ(hic->topLevelExprs().size(), 2); EXPECT_TRUE(hic->topLevelExprs().at(0)->isA()); @@ -412,8 +403,8 @@ TEST_F(HirLowerStreamTest, Matmul_K) { c->setMemoryType(MemoryType::Global); c->axis(-1)->parallelize(ParallelType::Stream); - EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - 
StreamParallelType>::runPass(hic.get())); + EXPECT_ANY_THROW( + preseg_passes::OptimizationPass::runPass(hic.get())); } // We don's support PostOnStream because it does not support well pre-allocated @@ -460,8 +451,8 @@ TEST_F(HirLowerStreamTest, DoNotSupportPostOnStream) { output->axis(-1)->parallelize(ParallelType::Stream); - EXPECT_ANY_THROW(preseg_passes::OptimizationPass< - StreamParallelType>::runPass(hic.get())); + EXPECT_ANY_THROW( + preseg_passes::OptimizationPass::runPass(hic.get())); } } // namespace hir diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp index 6932a40fe5c..db53f7f114d 100644 --- a/tests/cpp/test_multidevice_host_ir.cpp +++ b/tests/cpp/test_multidevice_host_ir.cpp @@ -365,8 +365,7 @@ using OverlapDistributedMatmulTest = MultiDeviceTest; TEST_F(OverlapDistributedMatmulTest, AG_matmul) { // Disable StreamParallelType pass temporarily as proper stream lowering gets // implemented - preseg_passes::OptimizationPassGuard guard( - false); + preseg_passes::OptimizationPassGuard guard(false); constexpr int64_t M = 32768; constexpr int64_t K = 32768; @@ -424,8 +423,7 @@ TEST_F(OverlapDistributedMatmulTest, AG_matmul) { TEST_F(OverlapDistributedMatmulTest, AG_linear) { // Disable StreamParallelType pass tempor - preseg_passes::OptimizationPassGuard guard( - false); + preseg_passes::OptimizationPassGuard guard(false); constexpr int64_t M = 32768; constexpr int64_t K = 32768; From e517bc31fad5cd39f4d84e5d813f469f6ba90a26 Mon Sep 17 00:00:00 2001 From: snordmann Date: Sun, 27 Apr 2025 07:36:08 -0700 Subject: [PATCH 37/38] fix merge --- csrc/host_ir/executor.cpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index e089fec32cf..2f2cf9e7b92 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -705,8 +705,6 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { bind(tv, tensor); } -<<<<<<< HEAD -======= void 
HostIrEvaluator::handle(HirAliasSelect* hir_alias_select) { auto index = expr_evaluator_.evaluate(hir_alias_select->index()).as(); @@ -716,7 +714,6 @@ void HostIrEvaluator::handle(HirAliasSelect* hir_alias_select) { bind(hir_alias_select->out(), input.select(axis, index)); } ->>>>>>> bfc7ba836400aa349fab473fa04bab204e9c5601 void HostIrEvaluator::handle(BinaryOp* binary_op) { if (!isKnown(binary_op->outputs().at(0))) { return unhandled(binary_op); @@ -789,18 +786,6 @@ void HostIrEvaluator::handle(ReductionOp* reduction_op) { } } -<<<<<<< HEAD -void HostIrEvaluator::handle(HirAliasSelect* hir_alias_select) { - auto index = - expr_evaluator_.evaluate(hir_alias_select->index()).as(); - auto input = getKnownConcreteValue(hir_alias_select->in()->as()) - .as(); - int64_t axis = hir_alias_select->axis(); - bind(hir_alias_select->out(), input.select(axis, index)); -} - -======= ->>>>>>> bfc7ba836400aa349fab473fa04bab204e9c5601 void HostIrEvaluator::unhandled(Statement* stmt) { NVF_ERROR(stmt->isA(), stmt, " must be an Expr"); auto* expr = stmt->as(); From 35ff4dab62b187e3842086ff20a223af267c1a50 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 28 Apr 2025 09:58:44 +0300 Subject: [PATCH 38/38] empty commit to trigger the CI