NVIDIA · samnordmann · Apr 16, 2025 · Mar 26, 2025 · Mar 26, 2025 · Apr 11, 2025
diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
@@ -3881,15 +3881,12 @@ bool SegmentCandidateFinder::codeGenSupportedMerge(
   NVF_ERROR(
       areDirectlyConnected(group1, group2),
       "only support testing immediate producer-consumer groups");
-  if (options_.only_segment_resharding_exprs) {
-    for (auto group : {group1, group2}) {
-      for (auto expr : group->exprs()) {
-        if (isResharding(expr)) {
-          return false;
-        }
-      }
-    }
-    return true;
+  // The segmenter should ideally be redesigned to be more flexible and
+  // decoupled from the schedulers, but for now, when a custom should-merge
+  // function is provided, its result is returned directly and the
+  // scheduler-based `tryMerge` check below is skipped.
+  if (options_.custom_should_merge_groups != nullptr) {
+    return (options_.custom_should_merge_groups)(group1, group2);
   }
   return tryMerge(segmented_fusion_.get(), runtimeInfo(), group1, group2) !=
       SchedulerType::None;
@@ -3900,7 +3897,7 @@ bool SegmentCandidateFinder::codeGenSupportedMerge(
 SchedulerType SegmentCandidateFinder::deriveSchedulerType(
     SegmentedGroup* group) {
   FUSER_PERF_SCOPE("SegmentCandidateFinder::deriveSchedulerType");
-  if (options_.only_segment_resharding_exprs) {
+  if (options_.custom_should_merge_groups != nullptr) {
     // We don't need to generate a SchedulerType for multidevice segments at
     // this moment
     return SchedulerType::None;
@@ -3920,7 +3917,7 @@ SegmentCandidateFinder::SegmentCandidateFinder(
     : options_(options), runtime_inputs_(inputs) {
   FUSER_PERF_SCOPE("SegmentCandidateFinder::SegmentCandidateFinder");
   NVF_ERROR(
-      !options_.only_segment_resharding_exprs ||
+      options_.custom_should_merge_groups == nullptr ||
           (!options_.run_translate_welford &&
            !options_.run_combine_reductions && options_.run_herrmann_merge &&
            options_.run_final_merge),

diff --git a/csrc/fusion_segmenter.h b/csrc/fusion_segmenter.h
@@ -19,6 +19,7 @@
 #include <visibility.h>
 
 #include <deque>
+#include <functional>
 #include <list>
 #include <unordered_set>
 #include <vector>
@@ -482,7 +483,12 @@ struct SegmentCandidateFinderOptions {
   bool run_combine_reductions = true;
   bool run_herrmann_merge = true;
   bool run_final_merge = true;
-  bool only_segment_resharding_exprs = false;
+  // If provided, this custom function is used to determine whether two groups
+  // should be merged. If not provided, the tryMerge function is used. This
+  // option is used in the multi-GPU context, where a first segmentation pass
+  // separates communications from compute.
+  std::function<bool(SegmentedGroup*, SegmentedGroup*)>
+      custom_should_merge_groups = nullptr;
 };
 
 //!  SegmentCandidateFinder

diff --git a/csrc/host_ir/container.cpp b/csrc/host_ir/container.cpp
@@ -26,7 +26,7 @@ HostIrContainer::HostIrContainer(int64_t num_kernel_executors)
 HostIrContainer::~HostIrContainer() = default;
 
 Stream* HostIrContainer::getDefaultStream() {
-  if (!default_stream_) {
+  if (default_stream_ == nullptr) {
     default_stream_ = IrBuilder::createInContainer<Stream>(this);
   }
   return default_stream_;
@@ -35,6 +35,11 @@ Stream* HostIrContainer::getDefaultStream() {
 std::ostream& HostIrContainer::print(std::ostream& os) const {
   IrMathPrinter op_exprs(os);
   op_exprs.handle(this);
+  os << "Aliases:{"; // One "alias -> original" entry per line follows.
+  for (const auto& [alias_val, original_val] : alias_) {
+    os << "\n  " << alias_val << " -> " << original_val;
+  }
+  os << "\n}\n";
   return os;
 }
 

diff --git a/csrc/host_ir/container.h b/csrc/host_ir/container.h
@@ -41,6 +41,10 @@ class HostIrContainer final : public Fusion {
   //! Print to an output stream
   std::ostream& print(std::ostream& os) const;
 
+  void resetTopLevelExprs(std::vector<Expr*> exprs) {
+    top_level_exprs_ = std::move(exprs); // Discards the previous list.
+  }
+
   const std::vector<Expr*>& topLevelExprs() const;
 
   void pushBackTopLevelExprs(Expr* expr);
@@ -55,10 +59,33 @@ class HostIrContainer final : public Fusion {
 
   Stream* getDefaultStream();
 
+  //! Mark `new_alias` as an alias of `original`. `original` is first
+  //! resolved through the entries already in the table, so the stored value
+  //! is the end of the existing alias chain at the time of the call.
+  void markAlias(TensorView* original, const TensorView* new_alias) {
+    auto it = alias_.find(original);
+    while (it != alias_.end()) {
+      original = it->second->as<TensorView>();
+      it = alias_.find(original);
+    }
+    if (original == new_alias) {
+      // A self-referencing entry would make the resolution loop above spin
+      // forever on any later call that reaches this tensor, so skip it.
+      return;
+    }
+    alias_[new_alias] = original;
+  }
+
+  //! Read-only access to the alias table (alias -> resolved original).
+  const auto& alias() const {
+    return alias_;
+  }
+
  private:
   std::vector<Expr*> top_level_exprs_;
   std::vector<std::unique_ptr<KernelExecutor>> kernel_executors_;
   Stream* default_stream_ = nullptr;
+  std::unordered_map<const Val*, Val*> alias_;
 };
 
 } // namespace hir