From faa6756b6415f55657b698011996f67ee97fa1bf Mon Sep 17 00:00:00 2001
From: mcowan <mcowan@nvidia.com>
Date: Fri, 14 Jun 2024 16:40:53 +0000
Subject: [PATCH 1/2] add method to access fusion executor caches in
 multidevice executor

---
 csrc/multidevice/executor.cpp         | 20 +++++++++++++++++---
 csrc/multidevice/executor.h           |  8 +++++++-
 tests/cpp/test_multidevice_matmul.cpp | 12 ++++++++++++
 3 files changed, 36 insertions(+), 4 deletions(-)
diff --git a/csrc/multidevice/executor.cpp b/csrc/multidevice/executor.cpp
index c27b8abb16c..04373c98c01 100644
--- a/csrc/multidevice/executor.cpp
+++ b/csrc/multidevice/executor.cpp
@@ -79,7 +79,7 @@ MultiDeviceExecutor::MultiDeviceExecutor(
     should_run_[group] = involvedDevices(expr).count(comm_.deviceId());
   }
   // prepare the order in which to launch the kernels/comms
-  prepareRuntimeOrder(staged_fusion_.get(), workspace);
+  prepareRuntimeOrder(staged_fusion_.get(), workspace_);
 
   // Allocator setup
   // vals_to_allocate_ stores the tensors that need to be allocated at runtime,
@@ -217,7 +217,7 @@ std::vector<at::Tensor> MultiDeviceExecutor::runWithInput(
   }
 
   // Run through the groups to launch kernels and comms
-  for (auto group : workspace.group_run_order) {
+  for (auto group : workspace_.group_run_order) {
     if (!is_resharding_.at(group)) {
       postKernel(group, launch_params);
     } else {
@@ -261,7 +261,7 @@ std::string MultiDeviceExecutor::validate() const {
 std::ostream& MultiDeviceExecutor::print() {
   int compute_segment_counter = 0;
   int communication_counter = 0;
-  for (auto group : workspace.group_run_order) {
+  for (auto group : workspace_.group_run_order) {
     if (is_resharding_[group]) {
       debug() << "Communication " << communication_counter << ": "
               << group->exprs().at(0) << "\n";
@@ -277,4 +277,18 @@ std::ostream& MultiDeviceExecutor::print() {
   return debug();
 }
 
+std::vector<FusionExecutorCache*> MultiDeviceExecutor::
+    getFusionExecutorCaches() {
+  NVF_CHECK(
+      params_.use_fusion_executor_cache,
+      "MultideviceExecutor must be configured to use FusionExecutorCache");
+  std::vector<FusionExecutorCache*> fecs;
+  for (SegmentedGroup* group : workspace_.group_run_order) {
+    if (!is_resharding_.at(group)) {
+      fecs.push_back(&(fec_.at(group)));
+    }
+  }
+  return fecs;
+}
+
 } // namespace nvfuser
diff --git a/csrc/multidevice/executor.h b/csrc/multidevice/executor.h
index 9c277464dd2..42f61d3eee3 100644
--- a/csrc/multidevice/executor.h
+++ b/csrc/multidevice/executor.h
@@ -110,6 +110,12 @@ class MultiDeviceExecutor {
   //! Print to default debugging output stream
   std::ostream& print();
 
+  // Returns a vector of Fusion executor caches that corresponds to
+  // each compute segment in runtime order.This is only valid if the executor
+  // was configured to use FusionExecutorCache. i.e.
+  // params.use_fusion_executor_cache = true
+  std::vector<FusionExecutorCache*> getFusionExecutorCaches();
+
  private:
   // execute locally a SegmentedGroup that does not involve inter-device
   // communication. Launch Params are used only if
@@ -129,7 +135,7 @@ class MultiDeviceExecutor {
   // 2) a Fusion comprised of one Expr, representing inter-device communication
   std::unique_ptr<SegmentedFusion> staged_fusion_;
   // Stores the order in which the pipeline's stage should be executed
-  RuntimeWorkSpace workspace;
+  RuntimeWorkSpace workspace_;
   // Cache Fusions, FusionExecutors, and Communications
   std::unordered_map<SegmentedGroup*, FusionExecutor> fe_;
   std::unordered_map<SegmentedGroup*, FusionExecutorCache> fec_;
diff --git a/tests/cpp/test_multidevice_matmul.cpp b/tests/cpp/test_multidevice_matmul.cpp
index 607e784e199..5979e424c2e 100644
--- a/tests/cpp/test_multidevice_matmul.cpp
+++ b/tests/cpp/test_multidevice_matmul.cpp
@@ -122,6 +122,9 @@ TEST_F(DistributedMatmulTest, LayoutTN_NoComms) {
       {expected_output},
       __LINE__,
       __FILE__);
+
+  std::vector<FusionExecutorCache*> fecs = runtime.getFusionExecutorCaches();
+  EXPECT_EQ(fecs.size(), 1);
 }
 
 TEST_F(DistributedMatmulTest, LayoutTN_Allgather) {
@@ -177,6 +180,9 @@ TEST_F(DistributedMatmulTest, LayoutTN_Allgather) {
       {expected_output},
       __LINE__,
       __FILE__);
+
+  std::vector<FusionExecutorCache*> fecs = runtime.getFusionExecutorCaches();
+  EXPECT_EQ(fecs.size(), 1);
 }
 
 TEST_F(DistributedMatmulTest, LayoutNT_AllReduce) {
@@ -228,6 +234,9 @@ TEST_F(DistributedMatmulTest, LayoutNT_AllReduce) {
 
   testValidate(
       runtime.completeFusion(), outputs, inputs, {out}, __LINE__, __FILE__);
+
+  std::vector<FusionExecutorCache*> fecs = runtime.getFusionExecutorCaches();
+  EXPECT_EQ(fecs.size(), 1);
 }
 
 TEST_F(DistributedMatmulTest, LayoutNT_ReduceScatter) {
@@ -292,5 +301,8 @@ TEST_F(DistributedMatmulTest, LayoutNT_ReduceScatter) {
       {expected_output},
       __LINE__,
       __FILE__);
+
+  std::vector<FusionExecutorCache*> fecs = runtime.getFusionExecutorCaches();
+  EXPECT_EQ(fecs.size(), 1);
 }
 } // namespace nvfuser

From dae37e47381bf7b3e3b1bb23d7a553d7fb996fa5 Mon Sep 17 00:00:00 2001
From: mcowan <mcowan@nvidia.com>
Date: Fri, 14 Jun 2024 19:17:08 +0000
Subject: [PATCH 2/2] feedback

---
 csrc/multidevice/executor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/multidevice/executor.cpp b/csrc/multidevice/executor.cpp
index 04373c98c01..8fa8223c783 100644
--- a/csrc/multidevice/executor.cpp
+++ b/csrc/multidevice/executor.cpp
@@ -284,7 +284,7 @@ std::vector<FusionExecutorCache*> MultiDeviceExecutor::
       "MultideviceExecutor must be configured to use FusionExecutorCache");
   std::vector<FusionExecutorCache*> fecs;
   for (SegmentedGroup* group : workspace_.group_run_order) {
-    if (!is_resharding_.at(group)) {
+    if (fec_.count(group) > 0) {
       fecs.push_back(&(fec_.at(group)));
     }
   }