From faa6756b6415f55657b698011996f67ee97fa1bf Mon Sep 17 00:00:00 2001 From: mcowan Date: Fri, 14 Jun 2024 16:40:53 +0000 Subject: [PATCH 1/2] add method to access fusion executor caches in multidevice executor --- csrc/multidevice/executor.cpp | 20 +++++++++++++++++--- csrc/multidevice/executor.h | 8 +++++++- tests/cpp/test_multidevice_matmul.cpp | 12 ++++++++++++ 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/csrc/multidevice/executor.cpp b/csrc/multidevice/executor.cpp index c27b8abb16c..04373c98c01 100644 --- a/csrc/multidevice/executor.cpp +++ b/csrc/multidevice/executor.cpp @@ -79,7 +79,7 @@ MultiDeviceExecutor::MultiDeviceExecutor( should_run_[group] = involvedDevices(expr).count(comm_.deviceId()); } // prepare the order in which to launch the kernels/comms - prepareRuntimeOrder(staged_fusion_.get(), workspace); + prepareRuntimeOrder(staged_fusion_.get(), workspace_); // Allocator setup // vals_to_allocate_ stores the tensors that need to be allocated at runtime, @@ -217,7 +217,7 @@ std::vector MultiDeviceExecutor::runWithInput( } // Run through the groups to launch kernels and comms - for (auto group : workspace.group_run_order) { + for (auto group : workspace_.group_run_order) { if (!is_resharding_.at(group)) { postKernel(group, launch_params); } else { @@ -261,7 +261,7 @@ std::string MultiDeviceExecutor::validate() const { std::ostream& MultiDeviceExecutor::print() { int compute_segment_counter = 0; int communication_counter = 0; - for (auto group : workspace.group_run_order) { + for (auto group : workspace_.group_run_order) { if (is_resharding_[group]) { debug() << "Communication " << communication_counter << ": " << group->exprs().at(0) << "\n"; @@ -277,4 +277,18 @@ std::ostream& MultiDeviceExecutor::print() { return debug(); } +std::vector MultiDeviceExecutor:: + getFusionExecutorCaches() { + NVF_CHECK( + params_.use_fusion_executor_cache, + "MultideviceExecutor must be configured to use FusionExecutorCache"); + std::vector fecs; + for (SegmentedGroup* group : workspace_.group_run_order) { + if (!is_resharding_.at(group)) { + fecs.push_back(&(fec_.at(group))); + } + } + return fecs; +} + } // namespace nvfuser diff --git a/csrc/multidevice/executor.h b/csrc/multidevice/executor.h index 9c277464dd2..42f61d3eee3 100644 --- a/csrc/multidevice/executor.h +++ b/csrc/multidevice/executor.h @@ -110,6 +110,12 @@ class MultiDeviceExecutor { //! Print to default debugging output stream std::ostream& print(); + // Returns a vector of Fusion executor caches that corresponds to + // each compute segment in runtime order.This is only valid if the executor + // was configured to use FusionExecutorCache. i.e. + // params.use_fusion_executor_cache = true + std::vector getFusionExecutorCaches(); + private: // execute locally a SegmentedGroup that does not involve inter-device // communication. Launch Params are used only if @@ -129,7 +135,7 @@ class MultiDeviceExecutor { // 2) a Fusion comprised of one Expr, representing inter-device communication std::unique_ptr staged_fusion_; // Stores the order in which the pipeline's stage should be executed - RuntimeWorkSpace workspace; + RuntimeWorkSpace workspace_; // Cache Fusions, FusionExecutors, and Communications std::unordered_map fe_; std::unordered_map fec_; diff --git a/tests/cpp/test_multidevice_matmul.cpp b/tests/cpp/test_multidevice_matmul.cpp index 607e784e199..5979e424c2e 100644 --- a/tests/cpp/test_multidevice_matmul.cpp +++ b/tests/cpp/test_multidevice_matmul.cpp @@ -122,6 +122,9 @@ TEST_F(DistributedMatmulTest, LayoutTN_NoComms) { {expected_output}, __LINE__, __FILE__); + + std::vector fecs = runtime.getFusionExecutorCaches(); + EXPECT_EQ(fecs.size(), 1); } TEST_F(DistributedMatmulTest, LayoutTN_Allgather) { @@ -177,6 +180,9 @@ TEST_F(DistributedMatmulTest, LayoutTN_Allgather) { {expected_output}, __LINE__, __FILE__); + + std::vector fecs = runtime.getFusionExecutorCaches(); + EXPECT_EQ(fecs.size(), 1); } TEST_F(DistributedMatmulTest, LayoutNT_AllReduce) { @@ -228,6 +234,9 @@ TEST_F(DistributedMatmulTest, LayoutNT_AllReduce) { testValidate( runtime.completeFusion(), outputs, inputs, {out}, __LINE__, __FILE__); + + std::vector fecs = runtime.getFusionExecutorCaches(); + EXPECT_EQ(fecs.size(), 1); } TEST_F(DistributedMatmulTest, LayoutNT_ReduceScatter) { @@ -292,5 +301,8 @@ TEST_F(DistributedMatmulTest, LayoutNT_ReduceScatter) { {expected_output}, __LINE__, __FILE__); + + std::vector fecs = runtime.getFusionExecutorCaches(); + EXPECT_EQ(fecs.size(), 1); } } // namespace nvfuser From dae37e47381bf7b3e3b1bb23d7a553d7fb996fa5 Mon Sep 17 00:00:00 2001 From: mcowan Date: Fri, 14 Jun 2024 19:17:08 +0000 Subject: [PATCH 2/2] feedback --- csrc/multidevice/executor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/multidevice/executor.cpp b/csrc/multidevice/executor.cpp index 04373c98c01..8fa8223c783 100644 --- a/csrc/multidevice/executor.cpp +++ b/csrc/multidevice/executor.cpp @@ -284,7 +284,7 @@ std::vector MultiDeviceExecutor:: "MultideviceExecutor must be configured to use FusionExecutorCache"); std::vector fecs; for (SegmentedGroup* group : workspace_.group_run_order) { - if (!is_resharding_.at(group)) { + if (fec_.count(group) > 0) { fecs.push_back(&(fec_.at(group))); } }