From f1d2a88cfb03d6503b1b972dd278f7fbfafe15ff Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 24 Apr 2025 06:36:27 -0700 Subject: [PATCH 1/4] add barrier at the end of IpcHandleCache::exchangeHandles --- csrc/multidevice/ipc_handle.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/csrc/multidevice/ipc_handle.cpp b/csrc/multidevice/ipc_handle.cpp index 6bb700dc2de..9a5ec4286b8 100644 --- a/csrc/multidevice/ipc_handle.cpp +++ b/csrc/multidevice/ipc_handle.cpp @@ -151,6 +151,12 @@ void IpcHandleCache::exchangeHandles( insert(communication, std::move(ipc_handles)); } + + // A second barrier is needed here to ensure all ranks have received the + // memhandles and the keys are deleted from the store before the next call to + // exchangeHandles. + // TODO: precisely select which ranks need to wait on that barrier. + communicator->barrier(); } } // namespace nvfuser From cebfa873c8cb1383af0edd587a592688ae652f3a Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 24 Apr 2025 08:11:42 -0700 Subject: [PATCH 2/4] reenable P2PCommunicationTest.CudaComm test --- tests/cpp/test_multidevice_communications.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp index 1b6ce59801c..af0c0719aa7 100644 --- a/tests/cpp/test_multidevice_communications.cpp +++ b/tests/cpp/test_multidevice_communications.cpp @@ -417,7 +417,7 @@ INSTANTIATE_TEST_SUITE_P( using P2PCommunicationTest = MultiDeviceTest; -TEST_F(P2PCommunicationTest, DISABLED_CudaComm) { +TEST_F(P2PCommunicationTest, CudaComm) { static constexpr int kTensorSize = 8; static constexpr int kNumRepetitions = 32; From 01926f4104ca892d714015b7afa34a5b7fd00e09 Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 24 Apr 2025 08:57:17 -0700 Subject: [PATCH 3/4] empty commit to trigger CI From 8ce05dd205cd2ba9202027c8c4fd07fa72ab0927 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 25 Apr 2025 00:23:28 -0700 Subject: 
[PATCH 4/4] reenable MultiDeviceTest.ShareIpcMemHandles --- tests/cpp/test_multidevice_host_ir.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp index 0b6efbd15a4..88286d6e4c0 100644 --- a/tests/cpp/test_multidevice_host_ir.cpp +++ b/tests/cpp/test_multidevice_host_ir.cpp @@ -478,7 +478,7 @@ TEST_F(OverlapDistributedMatmulTest, AG_linear) { EXPECT_TRUE(torch::allclose(out_ref, out_at, 1e-1, 1e-1)); } -TEST_F(MultiDeviceTest, DISABLED_ShareIpcMemHandles) { +TEST_F(MultiDeviceTest, ShareIpcMemHandles) { static constexpr int kTensorSize = 4; static constexpr int kNumRepetitions = 10;