From 63e78a68ed56526a2ea05eacfecd719b764458a3 Mon Sep 17 00:00:00 2001 From: valarLip <340077269@qq.com> Date: Wed, 17 Dec 2025 20:32:52 +0800 Subject: [PATCH] add guard in case pynccl init failed --- .../dist/device_communicators/communicator_cuda.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/aiter/dist/device_communicators/communicator_cuda.py b/aiter/dist/device_communicators/communicator_cuda.py index fcc7ee05b2..60ff59ca57 100644 --- a/aiter/dist/device_communicators/communicator_cuda.py +++ b/aiter/dist/device_communicators/communicator_cuda.py @@ -49,10 +49,16 @@ def __init__( PyNcclCommunicator, ) - self.pynccl_comm = PyNcclCommunicator( - group=self.cpu_group, - device=self.device, - ) + try: + self.pynccl_comm = PyNcclCommunicator( + group=self.cpu_group, + device=self.device, + ) + except Exception as e: + logger.warning( + f"Failed to initialize PyNcclCommunicator for group " + f"{self.unique_name}. Exception: {e}" + ) # if is_symmetric_memory_enabled(): # register_nccl_symmetric_ops(self.pynccl_comm)