From 5388e5db719e0d15cafe32eaa8dca2c906201dee Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 20 Feb 2025 09:58:08 -0800 Subject: [PATCH 1/4] Handle special case of libuv for Windows Signed-off-by: Logan Adams --- deepspeed/comm/torch.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index efa0640fb87b..995ffdb0e227 100755 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -145,11 +145,15 @@ def has_reduce_scatter_tensor(self): def init_process_group(self, backend, timeout, init_method, rank, world_size): if not torch.distributed.is_initialized(): + # Windows torch builds do not come with lib_uv by default. + # More information here: https://pytorch.org/tutorials/intermediate/TCPStore_libuv_backend.html + use_lib_uv = False if os.name == "nt" and required_torch_version(min_version=2.4) else True torch.distributed.init_process_group(backend, timeout=timeout, init_method=init_method, rank=rank, - world_size=world_size) + world_size=world_size, + use_libuv=use_libuv) self.using_mpi = torch.distributed.get_backend() == 'mpi' @disable_compiler_collective From cad017417924c8b17528858486e49d48e3953f0c Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 20 Feb 2025 10:16:27 -0800 Subject: [PATCH 2/4] typo in variable name Signed-off-by: Logan Adams --- deepspeed/comm/torch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index 995ffdb0e227..f36faf14fa36 100755 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -147,7 +147,7 @@ def init_process_group(self, backend, timeout, init_method, rank, world_size): if not torch.distributed.is_initialized(): # Windows torch builds do not come with lib_uv by default. 
# More information here: https://pytorch.org/tutorials/intermediate/TCPStore_libuv_backend.html - use_lib_uv = False if os.name == "nt" and required_torch_version(min_version=2.4) else True + use_libuv = False if os.name == "nt" and required_torch_version(min_version=2.4) else True torch.distributed.init_process_group(backend, timeout=timeout, init_method=init_method, From 5778af9190e21508f9d92717559504e8c4a58a4e Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 20 Feb 2025 10:37:55 -0800 Subject: [PATCH 3/4] use_libuv is only added in torch 2.4+ Signed-off-by: Logan Adams --- deepspeed/comm/torch.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index f36faf14fa36..9121bfef5dc0 100755 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -145,15 +145,22 @@ def has_reduce_scatter_tensor(self): def init_process_group(self, backend, timeout, init_method, rank, world_size): if not torch.distributed.is_initialized(): - # Windows torch builds do not come with lib_uv by default. - # More information here: https://pytorch.org/tutorials/intermediate/TCPStore_libuv_backend.html - use_libuv = False if os.name == "nt" and required_torch_version(min_version=2.4) else True - torch.distributed.init_process_group(backend, - timeout=timeout, - init_method=init_method, - rank=rank, - world_size=world_size, - use_libuv=use_libuv) + if required_torch_version(min_version=2.4): + # Windows torch builds do not come with lib_uv by default. 
+ # More information here: https://pytorch.org/tutorials/intermediate/TCPStore_libuv_backend.html + use_libuv = False if os.name == "nt" else True + torch.distributed.init_process_group(backend, + timeout=timeout, + init_method=init_method, + rank=rank, + world_size=world_size, + use_libuv=use_libuv) + else: + torch.distributed.init_process_group(backend, + timeout=timeout, + init_method=init_method, + rank=rank, + world_size=world_size) self.using_mpi = torch.distributed.get_backend() == 'mpi' @disable_compiler_collective From 1ffdc44f0e7c27dc8438a715d5e2e6e51da368c5 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 20 Feb 2025 11:12:19 -0800 Subject: [PATCH 4/4] missed not in if statement --- deepspeed/comm/torch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index 9121bfef5dc0..1146832d7655 100755 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -145,7 +145,7 @@ def has_reduce_scatter_tensor(self): def init_process_group(self, backend, timeout, init_method, rank, world_size): if not torch.distributed.is_initialized(): - if required_torch_version(min_version=2.4): + if not required_torch_version(min_version=2.4): # Windows torch builds do not come with lib_uv by default. # More information here: https://pytorch.org/tutorials/intermediate/TCPStore_libuv_backend.html use_libuv = False if os.name == "nt" else True