Describe the feature
I've been analyzing the code related to P2P communication, where tensors are converted to byte streams. This potentially involves transferring data from CUDA to CPU (numpy), followed by transmission using c10d.broadcast. This implementation appears reminiscent of an early version of DeepSpeed.
In contrast, modern implementations in DeepSpeed and Megatron utilize isend and irecv for P2P communication, eliminating the need for CUDA-to-CPU transfers. From my testing, I've observed that with ColossalAI's pipeline parallel, GPU utilization often drops below 50%. On the other hand, while using Megatron and DeepSpeed, the GPU utilization consistently remains above 99%. I'm wondering if the P2P communication method has any bearing on this disparity.
Is there any plan to revert to using the isend and irecv approach for P2P communication?
def _cuda_safe_tensor_to_object(tensor: torch.Tensor, tensor_size: torch.Size) -> object:
"""transform tensor to object with unpickle.
Info of the device in bytes stream will be modified into current device before unpickling
Args:
tensor (:class:`torch.tensor`): tensor to be unpickled
tensor_size (:class:`torch.Size`): Size of the real info in bytes
Returns:
Any: object after unpickled
"""
buf = tensor.numpy().tobytes()[:tensor_size]
if b"cuda" in buf:
buf_array = bytearray(buf)
device_index = torch.cuda.current_device()
# There might be more than one output tensors during forward
for cuda_str in re.finditer(b"cuda", buf_array):
pos = cuda_str.start()
buf_array[pos + 5] = 48 + device_index
buf = bytes(buf_array)
io_bytes = io.BytesIO(buf)
byte_pickler = _unpickler(io_bytes)
unpickle = byte_pickler.load()
return unpickle
def _broadcast_object_list(
object_list: List[Any], src: int, group: ProcessGroup, device: Optional[Union[torch.device, str, int]] = None
):
"""This is a modified version of the broadcast_object_list in torch.distribution
The only difference is that object will be move to correct device after unpickled.
If local_rank = src, then object list will be sent to rank src. Otherwise, object list will
be updated with data sent from rank src.
Args:
object_list (List[Any]): list of object to broadcast
src (int): source rank to broadcast
dst (int): dst rank to broadcast
device (:class:`torch.device`): device to do broadcast. current device in default
"""
if c10d._rank_not_in_group(group):
c10d._warn_not_in_group("broadcast_object_list")
return
is_nccl_backend = c10d._check_for_nccl_backend(group)
current_device = None
if device is not None:
if is_nccl_backend and device.type != "cuda":
raise ValueError("device type must be cuda for nccl backend")
current_device = device
else:
current_device = torch.device("cpu")
if is_nccl_backend:
current_device = torch.device("cuda", torch.cuda.current_device())
my_rank = dist.get_rank()
# Serialize object_list elements to tensors on src rank.
if my_rank == src:
if Version(torch.__version__) >= Version("1.13.0"):
tensor_list, size_list = zip(*[c10d._object_to_tensor(obj, device=current_device) for obj in object_list])
else:
tensor_list, size_list = zip(*[c10d._object_to_tensor(obj) for obj in object_list])
object_sizes_tensor = torch.cat(size_list)
else:
object_sizes_tensor = torch.empty(len(object_list), dtype=torch.long)
if is_nccl_backend:
object_sizes_tensor = object_sizes_tensor.to(current_device)
# Broadcast object sizes
c10d.broadcast(object_sizes_tensor, src=src, group=group, async_op=False)
# Concatenate and broadcast serialized object tensors
if my_rank == src:
object_tensor = torch.cat(tensor_list)
else:
object_tensor = torch.empty( # type: ignore[call-overload]
torch.sum(object_sizes_tensor).item(), # type: ignore[arg-type]
dtype=torch.uint8,
)
if is_nccl_backend:
object_tensor = object_tensor.to(current_device)
c10d.broadcast(object_tensor, src=src, group=group, async_op=False)
# Deserialize objects using their stored sizes.
offset = 0
if my_rank != src:
for i, obj_size in enumerate(object_sizes_tensor):
obj_view = object_tensor[offset : offset + obj_size]
obj_view = obj_view.type(torch.uint8)
if obj_view.device != torch.device("cpu"):
obj_view = obj_view.cpu()
offset += obj_size
# unpickle
unpickle_object = _cuda_safe_tensor_to_object(obj_view, obj_size)
# unconsistence in device
if (
isinstance(unpickle_object, torch.Tensor)
and unpickle_object.device.index != torch.cuda.current_device()
):
unpickle_object = unpickle_object.cuda()
object_list[i] = unpickle_object
Describe the feature
I've been analyzing the code related to P2P communication, where tensors are converted to byte streams. This potentially involves transferring data from CUDA to CPU (numpy), followed by transmission using
c10d.broadcast. This implementation appears reminiscent of an early version of DeepSpeed.In contrast, modern implementations in DeepSpeed and Megatron utilize
isendandirecvfor P2P communication, eliminating the need for CUDA-to-CPU transfers. From my testing, I've observed that with ColossalAI's pipeline parallel, GPU utilization often drops below 50%. On the other hand, while using Megatron and DeepSpeed, the GPU utilization consistently remains above 99%. I'm wondering if the P2P communication method has any bearing on this disparity.Is there any plan to revert to using the
isendandirecvapproach for P2P communication?