From 4382fc5b4312aee0802f676f6d9c772c3e38aded Mon Sep 17 00:00:00 2001
From: ver217
Date: Tue, 15 Oct 2024 18:00:38 +0800
Subject: [PATCH 1/2] [pipeline] hotfix backward for multiple outputs

---
 colossalai/pipeline/schedule/interleaved_pp.py | 16 +++++++---------
 colossalai/pipeline/schedule/one_f_one_b.py    | 16 +++++++---------
 2 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/colossalai/pipeline/schedule/interleaved_pp.py b/colossalai/pipeline/schedule/interleaved_pp.py
index c538ee0715b4..53251fd36924 100644
--- a/colossalai/pipeline/schedule/interleaved_pp.py
+++ b/colossalai/pipeline/schedule/interleaved_pp.py
@@ -351,15 +351,13 @@ def backward_step(
         if output_obj_grad is None:
             optimizer.backward(output_obj)
         else:
-            if "backward_tensor_keys" not in output_obj:
-                for k, grad in output_obj_grad.items():
-                    optimizer.backward_by_grad(output_obj[k], grad)
-            else:
-                for k, grad in output_obj_grad.items():
-                    output_obj[k].grad = grad
-                for k in output_obj["backward_tensor_keys"]:
-                    tensor_to_backward = output_obj[k]
-                    optimizer.backward_by_grad(tensor_to_backward, tensor_to_backward.grad)
+            keys = output_obj.get("backward_tensor_keys", output_obj_grad.keys())
+            tensors_to_backward = []
+            grads_to_backward = []
+            for k in keys:
+                tensors_to_backward.append(output_obj[k])
+                grads_to_backward.append(output_obj_grad[k])
+            optimizer.backward_by_grad(tensors_to_backward, grads_to_backward)

         # Collect the grad of the input_obj.
         input_obj_grad = None

diff --git a/colossalai/pipeline/schedule/one_f_one_b.py b/colossalai/pipeline/schedule/one_f_one_b.py
index 0fc90995adcc..f38390e9f011 100644
--- a/colossalai/pipeline/schedule/one_f_one_b.py
+++ b/colossalai/pipeline/schedule/one_f_one_b.py
@@ -305,15 +305,13 @@ def backward_step(
         if output_obj_grad is None:
             optimizer.backward(output_obj)
         else:
-            if "backward_tensor_keys" not in output_obj:
-                for k, grad in output_obj_grad.items():
-                    optimizer.backward_by_grad(output_obj[k], grad)
-            else:
-                for k, grad in output_obj_grad.items():
-                    output_obj[k].grad = grad
-                for k in output_obj["backward_tensor_keys"]:
-                    tensor_to_backward = output_obj[k]
-                    optimizer.backward_by_grad(tensor_to_backward, tensor_to_backward.grad)
+            keys = output_obj.get("backward_tensor_keys", output_obj_grad.keys())
+            tensors_to_backward = []
+            grads_to_backward = []
+            for k in keys:
+                tensors_to_backward.append(output_obj[k])
+                grads_to_backward.append(output_obj_grad[k])
+            optimizer.backward_by_grad(tensors_to_backward, grads_to_backward)

         # Collect the grad of the input_obj.
         input_obj_grad = None
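PATCH 1/2 collapses the two per-key backward paths into a single joint backward_by_grad call over lists of tensors and their grads. The sketch below reproduces the failure mode that per-output backward calls can hit when a stage returns multiple outputs sharing one autograd graph; it assumes backward_by_grad ultimately delegates to torch.autograd.backward, and all variable names are illustrative rather than taken from ColossalAI:

    # Minimal sketch, assuming backward_by_grad delegates to torch.autograd.backward.
    import torch

    x = torch.randn(4, requires_grad=True)
    shared = x * 2               # intermediate node shared by both outputs
    out_a = shared.sum()
    out_b = (shared**2).sum()
    grads = [torch.ones_like(out_a), torch.ones_like(out_b)]

    # Pre-patch call shape: one backward per output. The second call would raise
    # "Trying to backward through the graph a second time", because the first
    # pass already freed the shared part of the graph:
    #   torch.autograd.backward(out_a, grads[0])
    #   torch.autograd.backward(out_b, grads[1])

    # Post-patch call shape: one joint backward over lists, walking the graph once.
    torch.autograd.backward([out_a, out_b], grads)
    print(x.grad)  # contributions from both outputs, accumulated in one pass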
From e52ac38df704c4ee855970d11b7a967996eae776 Mon Sep 17 00:00:00 2001
From: ver217
Date: Wed, 16 Oct 2024 16:34:58 +0800
Subject: [PATCH 2/2] [pipeline] hotfix backward for multiple outputs

---
 colossalai/pipeline/schedule/interleaved_pp.py | 5 ++++-
 colossalai/pipeline/schedule/one_f_one_b.py    | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/colossalai/pipeline/schedule/interleaved_pp.py b/colossalai/pipeline/schedule/interleaved_pp.py
index 53251fd36924..5da98364dc85 100644
--- a/colossalai/pipeline/schedule/interleaved_pp.py
+++ b/colossalai/pipeline/schedule/interleaved_pp.py
@@ -357,7 +357,10 @@ def backward_step(
             for k in keys:
                 tensors_to_backward.append(output_obj[k])
                 grads_to_backward.append(output_obj_grad[k])
-            optimizer.backward_by_grad(tensors_to_backward, grads_to_backward)
+            if len(tensors_to_backward) == 1:
+                optimizer.backward_by_grad(tensors_to_backward[0], grads_to_backward[0])
+            else:
+                optimizer.backward_by_grad(tensors_to_backward, grads_to_backward)

         # Collect the grad of the input_obj.
         input_obj_grad = None

diff --git a/colossalai/pipeline/schedule/one_f_one_b.py b/colossalai/pipeline/schedule/one_f_one_b.py
index f38390e9f011..224d63688b16 100644
--- a/colossalai/pipeline/schedule/one_f_one_b.py
+++ b/colossalai/pipeline/schedule/one_f_one_b.py
@@ -311,7 +311,10 @@ def backward_step(
             for k in keys:
                 tensors_to_backward.append(output_obj[k])
                 grads_to_backward.append(output_obj_grad[k])
-            optimizer.backward_by_grad(tensors_to_backward, grads_to_backward)
+            if len(tensors_to_backward) == 1:
+                optimizer.backward_by_grad(tensors_to_backward[0], grads_to_backward[0])
+            else:
+                optimizer.backward_by_grad(tensors_to_backward, grads_to_backward)

         # Collect the grad of the input_obj.
         input_obj_grad = None
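PATCH 2/2 unwraps the single-output case back to a bare tensor before calling backward_by_grad. The patch itself does not state why; a plausible reading, treated here as an assumption, is that some OptimizerWrapper implementations of backward_by_grad accept only a single tensor/grad pair, so the list form is reserved for the genuinely multi-output case. A sketch of that dispatch, with hypothetical names:

    from typing import List

    import torch

    def dispatch_backward_by_grad(optimizer, tensors: List[torch.Tensor],
                                  grads: List[torch.Tensor]) -> None:
        # Hypothetical helper mirroring the branch added in PATCH 2/2.
        if len(tensors) == 1:
            # Bare-tensor call shape, as handled before this patch series.
            optimizer.backward_by_grad(tensors[0], grads[0])
        else:
            # List call shape introduced in PATCH 1/2 for multiple outputs.
            optimizer.backward_by_grad(tensors, grads)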