From dd086b83358f2243f43dd2a1a20e2b9c2e02b42b Mon Sep 17 00:00:00 2001 From: CjhHa1 Date: Thu, 24 Aug 2023 14:26:17 +0800 Subject: [PATCH 1/3] pause --- tests/kit/model_zoo/torchrec/__init__.py | 2 +- .../test_model/test_shard_bert.py | 89 +++++++++++-------- 2 files changed, 54 insertions(+), 37 deletions(-) diff --git a/tests/kit/model_zoo/torchrec/__init__.py b/tests/kit/model_zoo/torchrec/__init__.py index 43952e6998cf..4a19f2449602 100644 --- a/tests/kit/model_zoo/torchrec/__init__.py +++ b/tests/kit/model_zoo/torchrec/__init__.py @@ -1 +1 @@ -from .torchrec import * +#from .torchrec import * diff --git a/tests/test_shardformer/test_model/test_shard_bert.py b/tests/test_shardformer/test_model/test_shard_bert.py index 76f8c0541de5..b66fc0a9b67e 100644 --- a/tests/test_shardformer/test_model/test_shard_bert.py +++ b/tests/test_shardformer/test_model/test_shard_bert.py @@ -73,41 +73,56 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, torch.cuda.empty_cache() -@parameterize('test_config', [{ - 'tp_size': 1, - 'pp_size': 2, - 'num_microbatches': 4, - 'use_lazy_init': True, - 'precision': 'fp32', -}, { - 'tp_size': 2, - 'pp_size': 2, - 'num_microbatches': 2, - 'enable_all_optimization': True, - 'use_lazy_init': True, - 'precision': 'fp16', - 'initial_scale': 1, -}, { - 'tp_size': 4, - 'pp_size': 1, - 'enable_all_optimization': True, - 'use_lazy_init': False, - 'precision': 'fp32', -}, { - 'tp_size': 2, - 'pp_size': 1, - 'enable_all_optimization': True, - 'use_lazy_init': False, - 'precision': 'fp32' -}, { - 'tp_size': 2, - 'pp_size': 1, - 'enable_all_optimization': True, - 'use_lazy_init': True, - 'zero_stage': 2, - 'precision': 'fp16', - 'initial_scale': 1 -}]) +@parameterize( + 'test_config', + [ + # { + # 'tp_size': 1, + # 'pp_size': 2, + # 'num_microbatches': 4, + # 'use_lazy_init': True, + # 'precision': 'fp32', + # }, + # { + # 'tp_size': 2, + # 'pp_size': 2, + # 'num_microbatches': 2, + # 'enable_all_optimization': True, + # 'use_lazy_init': True, + # 'precision': 'fp16', + # 'initial_scale': 1, + # }, { + # 'tp_size': 4, + # 'pp_size': 1, + # 'enable_all_optimization': True, + # 'use_lazy_init': False, + # 'precision': 'fp32', + # }, { + # 'tp_size': 2, + # 'pp_size': 1, + # 'enable_all_optimization': True, + # 'use_lazy_init': False, + # 'precision': 'fp32' + # }, { + # 'tp_size': 2, + # 'pp_size': 1, + # 'enable_all_optimization': True, + # 'use_lazy_init': True, + # 'zero_stage': 2, + # 'precision': 'fp16', + # 'initial_scale': 1 + # }, + { + 'tp_size': 1, + 'pp_size': 2, + 'num_microbatches': 2, + 'enable_all_optimization': True, + 'use_lazy_init': True, + 'zero_stage': 1, + 'precision': 'fp16', + 'initial_scale': 1 + } + ]) def run_bert_test(test_config): sub_model_zoo = model_zoo.get_sub_registry('transformers_bert') @@ -135,6 +150,8 @@ def run_bert_3d_test(test_config): sub_model_zoo = model_zoo.get_sub_registry('transformers_bert') for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items(): + if name != "transformer_bert": + continue check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config) clear_layout_converter() @@ -170,4 +187,4 @@ def test_bert_3d(): if __name__ == "__main__": test_bert() - test_bert_3d() + #test_bert_3d() From be878b7d62a8f047fc1f25e160729e7ce55585e3 Mon Sep 17 00:00:00 2001 From: CjhHa1 Date: Fri, 25 Aug 2023 16:17:16 +0800 Subject: [PATCH 2/3] finish pp+zero1 --- colossalai/pipeline/schedule/one_f_one_b.py | 3 +- colossalai/zero/low_level/low_level_optim.py | 9 +- tests/kit/model_zoo/torchrec/__init__.py | 2 +- .../test_model/test_shard_bert.py | 98 +++++++++---------- .../test_model/test_shard_bloom.py | 9 ++ .../test_model/test_shard_gpt2.py | 9 ++ .../test_model/test_shard_llama.py | 9 ++ .../test_model/test_shard_opt.py | 9 ++ .../test_model/test_shard_t5.py | 9 ++ .../test_model/test_shard_vit.py | 13 ++- .../test_model/test_shard_whisper.py | 64 ++++++------ 11 files changed, 146 insertions(+), 88 deletions(-) diff --git a/colossalai/pipeline/schedule/one_f_one_b.py b/colossalai/pipeline/schedule/one_f_one_b.py index f5e4929aa7c8..0058873c21ba 100644 --- a/colossalai/pipeline/schedule/one_f_one_b.py +++ b/colossalai/pipeline/schedule/one_f_one_b.py @@ -128,11 +128,11 @@ def forward_step(self, Union[torch.Tensor, dict]: The intermediate output (dict) of the current stage. If it is the last stage, the output is the loss (Tensor). """ micro_batch = self.load_micro_batch() - # for the first stage, input_obj is None # for the non-first stage, input_obj is the output of the previous stage and it's must be a dict output_obj = model_forward(model, micro_batch, input_obj) if self.stage_manager.is_last_stage(): + loss = criterion(output_obj, micro_batch) / self.num_microbatches if accum_loss is not None: accum_loss.add_(loss.detach()) @@ -158,7 +158,6 @@ def backward_step(self, optimizer: OptimizerWrapper, input_obj: Optional[dict], # Retain the grad on the input_obj. tree_map(retain_grad, input_obj) - # Backward pass. if output_obj_grad is None: optimizer.backward(output_obj) diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py index 64d6a5395120..a1e85e5b90f6 100644 --- a/colossalai/zero/low_level/low_level_optim.py +++ b/colossalai/zero/low_level/low_level_optim.py @@ -316,7 +316,6 @@ def _add_to_bucket(self, param, group_id): def backward(self, loss, retain_graph=False): assert not(self._partition_grads and not self.require_grad_sync), \ "ZeRO2(partition_grads) and gradient accumulation(no_sync) are not compatible" - if self.mixed_precision_mixin is not None: loss = self.mixed_precision_mixin.pre_backward(loss) @@ -333,6 +332,13 @@ def backward(self, loss, retain_graph=False): self.zero_grad() + def backward_by_grad(self, tensor, grad): + # in lower stage which grad is transfered by higher stage + # we need to pass the optim state down. + if self.mixed_precision_mixin is not None: + grad = self.mixed_precision_mixin.pre_backward_by_grad(tensor, grad) + torch.autograd.backward(tensor, grad) + def zero_grad(self, set_to_none=True): """ Set parameter gradients to zero. If set_to_none = True, gradient @@ -358,7 +364,6 @@ def zero_grad(self, set_to_none=True): def step(self, closure=None): assert closure is None, 'closure is not supported by step()' - if not self.require_grad_sync: return diff --git a/tests/kit/model_zoo/torchrec/__init__.py b/tests/kit/model_zoo/torchrec/__init__.py index 4a19f2449602..43952e6998cf 100644 --- a/tests/kit/model_zoo/torchrec/__init__.py +++ b/tests/kit/model_zoo/torchrec/__init__.py @@ -1 +1 @@ -#from .torchrec import * +from .torchrec import * diff --git a/tests/test_shardformer/test_model/test_shard_bert.py b/tests/test_shardformer/test_model/test_shard_bert.py index b66fc0a9b67e..a15645a7f344 100644 --- a/tests/test_shardformer/test_model/test_shard_bert.py +++ b/tests/test_shardformer/test_model/test_shard_bert.py @@ -73,56 +73,50 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, torch.cuda.empty_cache() -@parameterize( - 'test_config', - [ - # { - # 'tp_size': 1, - # 'pp_size': 2, - # 'num_microbatches': 4, - # 'use_lazy_init': True, - # 'precision': 'fp32', - # }, - # { - # 'tp_size': 2, - # 'pp_size': 2, - # 'num_microbatches': 2, - # 'enable_all_optimization': True, - # 'use_lazy_init': True, - # 'precision': 'fp16', - # 'initial_scale': 1, - # }, { - # 'tp_size': 4, - # 'pp_size': 1, - # 'enable_all_optimization': True, - # 'use_lazy_init': False, - # 'precision': 'fp32', - # }, { - # 'tp_size': 2, - # 'pp_size': 1, - # 'enable_all_optimization': True, - # 'use_lazy_init': False, - # 'precision': 'fp32' - # }, { - # 'tp_size': 2, - # 'pp_size': 1, - # 'enable_all_optimization': True, - # 'use_lazy_init': True, - # 'zero_stage': 2, - # 'precision': 'fp16', - # 'initial_scale': 1 - # }, - { - 'tp_size': 1, - 'pp_size': 2, - 'num_microbatches': 2, - 'enable_all_optimization': True, - 'use_lazy_init': True, - 'zero_stage': 1, - 'precision': 'fp16', - 'initial_scale': 1 - } - ]) +@parameterize('test_config', [{ + 'tp_size': 1, + 'pp_size': 2, + 'num_microbatches': 4, + 'use_lazy_init': True, + 'precision': 'fp32', +}, { + 'tp_size': 2, + 'pp_size': 2, + 'num_microbatches': 2, + 'enable_all_optimization': True, + 'use_lazy_init': True, + 'precision': 'fp16', + 'initial_scale': 1, +}, { + 'tp_size': 4, + 'pp_size': 1, + 'enable_all_optimization': True, + 'use_lazy_init': False, + 'precision': 'fp32', +}, { + 'tp_size': 2, + 'pp_size': 1, + 'enable_all_optimization': True, + 'use_lazy_init': False, + 'precision': 'fp32' +}, { + 'tp_size': 2, + 'pp_size': 1, + 'enable_all_optimization': True, + 'use_lazy_init': True, + 'zero_stage': 2, + 'precision': 'fp16', + 'initial_scale': 1 +}, { + 'tp_size': 1, + 'pp_size': 2, + 'num_microbatches': 2, + 'enable_all_optimization': True, + 'use_lazy_init': True, + 'zero_stage': 1, + 'precision': 'fp16', + 'initial_scale': 1 +}]) def run_bert_test(test_config): sub_model_zoo = model_zoo.get_sub_registry('transformers_bert') @@ -150,8 +144,6 @@ def run_bert_3d_test(test_config): sub_model_zoo = model_zoo.get_sub_registry('transformers_bert') for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items(): - if name != "transformer_bert": - continue check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config) clear_layout_converter() @@ -187,4 +179,4 @@ def test_bert_3d(): if __name__ == "__main__": test_bert() - #test_bert_3d() + test_bert_3d() diff --git a/tests/test_shardformer/test_model/test_shard_bloom.py b/tests/test_shardformer/test_model/test_shard_bloom.py index 0e236fd47934..590eff642e2b 100644 --- a/tests/test_shardformer/test_model/test_shard_bloom.py +++ b/tests/test_shardformer/test_model/test_shard_bloom.py @@ -110,6 +110,15 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, 'zero_stage': 2, 'precision': 'fp16', 'initial_scale': 1 +}, { + 'tp_size': 1, + 'pp_size': 2, + 'num_microbatches': 2, + 'enable_all_optimization': True, + 'use_lazy_init': True, + 'zero_stage': 1, + 'precision': 'fp16', + 'initial_scale': 1 }]) def run_bloom_test(test_config): diff --git a/tests/test_shardformer/test_model/test_shard_gpt2.py b/tests/test_shardformer/test_model/test_shard_gpt2.py index 85d66e493e03..13458fc5420e 100644 --- a/tests/test_shardformer/test_model/test_shard_gpt2.py +++ b/tests/test_shardformer/test_model/test_shard_gpt2.py @@ -128,6 +128,15 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, 'zero_stage': 2, 'precision': 'fp16', 'initial_scale': 1 +}, { + 'tp_size': 1, + 'pp_size': 2, + 'num_microbatches': 2, + 'enable_all_optimization': True, + 'use_lazy_init': True, + 'zero_stage': 1, + 'precision': 'fp16', + 'initial_scale': 1 }]) @clear_cache_before_run() def run_gpt2_test(test_config): diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py index 485d2685e8f4..8dc6376bfb90 100644 --- a/tests/test_shardformer/test_model/test_shard_llama.py +++ b/tests/test_shardformer/test_model/test_shard_llama.py @@ -142,6 +142,15 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, 'zero_stage': 2, 'precision': 'fp16', 'initial_scale': 1 +}, { + 'tp_size': 1, + 'pp_size': 2, + 'num_microbatches': 2, + 'enable_all_optimization': True, + 'use_lazy_init': True, + 'zero_stage': 1, + 'precision': 'fp16', + 'initial_scale': 1 }]) def run_llama_test(test_config): diff --git a/tests/test_shardformer/test_model/test_shard_opt.py b/tests/test_shardformer/test_model/test_shard_opt.py index ad344585e8ce..5a73bb5fd8b3 100644 --- a/tests/test_shardformer/test_model/test_shard_opt.py +++ b/tests/test_shardformer/test_model/test_shard_opt.py @@ -135,6 +135,15 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, 'zero_stage': 2, 'precision': 'fp16', 'initial_scale': 1 +}, { + 'tp_size': 1, + 'pp_size': 2, + 'num_microbatches': 2, + 'enable_all_optimization': True, + 'use_lazy_init': True, + 'zero_stage': 1, + 'precision': 'fp16', + 'initial_scale': 1 }]) def run_opt_test(test_config): diff --git a/tests/test_shardformer/test_model/test_shard_t5.py b/tests/test_shardformer/test_model/test_shard_t5.py index a853f024deb2..cd3d3d673132 100644 --- a/tests/test_shardformer/test_model/test_shard_t5.py +++ b/tests/test_shardformer/test_model/test_shard_t5.py @@ -118,6 +118,15 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, 'zero_stage': 2, 'precision': 'fp16', 'initial_scale': 1 +}, { + 'tp_size': 1, + 'pp_size': 2, + 'num_microbatches': 2, + 'enable_all_optimization': True, + 'use_lazy_init': True, + 'zero_stage': 1, + 'precision': 'fp16', + 'initial_scale': 1 }]) @clear_cache_before_run() def run_t5_test(test_config): diff --git a/tests/test_shardformer/test_model/test_shard_vit.py b/tests/test_shardformer/test_model/test_shard_vit.py index 0b092966cfd8..96b6bcbe8e97 100644 --- a/tests/test_shardformer/test_model/test_shard_vit.py +++ b/tests/test_shardformer/test_model/test_shard_vit.py @@ -45,7 +45,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, if org_model.__class__.__name__ == 'ViTModel': check_output_hidden_state(org_output, sharded_output, stage_manager, atol=atol, rtol=rtol) - check_loss(org_loss, sharded_loss, atol=atol, rtol=rtol) # unwrap model @@ -97,6 +96,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, torch.cuda.empty_cache() +#TODO: num_microbatch size = 2 inf loss @parameterize('test_config', [{ 'tp_size': 2, 'pp_size': 2, @@ -132,6 +132,15 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, 'zero_stage': 2, 'precision': 'fp16', 'initial_scale': 1 +}, { + 'tp_size': 1, + 'pp_size': 2, + 'num_microbatches': 4, + 'enable_all_optimization': True, + 'use_lazy_init': False, + 'zero_stage': 1, + 'precision': 'fp16', + 'initial_scale': 1 }]) def run_vit_test(test_config): @@ -195,4 +204,4 @@ def test_vit_3d(): if __name__ == "__main__": test_vit() - test_vit_3d() + #test_vit_3d() diff --git a/tests/test_shardformer/test_model/test_shard_whisper.py b/tests/test_shardformer/test_model/test_shard_whisper.py index 6445b314dc97..4e85072039f2 100644 --- a/tests/test_shardformer/test_model/test_shard_whisper.py +++ b/tests/test_shardformer/test_model/test_shard_whisper.py @@ -114,34 +114,42 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, # TODO(jianghai) fix fp16 -@parameterize('test_config', [{ - 'tp_size': 2, - 'pp_size': 2, - 'num_microbatches': 2, - 'enable_all_optimization': True, - 'use_lazy_init': True, - 'precision': 'fp32', - 'initial_scale': 1, -}, { - 'tp_size': 1, - 'pp_size': 2, - 'num_microbatches': 4, - 'use_lazy_init': False, - 'precision': 'fp32', - 'initial_scale': 1, -}, { - 'tp_size': 4, - 'pp_size': 1, - 'enable_all_optimization': True, - 'use_lazy_init': False, - 'precision': 'fp32', -}, { - 'tp_size': 1, - 'pp_size': 4, - 'num_microbatches': 4, - 'use_lazy_init': False, - 'precision': 'fp32', -}]) +@parameterize( + 'test_config', + [ + { + 'tp_size': 2, + 'pp_size': 2, + 'num_microbatches': 2, + 'enable_all_optimization': True, + 'use_lazy_init': True, + 'precision': 'fp32', + 'initial_scale': 1, + }, + { + 'tp_size': 1, + 'pp_size': 2, + 'num_microbatches': 4, + 'use_lazy_init': False, + 'precision': 'fp32', + 'initial_scale': 1, + }, + { + 'tp_size': 4, + 'pp_size': 1, + 'enable_all_optimization': True, + 'use_lazy_init': False, + 'precision': 'fp32', + }, + { + 'tp_size': 1, + 'pp_size': 4, + 'num_microbatches': 4, + 'use_lazy_init': False, + 'precision': 'fp32', + }, + # whisper is not supported fp16 for now. + ]) def run_whisper_test(test_config): sub_model_zoo = model_zoo.get_sub_registry('transformers_whisper') for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items(): From b4e324d6b439be8021c8eb6d5cf820806589f4d4 Mon Sep 17 00:00:00 2001 From: Jianghai <72591262+CjhHa1@users.noreply.github.com> Date: Fri, 25 Aug 2023 16:59:36 +0800 Subject: [PATCH 3/3] Update test_shard_vit.py --- tests/test_shardformer/test_model/test_shard_vit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_shardformer/test_model/test_shard_vit.py b/tests/test_shardformer/test_model/test_shard_vit.py index 96b6bcbe8e97..d40058bb73f7 100644 --- a/tests/test_shardformer/test_model/test_shard_vit.py +++ b/tests/test_shardformer/test_model/test_shard_vit.py @@ -204,4 +204,4 @@ def test_vit_3d(): if __name__ == "__main__": test_vit() - #test_vit_3d() + test_vit_3d()