From dd086b83358f2243f43dd2a1a20e2b9c2e02b42b Mon Sep 17 00:00:00 2001
From: CjhHa1 <cjh18671720497@outlook.com>
Date: Thu, 24 Aug 2023 14:26:17 +0800
Subject: [PATCH 1/3] pause

---
 tests/kit/model_zoo/torchrec/__init__.py      |  2 +-
 .../test_model/test_shard_bert.py             | 89 +++++++++++--------
 2 files changed, 54 insertions(+), 37 deletions(-)

diff --git a/tests/kit/model_zoo/torchrec/__init__.py b/tests/kit/model_zoo/torchrec/__init__.py
index 43952e6998cf..4a19f2449602 100644
--- a/tests/kit/model_zoo/torchrec/__init__.py
+++ b/tests/kit/model_zoo/torchrec/__init__.py
@@ -1 +1 @@
-from .torchrec import *
+#from .torchrec import *
diff --git a/tests/test_shardformer/test_model/test_shard_bert.py b/tests/test_shardformer/test_model/test_shard_bert.py
index 76f8c0541de5..b66fc0a9b67e 100644
--- a/tests/test_shardformer/test_model/test_shard_bert.py
+++ b/tests/test_shardformer/test_model/test_shard_bert.py
@@ -73,41 +73,56 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     torch.cuda.empty_cache()
 
 
-@parameterize('test_config', [{
-    'tp_size': 1,
-    'pp_size': 2,
-    'num_microbatches': 4,
-    'use_lazy_init': True,
-    'precision': 'fp32',
-}, {
-    'tp_size': 2,
-    'pp_size': 2,
-    'num_microbatches': 2,
-    'enable_all_optimization': True,
-    'use_lazy_init': True,
-    'precision': 'fp16',
-    'initial_scale': 1,
-}, {
-    'tp_size': 4,
-    'pp_size': 1,
-    'enable_all_optimization': True,
-    'use_lazy_init': False,
-    'precision': 'fp32',
-}, {
-    'tp_size': 2,
-    'pp_size': 1,
-    'enable_all_optimization': True,
-    'use_lazy_init': False,
-    'precision': 'fp32'
-}, {
-    'tp_size': 2,
-    'pp_size': 1,
-    'enable_all_optimization': True,
-    'use_lazy_init': True,
-    'zero_stage': 2,
-    'precision': 'fp16',
-    'initial_scale': 1
-}])
+@parameterize(
+    'test_config',
+    [
+    #     {
+    #     'tp_size': 1,
+    #     'pp_size': 2,
+    #     'num_microbatches': 4,
+    #     'use_lazy_init': True,
+    #     'precision': 'fp32',
+    # },
+    # {
+    #     'tp_size': 2,
+    #     'pp_size': 2,
+    #     'num_microbatches': 2,
+    #     'enable_all_optimization': True,
+    #     'use_lazy_init': True,
+    #     'precision': 'fp16',
+    #     'initial_scale': 1,
+    # }, {
+    #     'tp_size': 4,
+    #     'pp_size': 1,
+    #     'enable_all_optimization': True,
+    #     'use_lazy_init': False,
+    #     'precision': 'fp32',
+    # }, {
+    #     'tp_size': 2,
+    #     'pp_size': 1,
+    #     'enable_all_optimization': True,
+    #     'use_lazy_init': False,
+    #     'precision': 'fp32'
+    # }, {
+    #     'tp_size': 2,
+    #     'pp_size': 1,
+    #     'enable_all_optimization': True,
+    #     'use_lazy_init': True,
+    #     'zero_stage': 2,
+    #     'precision': 'fp16',
+    #     'initial_scale': 1
+    # },
+        {
+            'tp_size': 1,
+            'pp_size': 2,
+            'num_microbatches': 2,
+            'enable_all_optimization': True,
+            'use_lazy_init': True,
+            'zero_stage': 1,
+            'precision': 'fp16',
+            'initial_scale': 1
+        }
+    ])
 def run_bert_test(test_config):
 
     sub_model_zoo = model_zoo.get_sub_registry('transformers_bert')
@@ -135,6 +150,8 @@ def run_bert_3d_test(test_config):
     sub_model_zoo = model_zoo.get_sub_registry('transformers_bert')
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
+        if name != "transformer_bert":
+            continue
         check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
 
     clear_layout_converter()
@@ -170,4 +187,4 @@ def test_bert_3d():
 
 if __name__ == "__main__":
     test_bert()
-    test_bert_3d()
+    #test_bert_3d()

From be878b7d62a8f047fc1f25e160729e7ce55585e3 Mon Sep 17 00:00:00 2001
From: CjhHa1 <cjh18671720497@outlook.com>
Date: Fri, 25 Aug 2023 16:17:16 +0800
Subject: [PATCH 2/3] finish pp+zero1

---
 colossalai/pipeline/schedule/one_f_one_b.py   |  3 +-
 colossalai/zero/low_level/low_level_optim.py  |  9 +-
 tests/kit/model_zoo/torchrec/__init__.py      |  2 +-
 .../test_model/test_shard_bert.py             | 98 +++++++++----------
 .../test_model/test_shard_bloom.py            |  9 ++
 .../test_model/test_shard_gpt2.py             |  9 ++
 .../test_model/test_shard_llama.py            |  9 ++
 .../test_model/test_shard_opt.py              |  9 ++
 .../test_model/test_shard_t5.py               |  9 ++
 .../test_model/test_shard_vit.py              | 13 ++-
 .../test_model/test_shard_whisper.py          | 64 ++++++------
 11 files changed, 146 insertions(+), 88 deletions(-)

diff --git a/colossalai/pipeline/schedule/one_f_one_b.py b/colossalai/pipeline/schedule/one_f_one_b.py
index f5e4929aa7c8..0058873c21ba 100644
--- a/colossalai/pipeline/schedule/one_f_one_b.py
+++ b/colossalai/pipeline/schedule/one_f_one_b.py
@@ -128,11 +128,11 @@ def forward_step(self,
             Union[torch.Tensor, dict]: The intermediate output (dict) of the current stage. If it is the last stage, the output is the loss (Tensor).
         """
         micro_batch = self.load_micro_batch()
-
         # for the first stage, input_obj is None
         # for the non-first stage, input_obj is the output of the previous stage and it's must be a dict
         output_obj = model_forward(model, micro_batch, input_obj)
         if self.stage_manager.is_last_stage():
+
             loss = criterion(output_obj, micro_batch) / self.num_microbatches
             if accum_loss is not None:
                 accum_loss.add_(loss.detach())
@@ -158,7 +158,6 @@ def backward_step(self, optimizer: OptimizerWrapper, input_obj: Optional[dict],
 
         # Retain the grad on the input_obj.
         tree_map(retain_grad, input_obj)
-
         # Backward pass.
         if output_obj_grad is None:
             optimizer.backward(output_obj)
diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py
index 64d6a5395120..a1e85e5b90f6 100644
--- a/colossalai/zero/low_level/low_level_optim.py
+++ b/colossalai/zero/low_level/low_level_optim.py
@@ -316,7 +316,6 @@ def _add_to_bucket(self, param, group_id):
     def backward(self, loss, retain_graph=False):
         assert not(self._partition_grads and not self.require_grad_sync), \
             "ZeRO2(partition_grads) and gradient accumulation(no_sync) are not compatible"
-
         if self.mixed_precision_mixin is not None:
             loss = self.mixed_precision_mixin.pre_backward(loss)
 
@@ -333,6 +332,13 @@ def backward(self, loss, retain_graph=False):
 
         self.zero_grad()
 
+    def backward_by_grad(self, tensor, grad):
+        # In a lower stage, where the grad is transferred from a higher stage,
+        # we need to pass the optim state down.
+        if self.mixed_precision_mixin is not None:
+            grad = self.mixed_precision_mixin.pre_backward_by_grad(tensor, grad)
+        torch.autograd.backward(tensor, grad)
+
     def zero_grad(self, set_to_none=True):
         """
         Set parameter gradients to zero. If set_to_none = True, gradient
@@ -358,7 +364,6 @@ def zero_grad(self, set_to_none=True):
 
     def step(self, closure=None):
         assert closure is None, 'closure is not supported by step()'
-
         if not self.require_grad_sync:
             return
 
diff --git a/tests/kit/model_zoo/torchrec/__init__.py b/tests/kit/model_zoo/torchrec/__init__.py
index 4a19f2449602..43952e6998cf 100644
--- a/tests/kit/model_zoo/torchrec/__init__.py
+++ b/tests/kit/model_zoo/torchrec/__init__.py
@@ -1 +1 @@
-#from .torchrec import *
+from .torchrec import *
diff --git a/tests/test_shardformer/test_model/test_shard_bert.py b/tests/test_shardformer/test_model/test_shard_bert.py
index b66fc0a9b67e..a15645a7f344 100644
--- a/tests/test_shardformer/test_model/test_shard_bert.py
+++ b/tests/test_shardformer/test_model/test_shard_bert.py
@@ -73,56 +73,50 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     torch.cuda.empty_cache()
 
 
-@parameterize(
-    'test_config',
-    [
-    #     {
-    #     'tp_size': 1,
-    #     'pp_size': 2,
-    #     'num_microbatches': 4,
-    #     'use_lazy_init': True,
-    #     'precision': 'fp32',
-    # },
-    # {
-    #     'tp_size': 2,
-    #     'pp_size': 2,
-    #     'num_microbatches': 2,
-    #     'enable_all_optimization': True,
-    #     'use_lazy_init': True,
-    #     'precision': 'fp16',
-    #     'initial_scale': 1,
-    # }, {
-    #     'tp_size': 4,
-    #     'pp_size': 1,
-    #     'enable_all_optimization': True,
-    #     'use_lazy_init': False,
-    #     'precision': 'fp32',
-    # }, {
-    #     'tp_size': 2,
-    #     'pp_size': 1,
-    #     'enable_all_optimization': True,
-    #     'use_lazy_init': False,
-    #     'precision': 'fp32'
-    # }, {
-    #     'tp_size': 2,
-    #     'pp_size': 1,
-    #     'enable_all_optimization': True,
-    #     'use_lazy_init': True,
-    #     'zero_stage': 2,
-    #     'precision': 'fp16',
-    #     'initial_scale': 1
-    # },
-        {
-            'tp_size': 1,
-            'pp_size': 2,
-            'num_microbatches': 2,
-            'enable_all_optimization': True,
-            'use_lazy_init': True,
-            'zero_stage': 1,
-            'precision': 'fp16',
-            'initial_scale': 1
-        }
-    ])
+@parameterize('test_config', [{
+    'tp_size': 1,
+    'pp_size': 2,
+    'num_microbatches': 4,
+    'use_lazy_init': True,
+    'precision': 'fp32',
+}, {
+    'tp_size': 2,
+    'pp_size': 2,
+    'num_microbatches': 2,
+    'enable_all_optimization': True,
+    'use_lazy_init': True,
+    'precision': 'fp16',
+    'initial_scale': 1,
+}, {
+    'tp_size': 4,
+    'pp_size': 1,
+    'enable_all_optimization': True,
+    'use_lazy_init': False,
+    'precision': 'fp32',
+}, {
+    'tp_size': 2,
+    'pp_size': 1,
+    'enable_all_optimization': True,
+    'use_lazy_init': False,
+    'precision': 'fp32'
+}, {
+    'tp_size': 2,
+    'pp_size': 1,
+    'enable_all_optimization': True,
+    'use_lazy_init': True,
+    'zero_stage': 2,
+    'precision': 'fp16',
+    'initial_scale': 1
+}, {
+    'tp_size': 1,
+    'pp_size': 2,
+    'num_microbatches': 2,
+    'enable_all_optimization': True,
+    'use_lazy_init': True,
+    'zero_stage': 1,
+    'precision': 'fp16',
+    'initial_scale': 1
+}])
 def run_bert_test(test_config):
 
     sub_model_zoo = model_zoo.get_sub_registry('transformers_bert')
@@ -150,8 +144,6 @@ def run_bert_3d_test(test_config):
     sub_model_zoo = model_zoo.get_sub_registry('transformers_bert')
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
-        if name != "transformer_bert":
-            continue
         check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
 
     clear_layout_converter()
@@ -187,4 +179,4 @@ def test_bert_3d():
 
 if __name__ == "__main__":
     test_bert()
-    #test_bert_3d()
+    test_bert_3d()
diff --git a/tests/test_shardformer/test_model/test_shard_bloom.py b/tests/test_shardformer/test_model/test_shard_bloom.py
index 0e236fd47934..590eff642e2b 100644
--- a/tests/test_shardformer/test_model/test_shard_bloom.py
+++ b/tests/test_shardformer/test_model/test_shard_bloom.py
@@ -110,6 +110,15 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     'zero_stage': 2,
     'precision': 'fp16',
     'initial_scale': 1
+}, {
+    'tp_size': 1,
+    'pp_size': 2,
+    'num_microbatches': 2,
+    'enable_all_optimization': True,
+    'use_lazy_init': True,
+    'zero_stage': 1,
+    'precision': 'fp16',
+    'initial_scale': 1
 }])
 def run_bloom_test(test_config):
 
diff --git a/tests/test_shardformer/test_model/test_shard_gpt2.py b/tests/test_shardformer/test_model/test_shard_gpt2.py
index 85d66e493e03..13458fc5420e 100644
--- a/tests/test_shardformer/test_model/test_shard_gpt2.py
+++ b/tests/test_shardformer/test_model/test_shard_gpt2.py
@@ -128,6 +128,15 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     'zero_stage': 2,
     'precision': 'fp16',
     'initial_scale': 1
+}, {
+    'tp_size': 1,
+    'pp_size': 2,
+    'num_microbatches': 2,
+    'enable_all_optimization': True,
+    'use_lazy_init': True,
+    'zero_stage': 1,
+    'precision': 'fp16',
+    'initial_scale': 1
 }])
 @clear_cache_before_run()
 def run_gpt2_test(test_config):
diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py
index 485d2685e8f4..8dc6376bfb90 100644
--- a/tests/test_shardformer/test_model/test_shard_llama.py
+++ b/tests/test_shardformer/test_model/test_shard_llama.py
@@ -142,6 +142,15 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     'zero_stage': 2,
     'precision': 'fp16',
     'initial_scale': 1
+}, {
+    'tp_size': 1,
+    'pp_size': 2,
+    'num_microbatches': 2,
+    'enable_all_optimization': True,
+    'use_lazy_init': True,
+    'zero_stage': 1,
+    'precision': 'fp16',
+    'initial_scale': 1
 }])
 def run_llama_test(test_config):
 
diff --git a/tests/test_shardformer/test_model/test_shard_opt.py b/tests/test_shardformer/test_model/test_shard_opt.py
index ad344585e8ce..5a73bb5fd8b3 100644
--- a/tests/test_shardformer/test_model/test_shard_opt.py
+++ b/tests/test_shardformer/test_model/test_shard_opt.py
@@ -135,6 +135,15 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     'zero_stage': 2,
     'precision': 'fp16',
     'initial_scale': 1
+}, {
+    'tp_size': 1,
+    'pp_size': 2,
+    'num_microbatches': 2,
+    'enable_all_optimization': True,
+    'use_lazy_init': True,
+    'zero_stage': 1,
+    'precision': 'fp16',
+    'initial_scale': 1
 }])
 def run_opt_test(test_config):
 
diff --git a/tests/test_shardformer/test_model/test_shard_t5.py b/tests/test_shardformer/test_model/test_shard_t5.py
index a853f024deb2..cd3d3d673132 100644
--- a/tests/test_shardformer/test_model/test_shard_t5.py
+++ b/tests/test_shardformer/test_model/test_shard_t5.py
@@ -118,6 +118,15 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     'zero_stage': 2,
     'precision': 'fp16',
     'initial_scale': 1
+}, {
+    'tp_size': 1,
+    'pp_size': 2,
+    'num_microbatches': 2,
+    'enable_all_optimization': True,
+    'use_lazy_init': True,
+    'zero_stage': 1,
+    'precision': 'fp16',
+    'initial_scale': 1
 }])
 @clear_cache_before_run()
 def run_t5_test(test_config):
diff --git a/tests/test_shardformer/test_model/test_shard_vit.py b/tests/test_shardformer/test_model/test_shard_vit.py
index 0b092966cfd8..96b6bcbe8e97 100644
--- a/tests/test_shardformer/test_model/test_shard_vit.py
+++ b/tests/test_shardformer/test_model/test_shard_vit.py
@@ -45,7 +45,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 
         if org_model.__class__.__name__ == 'ViTModel':
             check_output_hidden_state(org_output, sharded_output, stage_manager, atol=atol, rtol=rtol)
-
         check_loss(org_loss, sharded_loss, atol=atol, rtol=rtol)
 
     # unwrap model
@@ -97,6 +96,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     torch.cuda.empty_cache()
 
 
+# TODO: num_microbatches = 2 produces an inf loss
 @parameterize('test_config', [{
     'tp_size': 2,
     'pp_size': 2,
@@ -132,6 +132,15 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     'zero_stage': 2,
     'precision': 'fp16',
     'initial_scale': 1
+}, {
+    'tp_size': 1,
+    'pp_size': 2,
+    'num_microbatches': 4,
+    'enable_all_optimization': True,
+    'use_lazy_init': False,
+    'zero_stage': 1,
+    'precision': 'fp16',
+    'initial_scale': 1
 }])
 def run_vit_test(test_config):
 
@@ -195,4 +204,4 @@ def test_vit_3d():
 
 if __name__ == "__main__":
     test_vit()
-    test_vit_3d()
+    #test_vit_3d()
diff --git a/tests/test_shardformer/test_model/test_shard_whisper.py b/tests/test_shardformer/test_model/test_shard_whisper.py
index 6445b314dc97..4e85072039f2 100644
--- a/tests/test_shardformer/test_model/test_shard_whisper.py
+++ b/tests/test_shardformer/test_model/test_shard_whisper.py
@@ -114,34 +114,42 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 
 
 # TODO（jianghai) fix fp16
-@parameterize('test_config', [{
-    'tp_size': 2,
-    'pp_size': 2,
-    'num_microbatches': 2,
-    'enable_all_optimization': True,
-    'use_lazy_init': True,
-    'precision': 'fp32',
-    'initial_scale': 1,
-}, {
-    'tp_size': 1,
-    'pp_size': 2,
-    'num_microbatches': 4,
-    'use_lazy_init': False,
-    'precision': 'fp32',
-    'initial_scale': 1,
-}, {
-    'tp_size': 4,
-    'pp_size': 1,
-    'enable_all_optimization': True,
-    'use_lazy_init': False,
-    'precision': 'fp32',
-}, {
-    'tp_size': 1,
-    'pp_size': 4,
-    'num_microbatches': 4,
-    'use_lazy_init': False,
-    'precision': 'fp32',
-}])
+@parameterize(
+    'test_config',
+    [
+        {
+            'tp_size': 2,
+            'pp_size': 2,
+            'num_microbatches': 2,
+            'enable_all_optimization': True,
+            'use_lazy_init': True,
+            'precision': 'fp32',
+            'initial_scale': 1,
+        },
+        {
+            'tp_size': 1,
+            'pp_size': 2,
+            'num_microbatches': 4,
+            'use_lazy_init': False,
+            'precision': 'fp32',
+            'initial_scale': 1,
+        },
+        {
+            'tp_size': 4,
+            'pp_size': 1,
+            'enable_all_optimization': True,
+            'use_lazy_init': False,
+            'precision': 'fp32',
+        },
+        {
+            'tp_size': 1,
+            'pp_size': 4,
+            'num_microbatches': 4,
+            'use_lazy_init': False,
+            'precision': 'fp32',
+        },
+    # whisper does not support fp16 for now.
+    ])
 def run_whisper_test(test_config):
     sub_model_zoo = model_zoo.get_sub_registry('transformers_whisper')
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():

From b4e324d6b439be8021c8eb6d5cf820806589f4d4 Mon Sep 17 00:00:00 2001
From: Jianghai <72591262+CjhHa1@users.noreply.github.com>
Date: Fri, 25 Aug 2023 16:59:36 +0800
Subject: [PATCH 3/3] Update test_shard_vit.py

---
 tests/test_shardformer/test_model/test_shard_vit.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_shardformer/test_model/test_shard_vit.py b/tests/test_shardformer/test_model/test_shard_vit.py
index 96b6bcbe8e97..d40058bb73f7 100644
--- a/tests/test_shardformer/test_model/test_shard_vit.py
+++ b/tests/test_shardformer/test_model/test_shard_vit.py
@@ -204,4 +204,4 @@ def test_vit_3d():
 
 if __name__ == "__main__":
     test_vit()
-    #test_vit_3d()
+    test_vit_3d()