From 6a2aaaeabba69f173648c9771094665b1880a965 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 12 Jan 2024 18:42:02 +0800
Subject: [PATCH 01/33] benchmark gpt2
---
.../pipeline/schedule/interleaved_pp.py | 2 +-
colossalai/pipeline/schedule/one_f_one_b.py | 2 +-
colossalai/shardformer/modeling/gpt2.py | 106 +++++++++++++++++-
colossalai/shardformer/policies/gpt2.py | 16 ++-
examples/language/llama2/benchmark.py | 67 ++++++++---
examples/language/llama2/data_utils.py | 2 +-
.../test_model/test_shard_bert.py | 13 ++-
.../test_model/test_shard_t5.py | 10 +-
8 files changed, 184 insertions(+), 34 deletions(-)
diff --git a/colossalai/pipeline/schedule/interleaved_pp.py b/colossalai/pipeline/schedule/interleaved_pp.py
index 0a01a1e7864b..9bf2608df4f4 100644
--- a/colossalai/pipeline/schedule/interleaved_pp.py
+++ b/colossalai/pipeline/schedule/interleaved_pp.py
@@ -22,7 +22,7 @@ def __init__(
num_model_chunks: int,
num_microbatch: Optional[int] = None,
microbatch_size: Optional[int] = None,
- enable_metadata_cache: bool = True,
+ enable_metadata_cache: bool = False,
) -> None:
super().__init__(stage_manager)
assert (
diff --git a/colossalai/pipeline/schedule/one_f_one_b.py b/colossalai/pipeline/schedule/one_f_one_b.py
index cb078b25faeb..d9379a2add5a 100644
--- a/colossalai/pipeline/schedule/one_f_one_b.py
+++ b/colossalai/pipeline/schedule/one_f_one_b.py
@@ -30,7 +30,7 @@ def __init__(
stage_manager: PipelineStageManager,
num_microbatches: Optional[int] = None,
microbatch_size: Optional[int] = None,
- enable_metadata_cache: bool = True,
+ enable_metadata_cache: bool = False,
) -> None:
"""1F1B pipeline schedule.
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index 8f456353742c..fd9b4c908bba 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -24,6 +24,8 @@
from colossalai.shardformer.layer._operation import gather_forward_split_backward, split_forward_gather_backward
from colossalai.shardformer.shard import ShardConfig
+from ..layer import cross_entropy_1d
+
class GPT2PipelineForwards:
"""
@@ -326,7 +328,15 @@ def gpt2_lmhead_model_forward(
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
- loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+ shift_logits = shift_logits.view(-1, shift_logits.size(-1))
+ shift_labels = shift_labels.view(-1)
+ if shard_config.enable_tensor_parallelism:
+ loss = cross_entropy_1d(
+ shift_logits, shift_labels, process_group=shard_config.tensor_parallel_process_group
+ )
+ else:
+ loss = loss_fct(shift_logits, shift_labels)
+
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
@@ -716,7 +726,7 @@ def gpt2_for_sequence_classification_forward(
)
-def get_gpt2_flash_attention_forward():
+def get_gpt2_flash_attention_forward(shard_config: ShardConfig):
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention
from colossalai.kernel.cuda_native import AttnMaskType, ColoAttention
@@ -767,10 +777,12 @@ def forward(
else:
present = None
+ flash_attention_mask = None
if not self.is_cross_attention:
attn_mask_type = AttnMaskType.causal
- flash_attention_mask = None
- if attention_mask != None:
+ else:
+ attn_mask_type = None
+ if not getattr(shard_config, "causal_lm", False) and attention_mask is not None:
flash_attention_mask = ~(attention_mask[:, :, -1].squeeze(1).to(torch.bool)).contiguous()
if not torch.all(flash_attention_mask):
if attn_mask_type == AttnMaskType.causal:
@@ -1006,3 +1018,89 @@ def custom_forward(*inputs):
)
return forward
+
+
+def get_lm_forward_with_dist_cross_entropy(shard_config: ShardConfig):
+ from transformers import GPT2LMHeadModel
+
+ def forward(
+ self: GPT2LMHeadModel,
+ input_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+ `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+ are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.transformer(
+ input_ids,
+ past_key_values=past_key_values,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+
+ # Set device for model parallelism
+ if self.model_parallel:
+ torch.cuda.set_device(self.transformer.first_device)
+ hidden_states = hidden_states.to(self.lm_head.weight.device)
+
+ lm_logits = self.lm_head(hidden_states)
+
+ loss = None
+ if labels is not None:
+ # move labels to correct device to enable model parallelism
+ labels = labels.to(lm_logits.device)
+ # Shift so that tokens < n predict n
+ shift_logits = lm_logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, shift_logits.size(-1))
+ shift_labels = shift_labels.view(-1)
+ if shard_config.enable_tensor_parallelism:
+ loss = cross_entropy_1d(
+ shift_logits, shift_labels, process_group=shard_config.tensor_parallel_process_group
+ )
+ else:
+ loss = loss_fct(shift_logits, shift_labels)
+
+ if not return_dict:
+ output = (lm_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return CausalLMOutputWithCrossAttentions(
+ loss=loss,
+ logits=lm_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ cross_attentions=transformer_outputs.cross_attentions,
+ )
+
+ return forward
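The two forwards above share one trick: when tensor parallelism is enabled, the loss is routed through `cross_entropy_1d` instead of a plain `CrossEntropyLoss`. With the LM head column-sharded, each rank holds only a `[N, V/tp]` slice of the logits, so the softmax cross entropy can be computed from the local slice plus three small all-reduces, without ever materialising the full `[N, V]` logits. A minimal sketch of that computation (illustrative only, not ColossalAI's actual `cross_entropy_1d`; handling of the `-100` ignore index is omitted):

```python
import torch
import torch.distributed as dist

def vocab_parallel_cross_entropy(logits_shard, labels, group=None):
    """Cross entropy over vocab-sharded logits: each rank holds [N, V/world]."""
    rank = dist.get_rank(group)
    shard_size = logits_shard.size(-1)
    vocab_start = rank * shard_size

    # 1) Global max over the vocab dim, for numerical stability.
    local_max = logits_shard.max(dim=-1).values
    dist.all_reduce(local_max, op=dist.ReduceOp.MAX, group=group)
    logits_shard = logits_shard - local_max.unsqueeze(-1)

    # 2) Global softmax denominator: sum of exp over all shards.
    sum_exp = logits_shard.exp().sum(dim=-1)
    dist.all_reduce(sum_exp, group=group)  # default op is SUM

    # 3) The target logit lives on exactly one rank; the others contribute zero.
    in_shard = (labels >= vocab_start) & (labels < vocab_start + shard_size)
    local_idx = (labels - vocab_start).clamp(0, shard_size - 1)
    target = logits_shard.gather(-1, local_idx.unsqueeze(-1)).squeeze(-1)
    target = torch.where(in_shard, target, torch.zeros_like(target))
    dist.all_reduce(target, group=group)

    # loss_i = logsumexp(x_i) - x_i[y_i]; the max shift cancels out.
    return (sum_exp.log() - target).mean()
```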
diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py
index 022e6ff5b32c..dc659500892b 100644
--- a/colossalai/shardformer/policies/gpt2.py
+++ b/colossalai/shardformer/policies/gpt2.py
@@ -5,7 +5,12 @@
import colossalai.shardformer.layer as col_nn
-from ..modeling.gpt2 import GPT2PipelineForwards, get_gpt2_flash_attention_forward, gpt2_sequence_parallel_forward_fn
+from ..modeling.gpt2 import (
+ GPT2PipelineForwards,
+ get_gpt2_flash_attention_forward,
+ get_lm_forward_with_dist_cross_entropy,
+ gpt2_sequence_parallel_forward_fn,
+)
from .base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
__all__ = [
@@ -142,7 +147,7 @@ def module_policy(self):
if self.shard_config.enable_flash_attention:
self.append_or_create_method_replacement(
description={
- "forward": get_gpt2_flash_attention_forward(),
+ "forward": get_gpt2_flash_attention_forward(self.shard_config),
},
policy=policy,
target_key=GPT2Attention,
@@ -227,14 +232,17 @@ def module_policy(self):
module_policy = super().module_policy()
+ setattr(self.shard_config, "causal_lm", True)
+
if self.shard_config.enable_tensor_parallelism:
addon_module = {
GPT2LMHeadModel: ModulePolicyDescription(
sub_module_replacement=[
SubModuleReplacementDescription(
- suffix="lm_head", target_module=col_nn.Linear1D_Col, kwargs={"gather_output": True}
+ suffix="lm_head", target_module=col_nn.Linear1D_Col, kwargs={"gather_output": False}
)
- ]
+ ],
+ method_replacement={"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)},
)
}
module_policy.update(addon_module)
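Note how this policy change pairs with the modeling change above: `lm_head` is still column-sharded via `Linear1D_Col`, but with `gather_output=False` its output stays as a vocab shard, which is exactly what the replaced forward's `cross_entropy_1d` consumes. The all-gather of the full `[batch, seq, vocab]` logits that `gather_output=True` performed is thereby skipped, which is the main saving for large vocabularies.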
diff --git a/examples/language/llama2/benchmark.py b/examples/language/llama2/benchmark.py
index a4c29b7c8231..5a6ee5e7bd27 100644
--- a/examples/language/llama2/benchmark.py
+++ b/examples/language/llama2/benchmark.py
@@ -8,9 +8,13 @@
from model_utils import format_numel_str, get_model_numel
from performance_evaluator import PerformanceEvaluator
from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision
+
+# from colossalai.nn.optimizer import HybridAdam
+from torch.optim import Adam
from tqdm import tqdm
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
from transformers.models.llama.configuration_llama import LlamaConfig
-from transformers.models.llama.modeling_llama import LlamaForCausalLM
import colossalai
import colossalai.utils.device as device_utils
@@ -18,7 +22,6 @@
from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, TorchFSDPPlugin
from colossalai.cluster import DistCoordinator
from colossalai.lazy import LazyInitContext
-from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
# ==============================
@@ -59,8 +62,8 @@ def main():
help="Choose which plugin to use",
)
parser.add_argument("-b", "--batch_size", type=int, default=2, help="Batch size")
- parser.add_argument("-s", "--num_steps", type=int, default=5, help="Number of steps to run")
- parser.add_argument("-i", "--ignore_steps", type=int, default=2, help="Number of steps to ignore")
+ parser.add_argument("-s", "--num_steps", type=int, default=200, help="Number of steps to run")
+ parser.add_argument("-i", "--ignore_steps", type=int, default=1, help="Number of steps to ignore")
parser.add_argument("-g", "--grad_checkpoint", action="store_true", help="Use gradient checkpointing")
parser.add_argument("-l", "--max_length", type=int, default=4096, help="Max sequence length")
parser.add_argument(
@@ -98,7 +101,13 @@ def empty_init():
extra_dp_size=args.extra_dp,
)
elif args.plugin == "gemini_auto":
- plugin = GeminiPlugin(placement_policy="auto", precision="bf16", warmup_non_model_data_ratio=args.warmup_ratio, tp_size=args.tp, extra_dp_size=args.extra_dp)
+ plugin = GeminiPlugin(
+ placement_policy="auto",
+ precision="bf16",
+ warmup_non_model_data_ratio=args.warmup_ratio,
+ tp_size=args.tp,
+ extra_dp_size=args.extra_dp,
+ )
elif args.plugin == "fsdp":
if use_empty_init:
plugin = TorchFSDPPlugin(
@@ -133,10 +142,12 @@ def empty_init():
plugin = HybridParallelPlugin(
tp_size=args.tp,
pp_size=args.pp,
- pp_style="interleaved",
+ # pp_style="interleaved",
+ pp_style="1f1b",
zero_stage=args.zero,
- num_model_chunks=2,
- enable_fused_normalization=torch.cuda.is_available(),
+ num_model_chunks=1,
+ # enable_fused_normalization=torch.cuda.is_available(),
+ enable_all_optimization=True,
num_microbatches=args.mbs,
precision="bf16",
)
@@ -161,7 +172,8 @@ def empty_init():
# ==============================
dp_size = plugin.dp_size if isinstance(plugin, HybridParallelPlugin) else coordinator.world_size
- config = MODEL_CONFIGS[args.config]
+ # config = MODEL_CONFIGS[args.config]
+ config = GPT2Config(n_layer=24, n_embd=1024, n_head=16, n_positions=1024)
dataset = RandomDataset(
num_samples=args.batch_size * args.num_steps * dp_size, max_length=args.max_length, vocab_size=config.vocab_size
)
@@ -176,8 +188,10 @@ def empty_init():
else nullcontext()
)
+ # with init_ctx:
+ # model = LlamaForCausalLM(config)
with init_ctx:
- model = LlamaForCausalLM(config)
+ model = GPT2LMHeadModel(config)
if args.grad_checkpoint:
model.gradient_checkpointing_enable()
@@ -188,17 +202,27 @@ def empty_init():
model_numel = get_model_numel(model)
coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")
+ # performance_evaluator = PerformanceEvaluator(
+ # model_numel,
+ # model.config.num_hidden_layers,
+ # model.config.hidden_size,
+ # model.config.vocab_size,
+ # args.grad_checkpoint,
+ # args.ignore_steps,
+ # dp_world_size=dp_size,
+ # )
performance_evaluator = PerformanceEvaluator(
model_numel,
- model.config.num_hidden_layers,
- model.config.hidden_size,
+ model.config.n_layer,
+ model.config.n_embd,
model.config.vocab_size,
args.grad_checkpoint,
args.ignore_steps,
dp_world_size=dp_size,
)
- optimizer = HybridAdam(model.parameters())
+ # optimizer = HybridAdam(model.parameters())
+ optimizer = Adam(model.parameters())
torch.set_default_dtype(torch.bfloat16)
model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
torch.set_default_dtype(torch.float)
@@ -220,12 +244,29 @@ def empty_init():
else:
for step, batch in enumerate(tqdm(dataloader, desc="Step", disable=not coordinator.is_master())):
performance_evaluator.on_step_start(step)
+ # from torch.autograd import profiler
+ # with torch.profiler.profile(
+ # activities=[
+ # torch.profiler.ProfilerActivity.CPU,
+ # torch.profiler.ProfilerActivity.CUDA,
+ # ],record_shapes=True, with_stack=True) as prof:
+ # with torch.profiler.profile(record_shapes=True) as prof:
+ # with profiler.record_function("model_inference"):
+ # outputs = model(**batch)
+ # loss = outputs[0]
+ # booster.backward(loss, optimizer)
+ # optimizer.step()
+ # optimizer.zero_grad()
+ # if coordinator.is_master():
+ # prof.export_chrome_trace('./llama_profile.json')
+
outputs = model(**batch)
loss = outputs[0]
booster.backward(loss, optimizer)
optimizer.step()
optimizer.zero_grad()
performance_evaluator.on_step_end(**batch)
+ coordinator.print_on_master(f"Max CUDA memory usage: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
performance_evaluator.on_fit_end()
coordinator.print_on_master(f"Max CUDA memory usage: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
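The benchmark hunk above leaves a profiling experiment commented out. For reference, the same idea tidied into a helper, using only the public `torch.profiler` API; wrapping a single step like this is a sketch, not part of the patch (`model`, `batch`, `booster`, and `optimizer` are the objects created in this script):

```python
import torch

def profile_one_step(model, batch, booster, optimizer, trace_path="./llama_profile.json"):
    # Profile CPU + CUDA activity for one training step and dump a Chrome trace.
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        record_shapes=True,
        with_stack=True,
    ) as prof:
        outputs = model(**batch)
        loss = outputs[0]
        booster.backward(loss, optimizer)
        optimizer.step()
        optimizer.zero_grad()
    prof.export_chrome_trace(trace_path)  # view in chrome://tracing or Perfetto
```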
diff --git a/examples/language/llama2/data_utils.py b/examples/language/llama2/data_utils.py
index a438833e1680..2a4a16c176a3 100644
--- a/examples/language/llama2/data_utils.py
+++ b/examples/language/llama2/data_utils.py
@@ -108,7 +108,7 @@ class RandomDataset(Dataset):
def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000):
self.num_samples = num_samples
self.max_length = max_length
- self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
+ self.input_ids = torch.randint(0, vocab_size // 4, (num_samples, max_length), device=get_current_device())
self.attention_mask = torch.ones_like(self.input_ids)
def __len__(self):
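The `vocab_size // 4` tweak simply narrows the sampled token-id range for the benchmark (it is reverted in the next patch). For context, the dataset being touched is purely synthetic; a self-contained version of the pattern looks roughly like this, where the `__getitem__` body is an assumption based on the fields shown above, since the hunk cuts off after `__len__`:

```python
import torch
from torch.utils.data import Dataset

class TinyRandomDataset(Dataset):
    """Synthetic causal-LM data: random token ids with a full attention mask."""

    def __init__(self, num_samples: int = 8, max_length: int = 128, vocab_size: int = 32000):
        self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length))
        self.attention_mask = torch.ones_like(self.input_ids)

    def __len__(self):
        return self.input_ids.size(0)

    def __getitem__(self, idx):
        # labels == input_ids works because HF causal-LM models shift labels
        # internally (see the shift_logits/shift_labels code earlier in this patch).
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.input_ids[idx],
        }
```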
diff --git a/tests/test_shardformer/test_model/test_shard_bert.py b/tests/test_shardformer/test_model/test_shard_bert.py
index 768bd95bdb42..5a1d8c5727ea 100644
--- a/tests/test_shardformer/test_model/test_shard_bert.py
+++ b/tests/test_shardformer/test_model/test_shard_bert.py
@@ -1,6 +1,8 @@
import pytest
import torch
+torch.cuda.set_per_process_memory_fraction(0.125, 0)
+
import colossalai
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer.layer.utils import Randomizer
@@ -158,6 +160,7 @@ def run_bert_test(test_config):
for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
+ print(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
clear_layout_converter()
Randomizer.reset_index()
torch.cuda.empty_cache()
@@ -227,11 +230,11 @@ def test_bert():
spawn(check_bert, 4)
-@pytest.mark.largedist
-@rerun_if_address_is_in_use()
-@clear_cache_before_run()
-def test_bert_3d():
- spawn(check_bert_3d, 8)
+# @pytest.mark.largedist
+# @rerun_if_address_is_in_use()
+# @clear_cache_before_run()
+# def test_bert_3d():
+# spawn(check_bert_3d, 8)
if __name__ == "__main__":
diff --git a/tests/test_shardformer/test_model/test_shard_t5.py b/tests/test_shardformer/test_model/test_shard_t5.py
index 73f203d1f023..ca54e5b9060a 100644
--- a/tests/test_shardformer/test_model/test_shard_t5.py
+++ b/tests/test_shardformer/test_model/test_shard_t5.py
@@ -205,11 +205,11 @@ def test_t5():
spawn(check_t5, 4)
-@pytest.mark.largedist
-@rerun_if_address_is_in_use()
-@clear_cache_before_run()
-def test_t5_3d():
- spawn(check_t5_3d, 8)
+# @pytest.mark.largedist
+# @rerun_if_address_is_in_use()
+# @clear_cache_before_run()
+# def test_t5_3d():
+# spawn(check_t5_3d, 8)
if __name__ == "__main__":
From 01ee32a75ebd4d5ad93665090dffc9a0d9fc8cf0 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 22 Jan 2024 15:24:35 +0800
Subject: [PATCH 02/33] fix
fix
fix
fix
---
.../pipeline/schedule/interleaved_pp.py | 2 +-
colossalai/pipeline/schedule/one_f_one_b.py | 2 +-
examples/__init__.py | 0
examples/language/__init__.py | 0
examples/language/{llama2 => }/data_utils.py | 2 +-
examples/language/llama2/benchmark.py | 73 +++++--------------
examples/language/{llama2 => }/model_utils.py | 0
.../{llama2 => }/performance_evaluator.py | 0
.../test_model/test_shard_bert.py | 13 ++--
.../test_model/test_shard_t5.py | 10 +--
10 files changed, 32 insertions(+), 70 deletions(-)
create mode 100644 examples/__init__.py
create mode 100644 examples/language/__init__.py
rename examples/language/{llama2 => }/data_utils.py (97%)
rename examples/language/{llama2 => }/model_utils.py (100%)
rename examples/language/{llama2 => }/performance_evaluator.py (100%)
diff --git a/colossalai/pipeline/schedule/interleaved_pp.py b/colossalai/pipeline/schedule/interleaved_pp.py
index 9bf2608df4f4..0a01a1e7864b 100644
--- a/colossalai/pipeline/schedule/interleaved_pp.py
+++ b/colossalai/pipeline/schedule/interleaved_pp.py
@@ -22,7 +22,7 @@ def __init__(
num_model_chunks: int,
num_microbatch: Optional[int] = None,
microbatch_size: Optional[int] = None,
- enable_metadata_cache: bool = False,
+ enable_metadata_cache: bool = True,
) -> None:
super().__init__(stage_manager)
assert (
diff --git a/colossalai/pipeline/schedule/one_f_one_b.py b/colossalai/pipeline/schedule/one_f_one_b.py
index d9379a2add5a..cb078b25faeb 100644
--- a/colossalai/pipeline/schedule/one_f_one_b.py
+++ b/colossalai/pipeline/schedule/one_f_one_b.py
@@ -30,7 +30,7 @@ def __init__(
stage_manager: PipelineStageManager,
num_microbatches: Optional[int] = None,
microbatch_size: Optional[int] = None,
- enable_metadata_cache: bool = False,
+ enable_metadata_cache: bool = True,
) -> None:
"""1F1B pipeline schedule.
diff --git a/examples/__init__.py b/examples/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/examples/language/__init__.py b/examples/language/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/examples/language/llama2/data_utils.py b/examples/language/data_utils.py
similarity index 97%
rename from examples/language/llama2/data_utils.py
rename to examples/language/data_utils.py
index 2a4a16c176a3..a438833e1680 100644
--- a/examples/language/llama2/data_utils.py
+++ b/examples/language/data_utils.py
@@ -108,7 +108,7 @@ class RandomDataset(Dataset):
def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000):
self.num_samples = num_samples
self.max_length = max_length
- self.input_ids = torch.randint(0, vocab_size // 4, (num_samples, max_length), device=get_current_device())
+ self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
self.attention_mask = torch.ones_like(self.input_ids)
def __len__(self):
diff --git a/examples/language/llama2/benchmark.py b/examples/language/llama2/benchmark.py
index 5a6ee5e7bd27..701589042748 100644
--- a/examples/language/llama2/benchmark.py
+++ b/examples/language/llama2/benchmark.py
@@ -4,17 +4,10 @@
import torch
from attn import SUPPORT_FLASH, replace_xformers
-from data_utils import RandomDataset
-from model_utils import format_numel_str, get_model_numel
-from performance_evaluator import PerformanceEvaluator
from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision
-
-# from colossalai.nn.optimizer import HybridAdam
-from torch.optim import Adam
from tqdm import tqdm
-from transformers.models.gpt2.configuration_gpt2 import GPT2Config
-from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
from transformers.models.llama.configuration_llama import LlamaConfig
+from transformers.models.llama.modeling_llama import LlamaForCausalLM
import colossalai
import colossalai.utils.device as device_utils
@@ -22,7 +15,11 @@
from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, TorchFSDPPlugin
from colossalai.cluster import DistCoordinator
from colossalai.lazy import LazyInitContext
+from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
+from examples.language.data_utils import RandomDataset
+from examples.language.model_utils import format_numel_str, get_model_numel
+from examples.language.performance_evaluator import PerformanceEvaluator
# ==============================
# Constants
@@ -62,8 +59,8 @@ def main():
help="Choose which plugin to use",
)
parser.add_argument("-b", "--batch_size", type=int, default=2, help="Batch size")
- parser.add_argument("-s", "--num_steps", type=int, default=200, help="Number of steps to run")
- parser.add_argument("-i", "--ignore_steps", type=int, default=1, help="Number of steps to ignore")
+ parser.add_argument("-s", "--num_steps", type=int, default=5, help="Number of steps to run")
+ parser.add_argument("-i", "--ignore_steps", type=int, default=2, help="Number of steps to ignore")
parser.add_argument("-g", "--grad_checkpoint", action="store_true", help="Use gradient checkpointing")
parser.add_argument("-l", "--max_length", type=int, default=4096, help="Max sequence length")
parser.add_argument(
@@ -77,8 +74,8 @@ def main():
parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size")
parser.add_argument("--extra_dp", type=int, default=1, help="Extra data parallel size, used for Gemini")
parser.add_argument("--pp", type=int, default=1, help="Pipeline parallel size")
- parser.add_argument("--mbs", type=int, default=1)
- parser.add_argument("--zero", type=int, default=0)
+ parser.add_argument("--mbs", type=int, default=1, help="Micro batch size of pipeline parallel")
+ parser.add_argument("--zero", type=int, default=0, help="Zero Stage when hybrid plugin is enabled")
args = parser.parse_args()
colossalai.launch_from_torch({})
@@ -142,13 +139,11 @@ def empty_init():
plugin = HybridParallelPlugin(
tp_size=args.tp,
pp_size=args.pp,
- # pp_style="interleaved",
- pp_style="1f1b",
+ pp_style="interleaved",
zero_stage=args.zero,
- num_model_chunks=1,
- # enable_fused_normalization=torch.cuda.is_available(),
- enable_all_optimization=True,
- num_microbatches=args.mbs,
+ num_model_chunks=2,
+ enable_fused_normalization=torch.cuda.is_available(),
+ microbatch_size=args.mbs,
precision="bf16",
)
elif args.plugin == "3d_cpu":
@@ -158,7 +153,7 @@ def empty_init():
zero_stage=args.zero,
cpu_offload=True,
enable_fused_normalization=torch.cuda.is_available(),
- num_microbatches=args.mbs,
+ microbatch_size=args.mbs,
initial_scale=2**8,
precision="bf16",
)
@@ -172,8 +167,7 @@ def empty_init():
# ==============================
dp_size = plugin.dp_size if isinstance(plugin, HybridParallelPlugin) else coordinator.world_size
- # config = MODEL_CONFIGS[args.config]
- config = GPT2Config(n_layer=24, n_embd=1024, n_head=16, n_positions=1024)
+ config = MODEL_CONFIGS[args.config]
dataset = RandomDataset(
num_samples=args.batch_size * args.num_steps * dp_size, max_length=args.max_length, vocab_size=config.vocab_size
)
@@ -188,10 +182,8 @@ def empty_init():
else nullcontext()
)
- # with init_ctx:
- # model = LlamaForCausalLM(config)
with init_ctx:
- model = GPT2LMHeadModel(config)
+ model = LlamaForCausalLM(config)
if args.grad_checkpoint:
model.gradient_checkpointing_enable()
@@ -202,27 +194,17 @@ def empty_init():
model_numel = get_model_numel(model)
coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")
- # performance_evaluator = PerformanceEvaluator(
- # model_numel,
- # model.config.num_hidden_layers,
- # model.config.hidden_size,
- # model.config.vocab_size,
- # args.grad_checkpoint,
- # args.ignore_steps,
- # dp_world_size=dp_size,
- # )
performance_evaluator = PerformanceEvaluator(
model_numel,
- model.config.n_layer,
- model.config.n_embd,
+ model.config.num_hidden_layers,
+ model.config.hidden_size,
model.config.vocab_size,
args.grad_checkpoint,
args.ignore_steps,
dp_world_size=dp_size,
)
- # optimizer = HybridAdam(model.parameters())
- optimizer = Adam(model.parameters())
+ optimizer = HybridAdam(model.parameters())
torch.set_default_dtype(torch.bfloat16)
model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
torch.set_default_dtype(torch.float)
@@ -244,29 +226,12 @@ def empty_init():
else:
for step, batch in enumerate(tqdm(dataloader, desc="Step", disable=not coordinator.is_master())):
performance_evaluator.on_step_start(step)
- # from torch.autograd import profiler
- # with torch.profiler.profile(
- # activities=[
- # torch.profiler.ProfilerActivity.CPU,
- # torch.profiler.ProfilerActivity.CUDA,
- # ],record_shapes=True, with_stack=True) as prof:
- # with torch.profiler.profile(record_shapes=True) as prof:
- # with profiler.record_function("model_inference"):
- # outputs = model(**batch)
- # loss = outputs[0]
- # booster.backward(loss, optimizer)
- # optimizer.step()
- # optimizer.zero_grad()
- # if coordinator.is_master():
- # prof.export_chrome_trace('./llama_profile.json')
-
outputs = model(**batch)
loss = outputs[0]
booster.backward(loss, optimizer)
optimizer.step()
optimizer.zero_grad()
performance_evaluator.on_step_end(**batch)
- coordinator.print_on_master(f"Max CUDA memory usage: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
performance_evaluator.on_fit_end()
coordinator.print_on_master(f"Max CUDA memory usage: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
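Note the quiet semantic fix in this revert: `--mbs` is now passed to the hybrid plugins as `microbatch_size` rather than `num_microbatches`, matching its updated help string ("Micro batch size of pipeline parallel").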
diff --git a/examples/language/llama2/model_utils.py b/examples/language/model_utils.py
similarity index 100%
rename from examples/language/llama2/model_utils.py
rename to examples/language/model_utils.py
diff --git a/examples/language/llama2/performance_evaluator.py b/examples/language/performance_evaluator.py
similarity index 100%
rename from examples/language/llama2/performance_evaluator.py
rename to examples/language/performance_evaluator.py
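The two empty `__init__.py` files make `examples/` and `examples/language/` importable as packages, which is what lets the benchmark switch from local imports to `from examples.language.data_utils import RandomDataset` (and likewise for `model_utils` and `performance_evaluator`) now that the shared helpers live one level up.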
diff --git a/tests/test_shardformer/test_model/test_shard_bert.py b/tests/test_shardformer/test_model/test_shard_bert.py
index 5a1d8c5727ea..768bd95bdb42 100644
--- a/tests/test_shardformer/test_model/test_shard_bert.py
+++ b/tests/test_shardformer/test_model/test_shard_bert.py
@@ -1,8 +1,6 @@
import pytest
import torch
-torch.cuda.set_per_process_memory_fraction(0.125, 0)
-
import colossalai
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer.layer.utils import Randomizer
@@ -160,7 +158,6 @@ def run_bert_test(test_config):
for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
- print(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
clear_layout_converter()
Randomizer.reset_index()
torch.cuda.empty_cache()
@@ -230,11 +227,11 @@ def test_bert():
spawn(check_bert, 4)
-# @pytest.mark.largedist
-# @rerun_if_address_is_in_use()
-# @clear_cache_before_run()
-# def test_bert_3d():
-# spawn(check_bert_3d, 8)
+@pytest.mark.largedist
+@rerun_if_address_is_in_use()
+@clear_cache_before_run()
+def test_bert_3d():
+ spawn(check_bert_3d, 8)
if __name__ == "__main__":
diff --git a/tests/test_shardformer/test_model/test_shard_t5.py b/tests/test_shardformer/test_model/test_shard_t5.py
index ca54e5b9060a..73f203d1f023 100644
--- a/tests/test_shardformer/test_model/test_shard_t5.py
+++ b/tests/test_shardformer/test_model/test_shard_t5.py
@@ -205,11 +205,11 @@ def test_t5():
spawn(check_t5, 4)
-# @pytest.mark.largedist
-# @rerun_if_address_is_in_use()
-# @clear_cache_before_run()
-# def test_t5_3d():
-# spawn(check_t5_3d, 8)
+@pytest.mark.largedist
+@rerun_if_address_is_in_use()
+@clear_cache_before_run()
+def test_t5_3d():
+ spawn(check_t5_3d, 8)
if __name__ == "__main__":
From 9ce52807865416c0780e340f163f4d9b01d97a11 Mon Sep 17 00:00:00 2001
From: digger yu
Date: Wed, 10 Jan 2024 19:24:56 +0800
Subject: [PATCH 03/33] [doc] fix typo in Colossal-LLaMA-2/README.md (#5247)
---
applications/Colossal-LLaMA-2/README.md | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/applications/Colossal-LLaMA-2/README.md b/applications/Colossal-LLaMA-2/README.md
index 29abcbfb459e..1377e1facec0 100644
--- a/applications/Colossal-LLaMA-2/README.md
+++ b/applications/Colossal-LLaMA-2/README.md
@@ -10,8 +10,12 @@
- [Colossal-LLaMA-2-7B](#colossal-llama-2-7b)
- [Colossal-LLaMA-2-13B](#colossal-llama-2-13b)
- [Performance Evaluation](#performance-evaluation)
+ - [Model with ~7 Billion Parameters](#model-with-7-billion-parameters)
+ - [Model with ~13 Billion Parameters](#model-with-13-billion-parameters)
- [Examples](#examples)
- [Training Logs](#training-logs)
+ - [Colossal-LLaMA-2-7b-base](#colossal-llama-2-7b-base)
+ - [Colossal-LLaMA-2-13b-base](#colossal-llama-2-13b-base)
- [Inference](#inference)
- [Import from HuggingFace](#import-from-huggingface)
- [Import from Modelscope](#import-from-modelscope)
@@ -25,14 +29,14 @@
- [1. Init Tokenizer Preparation](#1-init-tokenizer-preparation)
- [2. Init Model Preparation](#2-init-model-preparation)
- [3. Data Preparation](#3-data-preparation)
- - [3.1 Data for Pretraining](#31-data-for-pretraining)
- - [3.2 Data for Supervised Fine-tuning](#32-data-for-supervised-fine-tuning)
+ - [3.1 Data for Pretraining](#31-data-for-pretraining)
+ - [3.2 Data for Supervised Fine-tuning](#32-data-for-supervised-fine-tuning)
- [4. Command Line Arguments for Training](#4-command-line-arguments-for-training)
- - [4.1 Arguments for Pretraining](#41-arguments-for-pretraining)
- - [4.2 Arguments for Supervised Fine-tuning](#42-arguments-for-supervised-fine-tuning)
+ - [4.1 Arguments for Pretraining](#41-arguments-for-pretraining)
+ - [4.2 Arguments for Supervised Fine-tuning](#42-arguments-for-supervised-fine-tuning)
- [5. Running Command](#5-running-command)
- - [5.1 Command for Pretraining](#51-command-for-pretraining)
- - [5.2 Command for Supervised Fine-tuning](#52-command-for-supervised-fine-tuning)
+ - [5.1 Command for Pretraining](#51-command-for-pretraining)
+ - [5.2 Command for Supervised Fine-tuning](#52-command-for-supervised-fine-tuning)
- [Technical Insights](#technical-insights)
- [Data](#data)
- [Tokenizer](#tokenizer)
@@ -366,7 +370,7 @@ python prepare_pretrain_dataset.py \
```
Here is details about CLI arguments:
* Source data directory: `data_input_dirs`. Each `` can have multiple file in `jsonl` format.
-* Tokenzier directory: `tokenizer_dir`. Path to the tokenizer in Hugging Face format.
+* Tokenizer directory: `tokenizer_dir`. Path to the tokenizer in Hugging Face format.
* Data cache directory: `data_cache_dir`. Directory to store Hugging Face data cache. Default case will create `cache` folder locally.
* Output directory for jsonl format: `data_jsonl_output_dir`. Output directory to store converted dataset in jsonl format.
* Output directory for arrow format: `data_arrow_output_dir`. Output directory to store converted dataset in arrow format, which can be used for training directly.
@@ -386,7 +390,7 @@ Examples:
Command to convert jsonl dataset to arrow format is similar to the command in [3.1 Data for Pretraining](#31-data-for-pretraining). In `prepare_sft_dataset.py`, we don't concatenate different data samples.
```
python prepare_sft_dataset.py.py \
- --data_input_dirs ",," \
+ --data_input_dirs ",," \
--tokenizer_dir "" \
--data_cache_dir "jsonl_to_arrow_cache" \
--data_jsonl_output_dir "spliced_tokenized_output_jsonl" \
@@ -428,7 +432,7 @@ Here is details about CLI arguments:
* Mixed precision: `--mixed_precision`. The default value is "fp16". "fp16" and "bf16" are supported.
* Gradient clipping: `--gradient_clipping`. The default value is 1.0.
* Weight decay: `-w`, `--weight_decay`. The default value is 0.1.
-* Warmup steps: `-s`, `--warmup_steps`. The default value is calcuated by 0.025 warmup ratio.
+* Warmup steps: `-s`, `--warmup_steps`. The default value is calculated by 0.025 warmup ratio.
* Gradient checkpointing: `--use_grad_checkpoint`. The default value is `False`. This saves memory at the cost of speed. You'd better enable this option when training with a large batch size.
* Flash attention: `--use_flash_attn`. If you want to use flash attention, you must install `flash-attn` and related packages. The default value is `False`. This is helpful to accelerate training while saving memory. We recommend you always use flash attention.
* Freeze non-embedding parameters: `--freeze_non_embeds_params`. Freeze non-embedding parameters. It can be helpful to align embeddings after extending vocabulary size.
@@ -488,7 +492,7 @@ The following figure shows the data processing pipeline conducted for Colossal-L
❗️**Important**: We will open-source our data-processing toolkit soon, stay tuned!
### Tokenizer
-The original LLaMA-2 vacabulary comprises fewer than a thousand Chinese characters, thus proves inadequate for encoding comprehensive Chinese texts effectively. Secondly, the utilization of byte tokens presents a challenge for transformer encoders to capture the semantic nuances of Chinese characters.
+The original LLaMA-2 vocabulary comprises fewer than a thousand Chinese characters, thus proves inadequate for encoding comprehensive Chinese texts effectively. Secondly, the utilization of byte tokens presents a challenge for transformer encoders to capture the semantic nuances of Chinese characters.
To address the above issues, we extend LLaMA-2 vocabulary from 32,000 to 69,104. To adapt the LLaMA-2 model for use with the Colossal-LLaMA-2 tokenizer, we initialize the new word embeddings by calculating the mean values from the original LLaMA-2 embeddings and subsequently append these new rows to the end of the original embedding matrices.
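The vocabulary-extension step described in the last paragraph is straightforward to sketch. A minimal version of mean-initialised embedding extension (illustrative only, not the Colossal-LLaMA-2 training code; the checkpoint name is a placeholder):

```python
import torch
from transformers import AutoModelForCausalLM

def extend_vocab_mean_init(model, new_vocab_size: int = 69104):
    # Snapshot the original input embeddings and LM head before resizing.
    old_in = model.get_input_embeddings().weight.data.clone()
    old_out = model.get_output_embeddings().weight.data.clone()

    # resize_token_embeddings appends the new rows at the end of both matrices.
    model.resize_token_embeddings(new_vocab_size)

    # Initialise every new row with the mean of the original rows.
    model.get_input_embeddings().weight.data[old_in.size(0):] = old_in.mean(dim=0)
    model.get_output_embeddings().weight.data[old_out.size(0):] = old_out.mean(dim=0)

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
extend_vocab_mean_init(model, 69104)  # 32,000 original rows + 37,104 mean-initialised rows
```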
From 929c32e37fd97220ffd6df0f0e7175d4a933ce9c Mon Sep 17 00:00:00 2001
From: Frank Lee
Date: Wed, 10 Jan 2024 22:34:16 +0800
Subject: [PATCH 04/33] [workflow] fixed build CI (#5240)
* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish
---
.github/workflows/build_on_pr.yml | 136 +++---------------
.github/workflows/build_on_schedule.yml | 15 +-
.github/workflows/doc_test_on_schedule.yml | 2 +-
tests/kit/model_zoo/__init__.py | 32 ++++-
tests/kit/model_zoo/registry.py | 17 ++-
.../test_plugin/test_gemini_plugin.py | 4 +-
.../test_plugin/test_low_level_zero_plugin.py | 9 +-
.../test_plugin/test_torch_ddp_plugin.py | 9 +-
.../test_plugin/test_torch_fsdp_plugin.py | 9 +-
.../test_gemini_checkpoint_io.py | 14 +-
.../test_gemini_torch_compability.py | 2 +-
...st_hybrid_parallel_plugin_checkpoint_io.py | 2 +-
.../test_plugins_huggingface_compatibility.py | 2 +-
tests/test_lazy/test_models.py | 4 +-
14 files changed, 101 insertions(+), 156 deletions(-)
diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 8eb358c4f42c..50417ac8a3a0 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -22,57 +22,6 @@ on:
delete:
jobs:
- prepare_cache:
- name: Prepare testmon cache
- if: |
- github.event_name == 'create' &&
- github.event.ref_type == 'branch' &&
- github.event.repository.full_name == 'hpcaitech/ColossalAI'
- runs-on: [self-hosted, gpu]
- container:
- image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
- options: --rm
- timeout-minutes: 5
- defaults:
- run:
- shell: bash
- steps:
- - name: Copy testmon cache
- run: | # branch name may contain slash, we need to replace it with space
- export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /")
- if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then
- cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
- fi
- env:
- MAIN_BRANCH: ${{ github.event.master_branch }}
-
- prepare_cache_for_pr:
- name: Prepare testmon cache for PR
- if: |
- github.event_name == 'pull_request' &&
- (github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) &&
- github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
- runs-on: [self-hosted, gpu]
- container:
- image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
- options: --rm
- timeout-minutes: 5
- defaults:
- run:
- shell: bash
- concurrency:
- group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-repare-cache
- cancel-in-progress: true
- steps:
- - name: Copy testmon cache
- run: | # branch name may contain slash, we need to replace it with space
- export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
- if [ -d "/github/home/testmon_cache/${BASE}" ] && [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then
- mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} && cp -p -r "/github/home/testmon_cache/${BASE}"/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}
- fi
- env:
- PR_NUMBER: ${{ github.event.number }}
-
detect:
name: Detect file change
if: |
@@ -140,7 +89,7 @@ jobs:
if: needs.detect.outputs.anyLibraryFileChanged == 'true'
runs-on: [self-hosted, gpu]
container:
- image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+ image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
timeout-minutes: 60
defaults:
@@ -174,6 +123,7 @@ jobs:
run: |
cd TensorNVMe
cp -p -r ./build /github/home/tensornvme_cache/
+ cp -p -r ./cmake-build /github/home/tensornvme_cache/
- name: Checkout Colossal-AI
uses: actions/checkout@v2
@@ -198,31 +148,27 @@ jobs:
# -p flag is required to preserve the file timestamp to avoid ninja rebuild
cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
- - name: Restore Testmon Cache
- run: |
- if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
- cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/
- fi
- env:
- PR_NUMBER: ${{ github.event.number }}
-
- name: Execute Unit Testing
run: |
- CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/
+ CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \
+ -m "not largedist" \
+ --durations=0 \
+ --ignore tests/test_analyzer \
+ --ignore tests/test_auto_parallel \
+ --ignore tests/test_fx \
+ --ignore tests/test_autochunk \
+ --ignore tests/test_gptq \
+ --ignore tests/test_infer_ops \
+ --ignore tests/test_legacy \
+ --ignore tests/test_moe \
+ --ignore tests/test_smoothquant \
+ --ignore tests/test_checkpoint_io \
+ tests/
env:
- DATA: /data/scratch/cifar-10
NCCL_SHM_DISABLE: 1
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
- TESTMON_CORE_PKGS: /__w/ColossalAI/ColossalAI/requirements/requirements.txt,/__w/ColossalAI/ColossalAI/requirements/requirements-test.txt
LLAMA_PATH: /data/scratch/llama-tiny
- - name: Store Testmon Cache
- run: |
- mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER}
- cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/
- env:
- PR_NUMBER: ${{ github.event.number }}
-
- name: Collate artifact
env:
PR_NUMBER: ${{ github.event.number }}
@@ -260,53 +206,3 @@ jobs:
name: report
path: report/
- store_cache:
- name: Store testmon cache for PR
- if: |
- github.event_name == 'pull_request' &&
- github.event.action == 'closed' &&
- github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
- runs-on: [self-hosted, gpu]
- container:
- image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
- options: --rm
- timeout-minutes: 5
- defaults:
- run:
- shell: bash
- steps:
- - name: Store testmon cache if possible
- if: github.event.pull_request.merged == true
- run: | # branch name may contain slash, we need to replace it with space
- export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
- if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
- cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
- fi
- env:
- PR_NUMBER: ${{ github.event.pull_request.number }}
-
- - name: Remove testmon cache
- run: |
- rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER}
- env:
- PR_NUMBER: ${{ github.event.pull_request.number }}
-
- remove_cache:
- name: Remove testmon cache
- if: |
- github.event_name == 'delete' &&
- github.event.ref_type == 'branch' &&
- github.event.repository.full_name == 'hpcaitech/ColossalAI'
- runs-on: [self-hosted, gpu]
- container:
- image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
- options: --rm
- timeout-minutes: 5
- defaults:
- run:
- shell: bash
- steps:
- - name: Remove testmon cache
- run: | # branch name may contain slash, we need to replace it with space
- export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /")
- rm -rf "/github/home/testmon_cache/${BASE}"
diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index e5afe9622931..3bee3b4f96e2 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -10,20 +10,20 @@ jobs:
build:
name: Build and Test Colossal-AI
if: github.repository == 'hpcaitech/ColossalAI'
- runs-on: [self-hosted, 8-gpu]
+ runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
- timeout-minutes: 40
+ timeout-minutes: 90
steps:
- name: Check GPU Availability # ensure all GPUs have enough memory
id: check-avai
run: |
avai=true
- for i in $(seq 0 7);
+ for i in $(seq 0 3);
do
gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
- [ "$gpu_used" -gt "10000" ] && avai=false
+ [ "$gpu_used" -gt "2000" ] && avai=false
done
echo "GPU is available: $avai"
@@ -60,9 +60,12 @@ jobs:
- name: Unit Testing
if: steps.check-avai.outputs.avai == 'true'
run: |
- PYTHONPATH=$PWD pytest --durations=0 tests
+ PYTHONPATH=$PWD pytest \
+ -m "not largedist" \
+ --durations=0 \
+ tests/
env:
- DATA: /data/scratch/cifar-10
+ NCCL_SHM_DISABLE: 1
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
LLAMA_PATH: /data/scratch/llama-tiny
diff --git a/.github/workflows/doc_test_on_schedule.yml b/.github/workflows/doc_test_on_schedule.yml
index b4c77674746f..b3536184d78a 100644
--- a/.github/workflows/doc_test_on_schedule.yml
+++ b/.github/workflows/doc_test_on_schedule.yml
@@ -12,7 +12,7 @@ jobs:
name: Test the changed Doc
runs-on: [self-hosted, gpu]
container:
- image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+ image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm
timeout-minutes: 60
steps:
diff --git a/tests/kit/model_zoo/__init__.py b/tests/kit/model_zoo/__init__.py
index 62b9123b59b0..5f6789ff3357 100644
--- a/tests/kit/model_zoo/__init__.py
+++ b/tests/kit/model_zoo/__init__.py
@@ -1,5 +1,33 @@
-from . import custom, diffusers, timm, torchaudio, torchrec, torchvision, transformers
+import os
+from . import custom, diffusers, timm, torchaudio, torchvision, transformers
from .executor import run_fwd, run_fwd_bwd
from .registry import model_zoo
-__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd"]
+# We pick a subset of models for fast testing in order to reduce the total testing time
+COMMON_MODELS = [
+ 'custom_hanging_param_model',
+ 'custom_nested_model',
+ 'custom_repeated_computed_layers',
+ 'custom_simple_net',
+ 'diffusers_clip_text_model',
+ 'diffusers_auto_encoder_kl',
+ 'diffusers_unet2d_model',
+ 'timm_densenet',
+ 'timm_resnet',
+ 'timm_swin_transformer',
+ 'torchaudio_wav2vec2_base',
+ 'torchaudio_conformer',
+ 'transformers_bert_for_masked_lm',
+ 'transformers_bloom_for_causal_lm',
+ 'transformers_falcon_for_causal_lm',
+ 'transformers_chatglm_for_conditional_generation',
+ 'transformers_llama_for_casual_lm',
+ 'transformers_vit_for_masked_image_modeling',
+ 'transformers_mistral_for_casual_lm'
+]
+
+IS_FAST_TEST = os.environ.get('FAST_TEST', '0') == '1'
+
+
+__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd", 'COMMON_MODELS', 'IS_FAST_TEST']
+
diff --git a/tests/kit/model_zoo/registry.py b/tests/kit/model_zoo/registry.py
index bb522778bb5d..44a0adc6a3af 100644
--- a/tests/kit/model_zoo/registry.py
+++ b/tests/kit/model_zoo/registry.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
from dataclasses import dataclass
-from typing import Callable
+from typing import Callable, List, Union
__all__ = ["ModelZooRegistry", "ModelAttribute", "model_zoo"]
@@ -61,7 +61,7 @@ def register(
"""
self[name] = (model_fn, data_gen_fn, output_transform_fn, loss_fn, model_attribute)
- def get_sub_registry(self, keyword: str):
+ def get_sub_registry(self, keyword: Union[str, List[str]]):
"""
Get a sub registry with models that contain the keyword.
@@ -70,12 +70,15 @@ def get_sub_registry(self, keyword: str):
"""
new_dict = dict()
+ if isinstance(keyword, str):
+ keyword_list = [keyword]
+ else:
+ keyword_list = keyword
+ assert isinstance(keyword_list, (list, tuple))
+
for k, v in self.items():
- if keyword == "transformers_gpt":
- if keyword in k and not "gptj" in k: # ensure GPT2 does not retrieve GPTJ models
- new_dict[k] = v
- else:
- if keyword in k:
+ for kw in keyword_list:
+ if kw in k:
new_dict[k] = v
assert len(new_dict) > 0, f"No model found with keyword {keyword}"
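Taken together, the two changes above give the CI its fast path: `get_sub_registry` now accepts a list of keywords, and `FAST_TEST=1` (exported in the updated workflow) flips `IS_FAST_TEST`. The pattern each test file below adopts reduces to:

```python
from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo

# Run against the curated subset when FAST_TEST=1, else the full model zoo.
registry = model_zoo.get_sub_registry(COMMON_MODELS) if IS_FAST_TEST else model_zoo

for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
    run_fn(model_fn, data_gen_fn, output_transform_fn)  # per-plugin check, as in the tests below
```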
diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py
index d4205e1f9d73..3462d5dde52b 100644
--- a/tests/test_booster/test_plugin/test_gemini_plugin.py
+++ b/tests/test_booster/test_plugin/test_gemini_plugin.py
@@ -13,7 +13,7 @@
from colossalai.nn.optimizer import HybridAdam
from colossalai.tensor.colo_parameter import ColoParameter
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, COMMON_MODELS, IS_FAST_TEST
def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size) -> Optional[str]:
@@ -66,7 +66,7 @@ def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, t
# @parameterize('init_method', ['lazy', 'none', 'colo'])
-@parameterize("subset", ["torchvision", "transformers", "diffusers"])
+@parameterize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "transformers", "diffusers"])
@parameterize("init_method", ["none"])
@parameterize("zero_size", [2])
@parameterize("tp_size", [2])
diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
index 3eaaf882c9ba..bcdcc1470e6c 100644
--- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
+++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
@@ -11,7 +11,7 @@
# from colossalai.nn.optimizer import HybridAdam
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
# These models are not compatible with AMP
_AMP_ERR_MODELS = ["timm_convit", "deepfm_interactionarch"]
@@ -62,7 +62,12 @@ def check_low_level_zero_plugin(stage: int, early_stop: bool = True):
ignore_models = _AMP_ERR_MODELS + _LOW_LEVEL_ZERO_ERR_MODELS + _STUCK_MODELS
skipped_models = []
- for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
+ if IS_FAST_TEST:
+ registry = model_zoo.get_sub_registry(COMMON_MODELS)
+ else:
+ registry = model_zoo
+
+ for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
# FIXME(ver217): fix these models
if name in ignore_models:
skipped_models.append(name)
diff --git a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
index 1a7ca6f2a30c..fa32feb2ff85 100644
--- a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
+++ b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
@@ -11,7 +11,7 @@
from colossalai.booster.plugin import TorchDDPPlugin
from colossalai.interface import OptimizerWrapper
from colossalai.testing import rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
def run_fn(model_fn, data_gen_fn, output_transform_fn):
@@ -40,7 +40,12 @@ def run_fn(model_fn, data_gen_fn, output_transform_fn):
def check_torch_ddp_plugin():
- for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
+ if IS_FAST_TEST:
+ registry = model_zoo.get_sub_registry(COMMON_MODELS)
+ else:
+ registry = model_zoo
+
+ for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
if name == "dlrm_interactionarch":
continue
run_fn(model_fn, data_gen_fn, output_transform_fn)
diff --git a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
index 8bcbffdd06fe..8a14d7cf872d 100644
--- a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
+++ b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
@@ -12,7 +12,7 @@
from colossalai.interface import OptimizerWrapper
from colossalai.testing import rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
# test basic fsdp function
@@ -42,7 +42,12 @@ def run_fn(model_fn, data_gen_fn, output_transform_fn):
def check_torch_fsdp_plugin():
- for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
+ if IS_FAST_TEST:
+ registry = model_zoo.get_sub_registry(COMMON_MODELS)
+ else:
+ registry = model_zoo
+
+ for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
if any(
element in name
for element in [
diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
index 8343c5f07e30..49fd85ffba0a 100644
--- a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
@@ -7,6 +7,7 @@
from utils import shared_tempdir
import colossalai
+from colossalai.testing import skip_if_not_enough_gpus
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin
from colossalai.lazy import LazyInitContext
@@ -68,7 +69,7 @@ def exam_state_dict_with_origin(placement_config, model_name, use_safetensors: b
@clear_cache_before_run()
@parameterize("placement_config", OPTIM_PLACEMENT_CONFIGS)
@parameterize("shard", [True, False])
-@parameterize("model_name", ["transformers_gpt"])
+@parameterize("model_name", ["transformers_llama_for_casual_lm"])
@parameterize("size_per_shard", [32])
@parameterize("tp_size", [1, 2])
@parameterize("zero_size", [2])
@@ -156,13 +157,12 @@ def run_dist(rank, world_size, port):
@pytest.mark.dist
-@pytest.mark.parametrize("world_size", [4])
@rerun_if_address_is_in_use()
-def test_gemini_ckpIO(world_size):
- spawn(run_dist, world_size)
+def test_gemini_ckpIO():
+ spawn(run_dist, 4)
@pytest.mark.largedist
-@pytest.mark.parametrize("world_size", [8])
+@skip_if_not_enough_gpus(min_gpus=8)
@rerun_if_address_is_in_use()
-def test_gemini_ckpIO_3d(world_size):
- spawn(run_dist, world_size)
\ No newline at end of file
+def test_gemini_ckpIO_3d():
+ spawn(run_dist, 8)
\ No newline at end of file
diff --git a/tests/test_checkpoint_io/test_gemini_torch_compability.py b/tests/test_checkpoint_io/test_gemini_torch_compability.py
index bb7a60035e02..44a000113629 100644
--- a/tests/test_checkpoint_io/test_gemini_torch_compability.py
+++ b/tests/test_checkpoint_io/test_gemini_torch_compability.py
@@ -20,7 +20,7 @@
@clear_cache_before_run()
@parameterize("shard", [False, True])
-@parameterize("model_name", ["transformers_gpt"])
+@parameterize("model_name", ["transformers_llama_for_casual_lm"])
def exam_torch_load_from_gemini(shard: bool, model_name: str):
(model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
criterion = lambda x: x.mean()
diff --git a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
index c0bc2d2f5d0a..db3c56da874d 100644
--- a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
@@ -40,7 +40,7 @@
@clear_cache_before_run()
@parameterize("shard", [True, False])
-@parameterize("model_name", ["transformers_gpt"])
+@parameterize("model_name", ["transformers_llama_for_casual_lm"])
@parameterize("size_per_shard", [32])
@parameterize("test_config", TEST_CONFIGS)
def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_config: dict):
diff --git a/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py b/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py
index a6f67e0d7729..0353ff115840 100644
--- a/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py
+++ b/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py
@@ -18,7 +18,7 @@
@clear_cache_before_run()
-@parameterize("model_name", ["transformers_gpt"])
+@parameterize("model_name", ["transformers_llama_for_casual_lm"])
@parameterize("plugin_type", ["ddp", "zero", "gemini"])
def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per_shard=32):
(model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next(
diff --git a/tests/test_lazy/test_models.py b/tests/test_lazy/test_models.py
index a1b5763d4cd8..ee50e5b61009 100644
--- a/tests/test_lazy/test_models.py
+++ b/tests/test_lazy/test_models.py
@@ -1,11 +1,11 @@
import pytest
from lazy_init_utils import SUPPORT_LAZY, check_lazy_init
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
@pytest.mark.skipif(not SUPPORT_LAZY, reason="requires torch >= 1.12.0")
-@pytest.mark.parametrize("subset", ["torchvision", "diffusers", "timm", "transformers", "torchaudio", "deepfm", "dlrm"])
+@pytest.mark.parametrize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "diffusers", "timm", "transformers", "torchaudio", "deepfm", "dlrm"])
@pytest.mark.parametrize("default_device", ["cpu", "cuda"])
def test_torchvision_models_lazy_init(subset, default_device):
sub_model_zoo = model_zoo.get_sub_registry(subset)
From 03c6112b5e1c855a89179389a443aac1ab900bd7 Mon Sep 17 00:00:00 2001
From: Frank Lee
Date: Thu, 11 Jan 2024 16:04:45 +0800
Subject: [PATCH 05/33] [ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
---
.github/workflows/build_on_pr.yml | 4 +---
.github/workflows/build_on_schedule.yml | 9 +++++----
tests/kit/model_zoo/transformers/chatglm2.py | 1 -
tests/test_booster/test_plugin/test_3d_plugin.py | 4 ++--
tests/test_booster/test_plugin/test_gemini_plugin.py | 8 ++++----
5 files changed, 12 insertions(+), 14 deletions(-)
diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 50417ac8a3a0..54e8a6d93487 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -90,7 +90,7 @@ jobs:
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
- options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+ options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
timeout-minutes: 60
defaults:
run:
@@ -165,7 +165,6 @@ jobs:
--ignore tests/test_checkpoint_io \
tests/
env:
- NCCL_SHM_DISABLE: 1
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
LLAMA_PATH: /data/scratch/llama-tiny
@@ -205,4 +204,3 @@ jobs:
with:
name: report
path: report/
-
diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index 3bee3b4f96e2..5b0103eb770d 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -13,15 +13,16 @@ jobs:
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
- options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+ options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
timeout-minutes: 90
steps:
- name: Check GPU Availability # ensure all GPUs have enough memory
id: check-avai
run: |
avai=true
- for i in $(seq 0 3);
- do
+ ngpu=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+ endIndex=$(($ngpu-1))
+ for i in $(seq 0 $endIndex);
gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
[ "$gpu_used" -gt "2000" ] && avai=false
done
@@ -74,7 +75,7 @@ jobs:
if: ${{ failure() }}
run: |
url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
- msg="Scheduled Build and Test failed on 8 GPUs, please visit $url for details"
+ msg="Scheduled Build and Test failed, please visit $url for details"
echo $msg
python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
env:
diff --git a/tests/kit/model_zoo/transformers/chatglm2.py b/tests/kit/model_zoo/transformers/chatglm2.py
index 0b178d58ce33..e27fdb4e2efe 100644
--- a/tests/kit/model_zoo/transformers/chatglm2.py
+++ b/tests/kit/model_zoo/transformers/chatglm2.py
@@ -2,7 +2,6 @@
from colossalai.shardformer.modeling.chatglm2_6b.configuration_chatglm import ChatGLMConfig
from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration, ChatGLMModel
-
from ..registry import ModelAttribute, model_zoo
# ================================
diff --git a/tests/test_booster/test_plugin/test_3d_plugin.py b/tests/test_booster/test_plugin/test_3d_plugin.py
index ad878fb0c86a..e724d7359c54 100644
--- a/tests/test_booster/test_plugin/test_3d_plugin.py
+++ b/tests/test_booster/test_plugin/test_3d_plugin.py
@@ -10,10 +10,11 @@
from colossalai.fx import is_compatible_with_meta
from colossalai.lazy.lazy_init import LazyInitContext
from colossalai.nn.optimizer import HybridAdam
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
from tests.kit.model_zoo import model_zoo
+@clear_cache_before_run()
def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]:
try:
if init_method == "lazy":
@@ -69,7 +70,6 @@ def check_3d_plugin(init_method: str = "none", early_stop: bool = True):
"transformers_llama_for_casual_lm"
).items():
err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn)
- torch.cuda.empty_cache()
if err is None:
passed_models.append(name)
diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py
index 3462d5dde52b..9952e41e5b13 100644
--- a/tests/test_booster/test_plugin/test_gemini_plugin.py
+++ b/tests/test_booster/test_plugin/test_gemini_plugin.py
@@ -12,10 +12,11 @@
from colossalai.lazy.lazy_init import LazyInitContext
from colossalai.nn.optimizer import HybridAdam
from colossalai.tensor.colo_parameter import ColoParameter
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo, COMMON_MODELS, IS_FAST_TEST
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
+from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
+@clear_cache_before_run()
def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size) -> Optional[str]:
try:
if init_method == "lazy":
@@ -116,7 +117,7 @@ def check_gemini_plugin(
"transformers_falcon_for_sequence_classification",
"transformers_falcon_for_token_classification",
"transformers_falcon_for_question_answering",
- "transformers_gptj_lm", # lead to OOM when running in ci
+ "transformers_gptj_lm", # lead to OOM when running in ci
"transformers_gptj_for_question_answering",
"transformers_gptj_for_sequence_classification",
]:
@@ -145,7 +146,6 @@ def check_gemini_plugin(
tp_size = 1
err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size)
- torch.cuda.empty_cache()
if err is None:
passed_models.append(name)
else:
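The inline `torch.cuda.empty_cache()` calls removed above are replaced by the `clear_cache_before_run` decorator. As a rough illustration of why the two are equivalent, a hypothetical sketch of such a decorator (the actual `colossalai.testing` implementation may differ):

```python
import functools

import torch


# Hypothetical sketch of a cache-clearing decorator; shown only to illustrate
# why the inline empty_cache() calls above become redundant.
def clear_cache_before_run():
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if torch.cuda.is_available():
                torch.cuda.empty_cache()  # release cached CUDA blocks first
            return func(*args, **kwargs)

        return wrapper

    return decorator
```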
From 482f1eaf98b89460f3f8d0a9c0114abdb126394e Mon Sep 17 00:00:00 2001
From: Frank Lee
Date: Thu, 11 Jan 2024 17:16:32 +0800
Subject: [PATCH 06/33] [ci] fixed ddp test (#5254)
* [ci] fixed ddp test
* polish
---
tests/kit/model_zoo/registry.py | 18 ++++++++++++++++--
tests/test_shardformer/test_with_torch_ddp.py | 2 +-
2 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/tests/kit/model_zoo/registry.py b/tests/kit/model_zoo/registry.py
index 44a0adc6a3af..5e8e0b3822df 100644
--- a/tests/kit/model_zoo/registry.py
+++ b/tests/kit/model_zoo/registry.py
@@ -61,7 +61,7 @@ def register(
"""
self[name] = (model_fn, data_gen_fn, output_transform_fn, loss_fn, model_attribute)
- def get_sub_registry(self, keyword: Union[str, List[str]]):
+ def get_sub_registry(self, keyword: Union[str, List[str]], exclude: Union[str, List[str]] = None):
"""
Get a sub registry with models that contain the keyword.
@@ -76,10 +76,24 @@ def get_sub_registry(self, keyword: Union[str, List[str]]):
keyword_list = keyword
assert isinstance(keyword_list, (list, tuple))
+ if exclude is None:
+ exclude_keywords = []
+ elif isinstance(exclude, str):
+ exclude_keywords = [exclude]
+ else:
+ exclude_keywords = exclude
+ assert isinstance(exclude_keywords, (list, tuple))
+
for k, v in self.items():
for kw in keyword_list:
if kw in k:
- new_dict[k] = v
+ should_exclude = False
+ for ex_kw in exclude_keywords:
+ if ex_kw in k:
+ should_exclude = True
+
+ if not should_exclude:
+ new_dict[k] = v
assert len(new_dict) > 0, f"No model found with keyword {keyword}"
return new_dict
diff --git a/tests/test_shardformer/test_with_torch_ddp.py b/tests/test_shardformer/test_with_torch_ddp.py
index f642a9dcada4..4b741c21b48c 100644
--- a/tests/test_shardformer/test_with_torch_ddp.py
+++ b/tests/test_shardformer/test_with_torch_ddp.py
@@ -16,7 +16,7 @@
@parameterize("lazy_init", [True, False])
def check_shardformer_with_ddp(lazy_init: bool):
- sub_model_zoo = model_zoo.get_sub_registry("transformers_gpt")
+ sub_model_zoo = model_zoo.get_sub_registry("transformers_gpt", exclude="transformers_gptj")
# create shardformer
# ranks: [0, 1, 2, 3]
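The new `exclude` argument filters matched entries out of the sub-registry, which is what lets the DDP test keep the GPT family while dropping GPT-J. A short usage sketch mirroring the test code above:

```python
from tests.kit.model_zoo import model_zoo

# Select every GPT-family entry except the GPT-J variants.
sub_model_zoo = model_zoo.get_sub_registry("transformers_gpt", exclude="transformers_gptj")
for name in sub_model_zoo:
    print(name)
```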
From 907ee2a8c7eef8e033f77a86dbc693a82f805d31 Mon Sep 17 00:00:00 2001
From: digger yu
Date: Thu, 11 Jan 2024 17:58:38 +0800
Subject: [PATCH 07/33] fix typo in applications/ColossalEval/README.md
(#5250)
---
applications/ColossalEval/README.md | 18 ++++++++++--------
1 file changed, 10 insertions(+), 8 deletions(-)
diff --git a/applications/ColossalEval/README.md b/applications/ColossalEval/README.md
index 0cd47d68d2b2..a1a76f750fb9 100644
--- a/applications/ColossalEval/README.md
+++ b/applications/ColossalEval/README.md
@@ -9,6 +9,8 @@
- [Table of Contents](#table-of-contents)
- [Overview](#overview)
- [Leaderboard](#leaderboard)
+ - [Model with ~13 Billion Parameters](#model-with-13-billion-parameters)
+ - [Model with ~7 Billion Parameters](#model-with-7-billion-parameters)
- [Install](#install)
- [Evaluation Process](#evaluation-process)
- [Inference](#inference)
@@ -179,7 +181,7 @@ A data sample basically follow the format of Alpaca. It should contain the follo
* `dataset` (str, compulsory): The name of the dataset.
* `split` (str, compulsory): The split of the instruction.
-* `catrgory` (str, compulsory): The category of the instruction.
+* `category` (str, compulsory): The category of the instruction.
* `instruction` (str, compulsory): The instruction for the LLM.
* `input` (str, optional): The additional context of the instruction.
* `output` (str, optional): The model output of the instruction.
@@ -392,18 +394,18 @@ To make it more easier to set the config, you only need to specify all metrics y
- `combined_single_choice_accuracy`: A combination of `first_token_logit` and `single_choice_accuracy`. If one of these is correct, the model will get the score. It can be used in all dataset that contains single-choice questions.
- `first_token_logit`: Calculate score based on softmax score over the given choices. If the argmax of the softmax is equal to the reference, the model will get the score. If there is `NaN` in softmax score, it will calculate the score using exact match. It can be used in all dataset that contains single-choice questions.
-- `single_choice_accuracy`: Calculate score using exact match. It will only get the first uppercase letter such as A, B, C or D that is not surrouded by lowercase letters. If the uppercase letter is equal to the reference, the model will get the score. It can be used in all dataset that contains single-choice questions.
-- `multi_choice_accuracy`: Calculate score on multi-choice questions. It will get a set of all uppercase letters such as A, B, C or D that is not surrouded by lowercase letters. If the prediction conatains uppercase letters that are not in reference. The model will get 0 score. If the prediction contains a uppercase letter that is in reference, the model will get a score of `1/len(reference)`. It is used in AGIEval and GAOKAO-Bench.
+- `single_choice_accuracy`: Calculate score using exact match. It will only get the first uppercase letter such as A, B, C or D that is not surrounded by lowercase letters. If the uppercase letter is equal to the reference, the model will get the score. It can be used in all datasets that contain single-choice questions.
+- `multi_choice_accuracy`: Calculate score on multi-choice questions. It will get a set of all uppercase letters such as A, B, C or D that are not surrounded by lowercase letters. If the prediction contains uppercase letters that are not in the reference, the model will get a score of 0. If the prediction contains an uppercase letter that is in the reference, the model will get a score of `1/len(reference)`. It is used in AGIEval and GAOKAO-Bench.
- `math_equivalence`: Code from [hendrycks](https://github.com/hendrycks/math/blob/main/modeling/math_equivalence.py). Compute scores over the prediction math formula and reference math formula. It is used in AGIEval and GAOKAO-Bench.
- `f1_score`: Calculate English f1 score between prediction and reference. It is used in Longbench.
- `f1_zh_score`: Calculate Chinese f1 score between prediction and reference. It is used in Longbench.
- `rouge_score`: Calculate English f1 score between prediction and reference. It is used in GAOKAO-Bench and LongBench.
- `rouge_zh_score`: Calculate Chinese rouge score between prediction and reference. It is used in GAOKAO-Bench and LongBench.
-- `retrieval_score`: Calculate English retrieval score between prediction and reference. It determines whether the ouput(which paragraph) corresponds to the given abstract. It is used in Longbench.
-- `retrieval_zh_score`: Calculate Chinese retrieval score between prediction and reference. It determines whether the ouput(which paragraph) corresponds to the given abstract. It is used in Longbench.
-- `classification_score`: Calculate classification score between prediction and reference. It determines whether the ouput(a class) is equal to the reference. It is used in Longbench.
+- `retrieval_score`: Calculate English retrieval score between prediction and reference. It determines whether the output (which paragraph) corresponds to the given abstract. It is used in Longbench.
+- `retrieval_zh_score`: Calculate Chinese retrieval score between prediction and reference. It determines whether the output (which paragraph) corresponds to the given abstract. It is used in Longbench.
+- `classification_score`: Calculate classification score between prediction and reference. It determines whether the output (a class) is equal to the reference. It is used in Longbench.
- `code_sim_score`: Calculate similarity score between prediction and reference. It is used in Longbench.
-- `count_score`: Calculate count score between prediction and reference. It determines whether the ouput(number of given passages) is equal to the reference. It is used in Longbench.
+- `count_score`: Calculate count score between prediction and reference. It determines whether the output (number of given passages) is equal to the reference. It is used in Longbench.
- `gsm_accuracy`: Calculate scores between prediction and reference.. It is used in GSM8K.
- `perplexity`: Calculate perplexity. The formula is $ perplexity = \frac{1}{n} \sum_i e^{loss_i} $ where $n$ is the number of samples and $ loss_i $ is the average loss for sample $ i $. It can be used in all dataset.
- `ppl_score`: Calculate perplexity score. The formula is $ ppl\_score = \frac{1}{n} \sum_i e^{-loss_i} $ where $n$ is the number of samples and $ loss_i $ is the average loss for sample $ i $. It can be used in all dataset.
@@ -455,7 +457,7 @@ def CustomizedMetric(prediction: str, reference: str):
return score
```
-Once you have successfully added your own metric, you should specify your metric both in `colossal_eval/evaluate/dataset_evaluator/metric.py` (suggest which subcategories shoule the metric be applied to) and your evaluation config.
+Once you have successfully added your own metric, you should specify your metric both in `colossal_eval/evaluate/dataset_evaluator/metric.py` (suggest which subcategories the metric should be applied to) and your evaluation config.
### How to Add a New Dataset?
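To make the two perplexity-style formulas above concrete, a small worked example with illustrative per-sample losses:

```python
import math

# Illustrative average losses for three samples.
losses = [2.0, 1.5, 2.5]
n = len(losses)

perplexity = sum(math.exp(loss) for loss in losses) / n  # (1/n) * sum_i e^{loss_i}
ppl_score = sum(math.exp(-loss) for loss in losses) / n  # (1/n) * sum_i e^{-loss_i}

print(f"perplexity = {perplexity:.4f}, ppl_score = {ppl_score:.4f}")
```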
From 54aca87252a2c8ae5ab8e11218738f2514c8803e Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Thu, 11 Jan 2024 19:07:45 +0800
Subject: [PATCH 08/33] [ci] fix shardformer tests. (#5255)
* fix ci
fix
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
---------
Co-authored-by: Wenhao Chen
---
colossalai/booster/plugin/hybrid_parallel_plugin.py | 8 +++++++-
tests/test_shardformer/test_model/test_shard_gpt2.py | 4 ++--
tests/test_shardformer/test_model/test_shard_t5.py | 6 ++++++
tests/test_shardformer/test_model/test_shard_whisper.py | 5 +++++
4 files changed, 20 insertions(+), 3 deletions(-)
diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 205660f946e9..8ee1e97c6ce3 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -919,6 +919,7 @@ class HybridParallelPlugin(PipelinePluginBase):
custom_policy (Policy, optional): Custom policy for Shardformer. Defaults to None.
pp_style (str, optional): The style for pipeline parallelism. Defaults to '1f1b'.
num_model_chunks (int, optional): The number of model chunks for interleaved pipeline parallelism. Defaults to 1.
+ enable_metadata_cache (bool, optional): Whether to enable metadata cache for pipeline parallelism. Defaults to True.
"""
def __init__(
@@ -956,6 +957,7 @@ def __init__(
custom_policy: Policy = None,
pp_style: str = "1f1b",
num_model_chunks: int = 1,
+ enable_metadata_cache: bool = True,
) -> None:
super().__init__()
assert (
@@ -1002,10 +1004,14 @@ def __init__(
num_model_chunks=num_model_chunks,
num_microbatch=num_microbatches,
microbatch_size=microbatch_size,
+ enable_metadata_cache=enable_metadata_cache,
)
elif pp_style == "1f1b":
self.schedule = OneForwardOneBackwardSchedule(
- self.stage_manager, num_microbatches=num_microbatches, microbatch_size=microbatch_size
+ stage_manager=self.stage_manager,
+ num_microbatches=num_microbatches,
+ microbatch_size=microbatch_size,
+ enable_metadata_cache=enable_metadata_cache,
)
else:
raise NotImplementedError()
diff --git a/tests/test_shardformer/test_model/test_shard_gpt2.py b/tests/test_shardformer/test_model/test_shard_gpt2.py
index 66b30641acc8..3155420f1cf2 100644
--- a/tests/test_shardformer/test_model/test_shard_gpt2.py
+++ b/tests/test_shardformer/test_model/test_shard_gpt2.py
@@ -165,7 +165,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
)
@clear_cache_before_run()
def run_gpt2_test(test_config):
- sub_model_zoo = model_zoo.get_sub_registry("transformers_gpt")
+ sub_model_zoo = model_zoo.get_sub_registry("transformers_gpt", exclude="transformers_gptj")
for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
@@ -200,7 +200,7 @@ def run_gpt2_test(test_config):
)
@clear_cache_before_run()
def run_gpt2_3d_test(test_config):
- sub_model_zoo = model_zoo.get_sub_registry("transformers_gpt")
+ sub_model_zoo = model_zoo.get_sub_registry("transformers_gpt", exclude="transformers_gptj")
for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
diff --git a/tests/test_shardformer/test_model/test_shard_t5.py b/tests/test_shardformer/test_model/test_shard_t5.py
index 73f203d1f023..22c201458ad4 100644
--- a/tests/test_shardformer/test_model/test_shard_t5.py
+++ b/tests/test_shardformer/test_model/test_shard_t5.py
@@ -86,6 +86,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 2,
"pp_size": 2,
"num_microbatches": 2,
+ "enable_metadata_cache": False,
"enable_all_optimization": True,
"use_lazy_init": True,
"precision": "fp16",
@@ -95,6 +96,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 1,
"pp_size": 2,
"num_microbatches": 4,
+ "enable_metadata_cache": False,
"use_lazy_init": False,
"precision": "fp16",
"initial_scale": 1,
@@ -110,6 +112,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 1,
"pp_size": 4,
"num_microbatches": 4,
+ "enable_metadata_cache": False,
"enable_all_optimization": False,
"use_lazy_init": False,
"precision": "fp32",
@@ -128,6 +131,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 1,
"pp_size": 2,
"num_microbatches": 2,
+ "enable_metadata_cache": False,
"enable_all_optimization": True,
"use_lazy_init": True,
"zero_stage": 1,
@@ -159,6 +163,7 @@ def run_t5_test(test_config):
"tp_size": 2,
"pp_size": 2,
"num_microbatches": 4,
+ "enable_metadata_cache": False,
"enable_all_optimization": False,
"use_lazy_init": False,
"precision": "fp32",
@@ -168,6 +173,7 @@ def run_t5_test(test_config):
"tp_size": 2,
"pp_size": 2,
"num_microbatches": 4,
+ "enable_metadata_cache": False,
"enable_all_optimization": False,
"use_lazy_init": False,
"precision": "fp16",
diff --git a/tests/test_shardformer/test_model/test_shard_whisper.py b/tests/test_shardformer/test_model/test_shard_whisper.py
index f839bd84ab69..6efb8a922f85 100644
--- a/tests/test_shardformer/test_model/test_shard_whisper.py
+++ b/tests/test_shardformer/test_model/test_shard_whisper.py
@@ -114,6 +114,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 2,
"pp_size": 2,
"num_microbatches": 2,
+ "enable_metadata_cache": False,
"enable_all_optimization": True,
"use_lazy_init": True,
"precision": "fp32",
@@ -123,6 +124,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 1,
"pp_size": 2,
"num_microbatches": 4,
+ "enable_metadata_cache": False,
"use_lazy_init": False,
"precision": "fp32",
"initial_scale": 1,
@@ -138,6 +140,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 1,
"pp_size": 4,
"num_microbatches": 4,
+ "enable_metadata_cache": False,
"use_lazy_init": False,
"precision": "fp32",
},
@@ -163,6 +166,7 @@ def run_whisper_test(test_config):
"tp_size": 2,
"pp_size": 2,
"num_microbatches": 4,
+ "enable_metadata_cache": False,
"enable_all_optimization": False,
"use_lazy_init": False,
"precision": "fp32",
@@ -172,6 +176,7 @@ def run_whisper_test(test_config):
"tp_size": 2,
"pp_size": 2,
"num_microbatches": 2,
+ "enable_metadata_cache": False,
"enable_all_optimization": False,
"use_lazy_init": False,
"precision": "fp32",
From 1b53824ccf3521499bead13e49fe4498fbc24a1a Mon Sep 17 00:00:00 2001
From: binmakeswell
Date: Thu, 11 Jan 2024 21:01:11 +0800
Subject: [PATCH 09/33] [doc] fix doc typo (#5256)
* [doc] fix annotation display
* [doc] fix llama2 doc
---
colossalai/shardformer/README.md | 26 +++++++++++++-------------
examples/language/llama2/README.md | 4 +---
2 files changed, 14 insertions(+), 16 deletions(-)
diff --git a/colossalai/shardformer/README.md b/colossalai/shardformer/README.md
index 3ce4baa64112..c8670affbd2a 100644
--- a/colossalai/shardformer/README.md
+++ b/colossalai/shardformer/README.md
@@ -116,18 +116,18 @@ We will follow this roadmap to develop Shardformer:
| model | tensor parallel | pipeline parallel | lazy initialization | xformer | flash attn2 | jit fused operator | fused layernorm | sequence parallel | overlap |
| :------: | :-----: | :-----: | :--------: | :---------: | :------: | :-----: | :-----: | :--------: | :---------: |
-| bert | [x] | [x] | [x] | [x] | [x] | [x] | [x] | [x] | [x] |
-| t5 | [x] | [x] | [x] | [x] | [x] | [x] | [x] | [ ] | [ ] |
-| llama V1/V2 | [x] | [x] | [x] | [x] | [x] | [x] | [x] | [ ] | [ ] |
-| gpt2 | [x] | [x] | [x] | [x] | [x] | [x] | [x] | [x] | [x] |
-| opt | [x] | [x] | [x] | [x] | [x] | [x] | [x] | [ ] | [ ] |
-| bloom | [x] | [x] | [x] | [x] | [x] | [x] | [x] | [x] | [x] |
-| chatglm2 | [x] | [x] | [x] | [x] | [x] | [x] | [x] | [x] | [x] |
-| vit | [x] | [x] | [ ] | [x] | [x] | [x] | [x] | [ ] | [ ] |
-| whisper | [x] | [x] | [x] | [x] | [x] | [ ] | [x] | [ ] | [ ] |
-| sam | [x] | [ ] | [ ] | [x] | [x] | [x] | [x] | [ ] | [ ] |
-| blip2 | [x] | [ ] | [ ] | [x] | [x] | [x] | [x] | [ ] | [ ] |
-| falcon | [x] | [x] | [x] | [x] | [x] | [ ] | [x] | [ ] | [ ] |
+| bert | [√] | [√] | [√] | [√] | [√] | [√] | [√] | [√] | [√] |
+| t5 | [√] | [√] | [√] | [√] | [√] | [√] | [√] | [ ] | [ ] |
+| llama V1/V2 | [√] | [√] | [√] | [√] | [√] | [√] | [√] | [ ] | [ ] |
+| gpt2 | [√] | [√] | [√] | [√] | [√] | [√] | [√] | [√] | [√] |
+| opt | [√] | [√] | [√] | [√] | [√] | [√] | [√] | [ ] | [ ] |
+| bloom | [√] | [√] | [√] | [√] | [√] | [√] | [√] | [√] | [√] |
+| chatglm2 | [√] | [√] | [√] | [√] | [√] | [√] | [√] | [√] | [√] |
+| vit | [√] | [√] | [ ] | [√] | [√] | [√] | [√] | [ ] | [ ] |
+| whisper | [√] | [√] | [√] | [√] | [√] | [ ] | [√] | [ ] | [ ] |
+| sam | [√] | [ ] | [ ] | [√] | [√] | [√] | [√] | [ ] | [ ] |
+| blip2 | [√] | [ ] | [ ] | [√] | [√] | [√] | [√] | [ ] | [ ] |
+| falcon | [√] | [√] | [√] | [√] | [√] | [ ] | [√] | [ ] | [ ] |
| roberta | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] |
| albert | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] |
| ernie | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] |
@@ -137,7 +137,7 @@ We will follow this roadmap to develop Shardformer:
| swin | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] |
| swin V2 | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] |
| qwen | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] | [ ] |
-| mistral | [x] | [ ] | [ ] | [x] | [x] | [x] | [x] | [ ] | [ ] |
+| mistral | [√] | [ ] | [ ] | [√] | [√] | [√] | [√] | [ ] | [ ] |
## 💡 API Design
diff --git a/examples/language/llama2/README.md b/examples/language/llama2/README.md
index f29b9dcddbd9..752453b5a7e3 100644
--- a/examples/language/llama2/README.md
+++ b/examples/language/llama2/README.md
@@ -6,7 +6,6 @@
- 70 billion parameter LLaMA2 model training accelerated by 195%
-[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/llama2)
[[blog]](https://www.hpc-ai.tech/blog/70b-llama2-training)
### LLaMA1
@@ -15,7 +14,6 @@
- 65-billion-parameter large model pretraining accelerated by 38%
-[[code]](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama)
[[blog]](https://www.hpc-ai.tech/blog/large-model-pretraining)
## Dataset
@@ -123,7 +121,7 @@ Here we will show an example of how to run training
llama pretraining with `gemini, batch_size=16, sequence_length=4096, gradient_checkpoint=True, flash_attn=True`.
#### a. Running environment
-This experiment was performed on 4 computing nodes with 32 A800 GPUs in total for LLaMA-1 65B. The nodes are
+This experiment was performed on 4 computing nodes with 32 A800/H800 80GB GPUs in total for LLaMA-1 65B or LLaMA-2 70B. The nodes are
connected with RDMA and GPUs within one node are fully connected with NVLink.
#### b. Running command
From 94bd340f6d86daa0c7d069ca599f58d4e4460243 Mon Sep 17 00:00:00 2001
From: Wenhao Chen
Date: Mon, 15 Jan 2024 15:57:40 +0800
Subject: [PATCH 10/33] [hotfix]: add pp sanity check and fix mbs arg (#5268)
* fix: fix misleading mbs arg
* feat: add pp sanity check
* fix: fix 1f1b sanity check
---
colossalai/pipeline/schedule/interleaved_pp.py | 4 ++++
colossalai/pipeline/schedule/one_f_one_b.py | 4 ++++
examples/language/llama2/scripts/benchmark_70B/3d.sh | 2 +-
tests/test_pipeline/test_schedule/test_oneF_oneB.py | 2 +-
4 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/colossalai/pipeline/schedule/interleaved_pp.py b/colossalai/pipeline/schedule/interleaved_pp.py
index 0a01a1e7864b..53fc43040831 100644
--- a/colossalai/pipeline/schedule/interleaved_pp.py
+++ b/colossalai/pipeline/schedule/interleaved_pp.py
@@ -72,6 +72,10 @@ def load_batch(self, data_iter: Iterable, device: Optional[torch.device] = None)
assert self.last_batch_size is None or self.last_batch_size == self.batch_size
assert self.batch_size == self.microbatch_size * self.num_microbatch
+ assert (
+ self.num_microbatch % self.stage_manager.num_stages == 0
+ ), "Number of microbatch should be an integer multiple of number of pipeline parallel devices"
+
if self.forward_only:
self.num_microbatch = (self.batch_size - 1) // self.microbatch_size + 1
# NOTE: disable metadata cache when batch size changes (not valid anymore)
diff --git a/colossalai/pipeline/schedule/one_f_one_b.py b/colossalai/pipeline/schedule/one_f_one_b.py
index cb078b25faeb..d69f28e74be9 100644
--- a/colossalai/pipeline/schedule/one_f_one_b.py
+++ b/colossalai/pipeline/schedule/one_f_one_b.py
@@ -85,6 +85,10 @@ def load_batch(self, data_iter: Iterable, device: Optional[torch.device] = None)
assert self.last_batch_size is None or self.last_batch_size == self.batch_size
assert self.batch_size == self.microbatch_size * self.num_microbatches
+ assert (
+ self.num_microbatches >= self.stage_manager.num_stages
+ ), "Number of microbatch should be larger than number of stages"
+
if self.forward_only:
self.num_microbatches = (self.batch_size - 1) // self.microbatch_size + 1
# NOTE: disable metadata cache when batch size changes (not valid anymore)
diff --git a/examples/language/llama2/scripts/benchmark_70B/3d.sh b/examples/language/llama2/scripts/benchmark_70B/3d.sh
index d50c57042d1a..cb8f218fa3fc 100644
--- a/examples/language/llama2/scripts/benchmark_70B/3d.sh
+++ b/examples/language/llama2/scripts/benchmark_70B/3d.sh
@@ -14,4 +14,4 @@ cd ../..
export OMP_NUM_THREADS=8
-colossalai run --nproc_per_node 8 --hostfile $HOSTFILE benchmark.py -c 70b -p 3d -g -x -b 8 --tp 4 --pp 2 --mbs 4
+colossalai run --nproc_per_node 8 --hostfile $HOSTFILE benchmark.py -c 70b -p 3d -g -x -b 8 --tp 4 --pp 2 --mbs 1
diff --git a/tests/test_pipeline/test_schedule/test_oneF_oneB.py b/tests/test_pipeline/test_schedule/test_oneF_oneB.py
index 5f27be39657d..a08dc6d277d0 100644
--- a/tests/test_pipeline/test_schedule/test_oneF_oneB.py
+++ b/tests/test_pipeline/test_schedule/test_oneF_oneB.py
@@ -155,7 +155,7 @@ def run_dist(
@pytest.mark.dist
-@pytest.mark.parametrize("num_microbatch", [4, 12])
+@pytest.mark.parametrize("num_microbatch", [4, 6])
@pytest.mark.parametrize("batch_size", [12])
@pytest.mark.parametrize("world_size", [2, 4])
@rerun_if_address_is_in_use()
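The two sanity checks added above reject bad configurations before any pipeline communication is issued. A worked example with an illustrative four-stage pipeline:

```python
num_stages = 4

# Interleaved schedule: the microbatch count must divide evenly across stages.
for num_microbatch in (4, 6, 8):
    ok = num_microbatch % num_stages == 0
    print(f"interleaved, {num_microbatch} microbatches: {'ok' if ok else 'rejected'}")

# 1F1B schedule: at least one microbatch per stage is required.
for num_microbatches in (2, 4, 6):
    ok = num_microbatches >= num_stages
    print(f"1f1b, {num_microbatches} microbatches: {'ok' if ok else 'rejected'}")
```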
From 7f282f7754ddafd94332b0e37e768381bdc0ea40 Mon Sep 17 00:00:00 2001
From: Frank Lee
Date: Tue, 16 Jan 2024 11:54:44 +0800
Subject: [PATCH 11/33] [workflow] fixed incomplete bash command (#5272)
---
.github/workflows/build_on_schedule.yml | 1 +
1 file changed, 1 insertion(+)
diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index 5b0103eb770d..03f9c53f1d28 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -23,6 +23,7 @@ jobs:
ngpu=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
endIndex=$(($ngpu-1))
for i in $(seq 0 $endIndex);
+ do
gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
[ "$gpu_used" -gt "2000" ] && avai=false
done
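For readers who prefer Python over the bash loop above, an equivalent sketch of the availability check; the 2000 MiB threshold mirrors the workflow and is otherwise arbitrary:

```python
import subprocess

out = subprocess.check_output(
    ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"],
    text=True,
)
# One used-memory value (MiB) per GPU, one value per line.
available = all(int(mib) <= 2000 for mib in out.split())
print("avai=true" if available else "avai=false")
```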
From 6e158b7f9afb3c583cfe55bd4545da7bd0f045e8 Mon Sep 17 00:00:00 2001
From: Frank Lee
Date: Tue, 16 Jan 2024 18:55:13 +0800
Subject: [PATCH 12/33] [workflow] fixed oom tests (#5275)
* [workflow] fixed oom tests
* polish
* polish
* polish
---
.github/workflows/build_on_pr.yml | 2 -
tests/kit/model_zoo/registry.py | 7 +-
tests/kit/model_zoo/transformers/gptj.py | 3 +
.../test_plugin/test_gemini_plugin.py | 9 +-
.../test_plugin/test_low_level_zero_plugin.py | 5 +-
.../test_plugin/test_torch_ddp_plugin.py | 5 +-
.../test_plugin/test_torch_fsdp_plugin.py | 18 ++-
...st_hybrid_parallel_plugin_checkpoint_io.py | 6 +-
tests/test_infer_ops/triton/kernel_utils.py | 27 ----
.../triton/test_bloom_context_attention.py | 52 -------
.../triton/test_copy_kv_dest.py | 39 -----
.../triton/test_layernorm_triton.py | 43 ------
.../triton/test_llama_act_combine.py | 56 -------
.../triton/test_llama_context_attention.py | 50 ------
.../triton/test_self_attention_nonfusion.py | 143 ------------------
tests/test_infer_ops/triton/test_softmax.py | 36 -----
.../triton/test_token_attn_fwd.py | 72 ---------
.../triton/test_token_softmax.py | 48 ------
tests/test_lazy/test_models.py | 11 +-
19 files changed, 50 insertions(+), 582 deletions(-)
delete mode 100644 tests/test_infer_ops/triton/kernel_utils.py
delete mode 100644 tests/test_infer_ops/triton/test_bloom_context_attention.py
delete mode 100644 tests/test_infer_ops/triton/test_copy_kv_dest.py
delete mode 100644 tests/test_infer_ops/triton/test_layernorm_triton.py
delete mode 100644 tests/test_infer_ops/triton/test_llama_act_combine.py
delete mode 100644 tests/test_infer_ops/triton/test_llama_context_attention.py
delete mode 100644 tests/test_infer_ops/triton/test_self_attention_nonfusion.py
delete mode 100644 tests/test_infer_ops/triton/test_softmax.py
delete mode 100644 tests/test_infer_ops/triton/test_token_attn_fwd.py
delete mode 100644 tests/test_infer_ops/triton/test_token_softmax.py
diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 54e8a6d93487..a34a60669031 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -160,9 +160,7 @@ jobs:
--ignore tests/test_gptq \
--ignore tests/test_infer_ops \
--ignore tests/test_legacy \
- --ignore tests/test_moe \
--ignore tests/test_smoothquant \
- --ignore tests/test_checkpoint_io \
tests/
env:
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
diff --git a/tests/kit/model_zoo/registry.py b/tests/kit/model_zoo/registry.py
index 5e8e0b3822df..a16b16ad6af7 100644
--- a/tests/kit/model_zoo/registry.py
+++ b/tests/kit/model_zoo/registry.py
@@ -61,7 +61,9 @@ def register(
"""
self[name] = (model_fn, data_gen_fn, output_transform_fn, loss_fn, model_attribute)
- def get_sub_registry(self, keyword: Union[str, List[str]], exclude: Union[str, List[str]] = None):
+ def get_sub_registry(
+ self, keyword: Union[str, List[str]], exclude: Union[str, List[str]] = None, allow_empty: bool = False
+ ):
"""
Get a sub registry with models that contain the keyword.
@@ -95,7 +97,8 @@ def get_sub_registry(self, keyword: Union[str, List[str]], exclude: Union[str, L
if not should_exclude:
new_dict[k] = v
- assert len(new_dict) > 0, f"No model found with keyword {keyword}"
+ if not allow_empty:
+ assert len(new_dict) > 0, f"No model found with keyword {keyword}"
return new_dict
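A brief usage note on the new `allow_empty` flag; the keyword below is deliberately unregistered:

```python
from tests.kit.model_zoo import model_zoo

# With allow_empty=True, an empty match returns an empty registry instead of
# tripping the assertion above.
sub_zoo = model_zoo.get_sub_registry("some_unregistered_keyword", allow_empty=True)
print(len(sub_zoo))  # 0
```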
diff --git a/tests/kit/model_zoo/transformers/gptj.py b/tests/kit/model_zoo/transformers/gptj.py
index 9eefbb43dad8..c89124f0164d 100644
--- a/tests/kit/model_zoo/transformers/gptj.py
+++ b/tests/kit/model_zoo/transformers/gptj.py
@@ -63,6 +63,9 @@ def data_gen_for_sequence_classification():
n_layer=2,
n_head=4,
vocab_size=50258,
+ n_embd=256,
+ hidden_size=256,
+ n_positions=512,
attn_pdrop=0,
embd_pdrop=0,
resid_pdrop=0,
diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py
index 9952e41e5b13..17dfa3a1860d 100644
--- a/tests/test_booster/test_plugin/test_gemini_plugin.py
+++ b/tests/test_booster/test_plugin/test_gemini_plugin.py
@@ -12,7 +12,13 @@
from colossalai.lazy.lazy_init import LazyInitContext
from colossalai.nn.optimizer import HybridAdam
from colossalai.tensor.colo_parameter import ColoParameter
-from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import (
+ clear_cache_before_run,
+ parameterize,
+ rerun_if_address_is_in_use,
+ skip_if_not_enough_gpus,
+ spawn,
+)
from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
@@ -172,6 +178,7 @@ def test_gemini_plugin(early_stop: bool = True):
@pytest.mark.largedist
+@skip_if_not_enough_gpus(8)
@rerun_if_address_is_in_use()
def test_gemini_plugin_3d(early_stop: bool = True):
spawn(run_dist, 8, early_stop=early_stop)
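`skip_if_not_enough_gpus(8)` guards the 3D Gemini test on machines with fewer devices. A hypothetical sketch of what such a guard can look like; the real `colossalai.testing` implementation may differ:

```python
import pytest
import torch


# Hypothetical GPU-count guard, expressed as a pytest skip marker.
def skip_if_not_enough_gpus(min_gpus: int):
    return pytest.mark.skipif(
        torch.cuda.device_count() < min_gpus,
        reason=f"requires at least {min_gpus} GPUs",
    )
```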
diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
index bcdcc1470e6c..286f431d5c8c 100644
--- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
+++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
@@ -10,8 +10,8 @@
from colossalai.booster.plugin import LowLevelZeroPlugin
# from colossalai.nn.optimizer import HybridAdam
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
+from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
# These models are not compatible with AMP
_AMP_ERR_MODELS = ["timm_convit", "deepfm_interactionarch"]
@@ -21,6 +21,7 @@
_STUCK_MODELS = ["transformers_albert_for_multiple_choice"]
+@clear_cache_before_run()
def run_fn(stage, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]:
device = device_utils.get_current_device()
try:
diff --git a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
index fa32feb2ff85..e785843fb053 100644
--- a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
+++ b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
@@ -10,10 +10,11 @@
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin
from colossalai.interface import OptimizerWrapper
-from colossalai.testing import rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
+from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
+from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
+@clear_cache_before_run()
def run_fn(model_fn, data_gen_fn, output_transform_fn):
plugin = TorchDDPPlugin()
booster = Booster(plugin=plugin)
diff --git a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
index 8a14d7cf872d..f698070465d6 100644
--- a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
+++ b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
@@ -11,11 +11,12 @@
from colossalai.booster.plugin import TorchFSDPPlugin
from colossalai.interface import OptimizerWrapper
-from colossalai.testing import rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
+from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
+from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
# test basic fsdp function
+@clear_cache_before_run()
def run_fn(model_fn, data_gen_fn, output_transform_fn):
plugin = TorchFSDPPlugin()
booster = Booster(plugin=plugin)
@@ -40,12 +41,18 @@ def run_fn(model_fn, data_gen_fn, output_transform_fn):
optimizer.clip_grad_by_norm(1.0)
optimizer.step()
+ del model
+ del optimizer
+ del criterion
+ del booster
+ del plugin
+
def check_torch_fsdp_plugin():
if IS_FAST_TEST:
registry = model_zoo.get_sub_registry(COMMON_MODELS)
else:
- registry = model_zoo
+ registry = model_zoo.get_sub_registry("transformers_gptj")
for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
if any(
@@ -59,6 +66,7 @@ def check_torch_fsdp_plugin():
]
):
continue
+ print(name)
run_fn(model_fn, data_gen_fn, output_transform_fn)
torch.cuda.empty_cache()
@@ -73,3 +81,7 @@ def run_dist(rank, world_size, port):
@rerun_if_address_is_in_use()
def test_torch_fsdp_plugin():
spawn(run_dist, 2)
+
+
+if __name__ == "__main__":
+ test_torch_fsdp_plugin()
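The explicit `del` statements added to the FSDP test matter because the CUDA caching allocator can only release blocks that no live tensor still references. A minimal illustration, assuming a CUDA device is present:

```python
import torch

x = torch.empty(1024, 1024, device="cuda")
del x                     # drop the last reference first ...
torch.cuda.empty_cache()  # ... so empty_cache() can actually return the memory
```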
diff --git a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
index db3c56da874d..865262cae623 100644
--- a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
@@ -38,11 +38,11 @@
]
-@clear_cache_before_run()
@parameterize("shard", [True, False])
@parameterize("model_name", ["transformers_llama_for_casual_lm"])
@parameterize("size_per_shard", [32])
@parameterize("test_config", TEST_CONFIGS)
+@clear_cache_before_run()
def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_config: dict):
(model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next(
iter(model_zoo.get_sub_registry(model_name).values())
@@ -145,3 +145,7 @@ def run_dist(rank, world_size, port):
@rerun_if_address_is_in_use()
def test_hybrid_ckpIO(world_size):
spawn(run_dist, world_size)
+
+
+if __name__ == "__main__":
+ test_hybrid_ckpIO(4)
diff --git a/tests/test_infer_ops/triton/kernel_utils.py b/tests/test_infer_ops/triton/kernel_utils.py
deleted file mode 100644
index 0732ace1e04b..000000000000
--- a/tests/test_infer_ops/triton/kernel_utils.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import math
-
-import torch
-from torch.nn import functional as F
-
-
-def torch_context_attention(xq, xk, xv, bs, seqlen, num_head, head_dim):
- """
- adepted from https://github.com/ModelTC/lightllm/blob/main/lightllm/models/bloom/triton_kernel/context_flashattention_nopad.py#L253
- """
- xq = xq.view(bs, seqlen, num_head, head_dim)
- xk = xk.view(bs, seqlen, num_head, head_dim)
- xv = xv.view(bs, seqlen, num_head, head_dim)
- mask = torch.tril(torch.ones(seqlen, seqlen), diagonal=0).unsqueeze(0).unsqueeze(0).cuda()
- mask[mask == 0.0] = -100000000.0
- mask = mask.repeat(bs, num_head, 1, 1)
- keys = xk
- values = xv
- xq = xq.transpose(1, 2)
- keys = keys.transpose(1, 2)
- values = values.transpose(1, 2)
- sm_scale = 1 / math.sqrt(head_dim)
- scores = torch.matmul(xq, keys.transpose(2, 3)) * sm_scale
- scores = F.softmax(scores.float() + mask, dim=-1).to(dtype=torch.float16)
-
- output = torch.matmul(scores, values).transpose(1, 2).contiguous().reshape(-1, num_head, head_dim)
- return output
diff --git a/tests/test_infer_ops/triton/test_bloom_context_attention.py b/tests/test_infer_ops/triton/test_bloom_context_attention.py
deleted file mode 100644
index 7a6c218a6691..000000000000
--- a/tests/test_infer_ops/triton/test_bloom_context_attention.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import pytest
-import torch
-from packaging import version
-
-try:
- pass
-
- from colossalai.kernel.triton import bloom_context_attn_fwd
- from tests.test_infer_ops.triton.kernel_utils import torch_context_attention
-
- HAS_TRITON = True
-except ImportError:
- HAS_TRITON = False
- print("please install triton from https://github.com/openai/triton")
-
-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")
-
-
-@pytest.mark.skipif(
- not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
-)
-def test_bloom_context_attention():
- bs = 4
- head_num = 8
- seq_len = 1024
- head_dim = 64
-
- query = torch.randn((bs * seq_len, head_num, head_dim), dtype=torch.float16, device="cuda")
- k = torch.randn((bs * seq_len, head_num, head_dim), dtype=torch.float16, device="cuda")
- v = torch.randn((bs * seq_len, head_num, head_dim), dtype=torch.float16, device="cuda")
-
- max_input_len = seq_len
- b_start = torch.zeros((bs,), device="cuda", dtype=torch.int32)
- b_len = torch.zeros((bs,), device="cuda", dtype=torch.int32)
-
- for i in range(bs):
- b_start[i] = i * seq_len
- b_len[i] = seq_len
-
- o = torch.randn((bs * seq_len, head_num, head_dim), dtype=torch.float16, device="cuda")
- alibi = torch.zeros((head_num,), dtype=torch.float32, device="cuda")
- bloom_context_attn_fwd(query.clone(), k.clone(), v.clone(), o, b_start, b_len, max_input_len, alibi)
-
- torch_out = torch_context_attention(query.clone(), k.clone(), v.clone(), bs, seq_len, head_num, head_dim)
-
- assert torch.allclose(
- torch_out.cpu(), o.cpu(), rtol=1e-3, atol=1e-2
- ), "outputs from triton and torch are not matched"
-
-
-if __name__ == "__main__":
- test_bloom_context_attention()
diff --git a/tests/test_infer_ops/triton/test_copy_kv_dest.py b/tests/test_infer_ops/triton/test_copy_kv_dest.py
deleted file mode 100644
index 34e453f7840e..000000000000
--- a/tests/test_infer_ops/triton/test_copy_kv_dest.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import pytest
-import torch
-from packaging import version
-
-try:
- pass
-
- from colossalai.kernel.triton.copy_kv_cache_dest import copy_kv_cache_to_dest
-
- HAS_TRITON = True
-except ImportError:
- HAS_TRITON = False
- print("please install triton from https://github.com/openai/triton")
-
-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")
-
-
-@pytest.mark.skipif(
- not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
-)
-def test_kv_cache_copy_op():
- B_NTX = 32 * 2048
- head_num = 8
- head_dim = 64
-
- cache = torch.randn((B_NTX, head_num, head_dim), device="cuda", dtype=torch.float16)
- dest_index = torch.arange(0, B_NTX, device="cuda", dtype=torch.int32)
-
- dest_data = torch.ones((B_NTX, head_num, head_dim), device="cuda", dtype=torch.float16)
-
- copy_kv_cache_to_dest(cache, dest_index, dest_data)
-
- assert torch.allclose(
- cache.cpu(), dest_data.cpu(), rtol=1e-3, atol=1e-3
- ), "copy_kv_cache_to_dest outputs from triton and torch are not matched"
-
-
-if __name__ == "__main__":
- test_kv_cache_copy_op()
diff --git a/tests/test_infer_ops/triton/test_layernorm_triton.py b/tests/test_infer_ops/triton/test_layernorm_triton.py
deleted file mode 100644
index 7f814e8c9a9f..000000000000
--- a/tests/test_infer_ops/triton/test_layernorm_triton.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import pytest
-import torch
-from packaging import version
-
-from colossalai.kernel.triton import layer_norm
-from colossalai.testing.utils import parameterize
-
-try:
- pass
-
- HAS_TRITON = True
-except ImportError:
- HAS_TRITON = False
- print("please install triton from https://github.com/openai/triton")
-
-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")
-
-
-@pytest.mark.skipif(
- not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
-)
-@parameterize("M", [2, 4, 8, 16])
-@parameterize("N", [64, 128])
-def test_layer_norm(M, N):
- dtype = torch.float16
- eps = 1e-5
- x_shape = (M, N)
- w_shape = (x_shape[-1],)
- weight = torch.rand(w_shape, dtype=dtype, device="cuda")
- bias = torch.rand(w_shape, dtype=dtype, device="cuda")
- x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device="cuda")
-
- y_triton = layer_norm(x, weight, bias, eps)
- y_torch = torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps).to(dtype)
-
- assert y_triton.shape == y_torch.shape
- assert y_triton.dtype == y_torch.dtype
- print("max delta: ", torch.max(torch.abs(y_triton - y_torch)))
- assert torch.allclose(y_triton, y_torch, atol=1e-2, rtol=0)
-
-
-if __name__ == "__main__":
- test_layer_norm()
diff --git a/tests/test_infer_ops/triton/test_llama_act_combine.py b/tests/test_infer_ops/triton/test_llama_act_combine.py
deleted file mode 100644
index 5341aa35ab90..000000000000
--- a/tests/test_infer_ops/triton/test_llama_act_combine.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import pytest
-import torch
-from packaging import version
-from torch import nn
-
-from colossalai.kernel.triton.llama_act_combine_kernel import LlamaActCombine
-
-try:
- import triton
- HAS_TRITON = True
-except ImportError:
- HAS_TRITON = False
- print("please install triton from https://github.com/openai/triton")
-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.4')
-
-BATCH_SIZE = 4
-SEQ_LEN = 16
-HIDDEN_SIZE = 32
-
-
-def SwiGLU(x):
- """Gated linear unit activation function.
- Args:
- x : input array
- axis: the axis along which the split should be computed (default: -1)
- """
- size = x.shape[-1]
- assert size % 2 == 0, "axis size must be divisible by 2"
- x1, x2 = torch.split(x, size // 2, -1)
- return x1 * (x2 * torch.sigmoid(x2.to(torch.float32)).to(x.dtype))
-
-
-@pytest.mark.skipif(not (HAS_TRITON and TRITON_CUDA_SUPPORT), reason="requires triton")
-@pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
-def test_llama_act_combine(dtype: str):
- x_gate = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE * 2, dtype=dtype).cuda()
- x_gate_torch = nn.Parameter(x_gate.detach().clone())
- x_gate_kernel = nn.Parameter(x_gate.detach().clone())
- x_up = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, dtype=dtype).cuda()
- x_up_torch = nn.Parameter(x_up.detach().clone())
- x_up_kernel = nn.Parameter(x_up.detach().clone())
-
- torch_out = SwiGLU(x_gate_torch) * x_up_torch
- kernel_out = LlamaActCombine.apply(x_gate_kernel, x_up_kernel)
- atol = 1e-5 if dtype == torch.float32 else 5e-2
- assert torch.allclose(torch_out, kernel_out, atol=atol)
-
- torch_out.mean().backward()
- kernel_out.mean().backward()
- assert all(grad is not None for grad in [x_gate_torch.grad, x_up_torch.grad, x_gate_kernel.grad, x_up_kernel.grad])
- assert torch.allclose(x_gate_torch.grad, x_gate_kernel.grad, atol=atol)
- assert torch.allclose(x_up_torch.grad, x_up_kernel.grad, atol=atol)
-
-
-if __name__ == '__main__':
- test_llama_act_combine(torch.float16)
diff --git a/tests/test_infer_ops/triton/test_llama_context_attention.py b/tests/test_infer_ops/triton/test_llama_context_attention.py
deleted file mode 100644
index 95fe50cf1d9c..000000000000
--- a/tests/test_infer_ops/triton/test_llama_context_attention.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import pytest
-import torch
-from packaging import version
-
-try:
- pass
-
- from colossalai.kernel.triton import llama_context_attn_fwd
- from tests.test_infer_ops.triton.kernel_utils import torch_context_attention
-
- HAS_TRITON = True
-except ImportError:
- HAS_TRITON = False
- print("please install triton from https://github.com/openai/triton")
-
-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")
-
-
-@pytest.mark.skipif(
- not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
-)
-def test_llama_context_attention():
- bs = 4
- head_num = 8
- seq_len = 1024
- head_dim = 64
-
- query = torch.randn((bs * seq_len, head_num, head_dim), dtype=torch.float16, device="cuda")
- k = torch.randn((bs * seq_len, head_num, head_dim), dtype=torch.float16, device="cuda")
- v = torch.randn((bs * seq_len, head_num, head_dim), dtype=torch.float16, device="cuda")
-
- max_input_len = seq_len
- b_start = torch.zeros((bs,), device="cuda", dtype=torch.int32)
- b_len = torch.zeros((bs,), device="cuda", dtype=torch.int32)
-
- for i in range(bs):
- b_start[i] = i * seq_len
- b_len[i] = seq_len
-
- o = torch.randn((bs * seq_len, head_num, head_dim), dtype=torch.float16, device="cuda")
- llama_context_attn_fwd(query.clone(), k.clone(), v.clone(), o, b_start, b_len, max_input_len)
-
- torch_out = torch_context_attention(query.clone(), k.clone(), v.clone(), bs, seq_len, head_num, head_dim)
- assert torch.allclose(
- torch_out.cpu(), o.cpu(), rtol=1e-3, atol=1e-3
- ), "outputs from triton and torch are not matched"
-
-
-if __name__ == "__main__":
- test_llama_context_attention()
diff --git a/tests/test_infer_ops/triton/test_self_attention_nonfusion.py b/tests/test_infer_ops/triton/test_self_attention_nonfusion.py
deleted file mode 100644
index 9bdec86645b2..000000000000
--- a/tests/test_infer_ops/triton/test_self_attention_nonfusion.py
+++ /dev/null
@@ -1,143 +0,0 @@
-import pytest
-import torch
-import torch.nn.functional as F
-from packaging import version
-
-try:
- import triton
-
- from colossalai.kernel.triton.qkv_matmul_kernel import qkv_gemm_4d_kernel
- from colossalai.kernel.triton.self_attention_nofusion import self_attention_compute_using_triton
-
- HAS_TRITON = True
-except ImportError:
- HAS_TRITON = False
- print("please install triton from https://github.com/openai/triton")
-
-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")
-
-
-@pytest.mark.skipif(
- not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
-)
-def test_qkv_matmul():
- qkv = torch.randn((4, 24, 64 * 3), device="cuda", dtype=torch.float16)
- scale = 1.2
- head_size = 32
- batches = qkv.shape[0]
- d_model = qkv.shape[-1] // 3
- num_of_heads = d_model // head_size
-
- q = qkv[:, :, :d_model]
- k = qkv[:, :, d_model : d_model * 2]
-
- q = q.view(batches, -1, num_of_heads, head_size)
- k = k.view(batches, -1, num_of_heads, head_size)
- q_copy = q.clone()
- k_copy = k.clone()
- q = torch.transpose(q, 1, 2).contiguous()
- k = torch.transpose(k, 1, 2).contiguous()
- k = torch.transpose(k, 2, 3).contiguous()
-
- torch_ouput = torch.einsum("bnij,bnjk->bnik", q, k)
- torch_ouput *= 1.2
-
- q, k = q_copy, k_copy
- batches, M, H, K = q.shape
- N = k.shape[1]
- score_output = torch.empty((batches, H, M, N), device=q.device, dtype=q.dtype)
-
- grid = lambda meta: (
- batches,
- H,
- triton.cdiv(M, meta["BLOCK_SIZE_M"]) * triton.cdiv(N, meta["BLOCK_SIZE_N"]),
- )
-
- K = q.shape[3]
- qkv_gemm_4d_kernel[grid](
- q,
- k,
- score_output,
- M,
- N,
- K,
- q.stride(0),
- q.stride(2),
- q.stride(1),
- q.stride(3),
- k.stride(0),
- k.stride(2),
- k.stride(3),
- k.stride(1),
- score_output.stride(0),
- score_output.stride(1),
- score_output.stride(2),
- score_output.stride(3),
- scale=scale,
- # currently manually setting, later on we can use auto-tune config to match best setting
- BLOCK_SIZE_M=64,
- BLOCK_SIZE_N=32,
- BLOCK_SIZE_K=32,
- GROUP_SIZE_M=8,
- )
-
- check = torch.allclose(torch_ouput.cpu(), score_output.cpu(), rtol=1e-3, atol=1e-5)
- assert check is True, "the outputs of triton and torch are not matched"
-
-
-def self_attention_compute_using_torch(qkv, input_mask, scale, head_size):
- batches = qkv.shape[0]
- d_model = qkv.shape[-1] // 3
- num_of_heads = d_model // head_size
-
- q = qkv[:, :, :d_model]
- k = qkv[:, :, d_model : d_model * 2]
- v = qkv[:, :, d_model * 2 :]
- q = q.view(batches, -1, num_of_heads, head_size)
- k = k.view(batches, -1, num_of_heads, head_size)
- v = v.view(batches, -1, num_of_heads, head_size)
-
- q = torch.transpose(q, 1, 2).contiguous()
- k = torch.transpose(k, 1, 2).contiguous()
- v = torch.transpose(v, 1, 2).contiguous()
-
- k = torch.transpose(k, -1, -2).contiguous()
-
- score_output = torch.einsum("bnij,bnjk->bnik", q, k)
- score_output *= scale
-
- softmax_output = F.softmax(score_output, dim=-1)
- res = torch.einsum("bnij,bnjk->bnik", softmax_output, v)
- res = torch.transpose(res, 1, 2)
- res = res.contiguous()
-
- return res.view(batches, -1, d_model), score_output, softmax_output
-
-
-@pytest.mark.skipif(
- not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
-)
-def test_self_atttention_test():
- qkv = torch.randn((4, 24, 64 * 3), device="cuda", dtype=torch.float16)
- data_output_torch, score_output_torch, softmax_output_torch = self_attention_compute_using_torch(
- qkv.clone(), input_mask=None, scale=1.2, head_size=32
- )
-
- data_output_triton = self_attention_compute_using_triton(
- qkv.clone(),
- alibi=None,
- head_size=32,
- scale=1.2,
- input_mask=None,
- layer_past=None,
- use_flash=False,
- triangular=True,
- )
-
- check = torch.allclose(data_output_triton.cpu(), data_output_torch.cpu(), rtol=1e-4, atol=1e-2)
- assert check is True, "the triton output is not matched with torch output"
-
-
-if __name__ == "__main__":
- test_qkv_matmul()
- test_self_atttention_test()
diff --git a/tests/test_infer_ops/triton/test_softmax.py b/tests/test_infer_ops/triton/test_softmax.py
deleted file mode 100644
index 43b9c0929c4a..000000000000
--- a/tests/test_infer_ops/triton/test_softmax.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import pytest
-import torch
-from packaging import version
-from torch import nn
-
-try:
- from colossalai.kernel.triton.softmax import softmax
-
- HAS_TRITON = True
-except ImportError:
- HAS_TRITON = False
- print("please install triton from https://github.com/openai/triton")
-
-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")
-
-
-@pytest.mark.skipif(
- not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
-)
-def test_softmax_op():
- data_samples = [
- torch.randn((3, 4, 5, 32), device="cuda", dtype=torch.float32),
- torch.randn((320, 320, 78), device="cuda", dtype=torch.float32),
- torch.randn((2345, 4, 5, 64), device="cuda", dtype=torch.float16),
- ]
-
- for data in data_samples:
- module = nn.Softmax(dim=-1)
- data_torch_out = module(data)
- data_triton_out = softmax(data)
- check = torch.allclose(data_torch_out.cpu(), data_triton_out.cpu(), rtol=1e-3, atol=1e-3)
- assert check is True, "softmax outputs from triton and torch are not matched"
-
-
-if __name__ == "__main__":
- test_softmax_op()
diff --git a/tests/test_infer_ops/triton/test_token_attn_fwd.py b/tests/test_infer_ops/triton/test_token_attn_fwd.py
deleted file mode 100644
index 4ee1a5fb1234..000000000000
--- a/tests/test_infer_ops/triton/test_token_attn_fwd.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import pytest
-import torch
-from packaging import version
-
-try:
- from colossalai.kernel.triton.token_attention_kernel import token_attention_fwd
-
- HAS_TRITON = True
-except ImportError:
- HAS_TRITON = False
- print("please install triton from https://github.com/openai/triton")
-
-
-import importlib.util
-
-HAS_LIGHTLLM_KERNEL = True
-
-if importlib.util.find_spec("lightllm") is None:
- HAS_LIGHTLLM_KERNEL = False
-
-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) >= version.parse("11.6")
-
-
-def torch_att(xq, xk, xv, bs, seqlen, num_head, head_dim):
- xq = xq.view(bs, 1, num_head, head_dim)
- xk = xk.view(bs, seqlen, num_head, head_dim)
- xv = xv.view(bs, seqlen, num_head, head_dim)
-
- logics = torch.sum(xq * xk, dim=3, keepdim=False) * 1 / (head_dim**0.5)
- prob = torch.softmax(logics, dim=1)
- prob = prob.view(bs, seqlen, num_head, 1)
-
- return torch.sum(prob * xv, dim=1, keepdim=False)
-
-
-@pytest.mark.skipif(
- not TRITON_CUDA_SUPPORT or not HAS_TRITON or not HAS_LIGHTLLM_KERNEL,
- reason="triton requires cuda version to be higher than 11.4 or not install lightllm",
-)
-def test():
- Z, head_num, seq_len, head_dim = 22, 112 // 8, 2048, 128
- dtype = torch.float16
- q = torch.empty((Z, head_num, head_dim), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2)
- k = torch.empty((Z * seq_len, head_num, head_dim), dtype=dtype, device="cuda").normal_(mean=0.4, std=0.2)
- v = torch.empty((Z * seq_len, head_num, head_dim), dtype=dtype, device="cuda").normal_(mean=0.3, std=0.2)
- o = torch.empty((Z, head_num, head_dim), dtype=dtype, device="cuda").normal_(mean=0.3, std=0.2)
- alibi = torch.zeros((head_num,), dtype=torch.float32, device="cuda")
-
- max_kv_cache_len = seq_len
- kv_cache_start_loc = torch.zeros((Z,), dtype=torch.int32, device="cuda")
- kv_cache_loc = torch.zeros((Z, seq_len), dtype=torch.int32, device="cuda")
- kv_cache_seq_len = torch.ones((Z,), dtype=torch.int32, device="cuda")
-
- kv_cache_seq_len[:] = seq_len
- kv_cache_start_loc[0] = 0
- kv_cache_start_loc[1] = seq_len
- kv_cache_start_loc[2] = 2 * seq_len
- kv_cache_start_loc[3] = 3 * seq_len
-
- for i in range(Z):
- kv_cache_loc[i, :] = torch.arange(i * seq_len, (i + 1) * seq_len, dtype=torch.int32, device="cuda")
-
- token_attention_fwd(q, k, v, o, kv_cache_loc, kv_cache_start_loc, kv_cache_seq_len, max_kv_cache_len, alibi=alibi)
- torch_out = torch_att(q, k, v, Z, seq_len, head_num, head_dim)
-
- print("max ", torch.max(torch.abs(torch_out - o)))
- print("mean ", torch.mean(torch.abs(torch_out - o)))
- assert torch.allclose(torch_out, o, atol=1e-2, rtol=0)
-
-
-if __name__ == "__main__":
- test()
diff --git a/tests/test_infer_ops/triton/test_token_softmax.py b/tests/test_infer_ops/triton/test_token_softmax.py
deleted file mode 100644
index 1f97f1674818..000000000000
--- a/tests/test_infer_ops/triton/test_token_softmax.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import pytest
-import torch
-from packaging import version
-
-try:
- pass
-
- from colossalai.kernel.triton.token_attention_kernel import token_attn_softmax_fwd
-
- HAS_TRITON = True
-except ImportError:
- HAS_TRITON = False
- print("please install triton from https://github.com/openai/triton")
-
-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")
-
-
-@pytest.mark.skipif(
- not TRITON_CUDA_SUPPORT or not HAS_TRITON, reason="triton requires cuda version to be higher than 11.4"
-)
-def test_softmax():
- import torch
-
- batch_size, seq_len, head_num, head_dim = 4, 1025, 12, 128
-
- dtype = torch.float16
-
- Logics = torch.empty((head_num, batch_size * seq_len), dtype=dtype, device="cuda").normal_(mean=0.1, std=10)
- ProbOut = torch.empty((head_num, batch_size * seq_len), dtype=dtype, device="cuda").normal_(mean=0.4, std=0.2)
-
- kv_cache_start_loc = torch.zeros((batch_size,), dtype=torch.int32, device="cuda")
- kv_cache_seq_len = torch.zeros((batch_size,), dtype=torch.int32, device="cuda")
-
- for i in range(batch_size):
- kv_cache_start_loc[i] = i * seq_len
- kv_cache_seq_len[i] = seq_len
-
- token_attn_softmax_fwd(Logics, kv_cache_start_loc, kv_cache_seq_len, ProbOut, seq_len)
-
- torch_out = Logics.reshape(head_num * batch_size, -1).softmax(-1).reshape(head_num, batch_size * seq_len)
- o = ProbOut
- print("max ", torch.max(torch.abs(torch_out - o)))
- print("mean ", torch.mean(torch.abs(torch_out - o)))
- assert torch.allclose(torch_out, o, atol=1e-2, rtol=0)
-
-
-if __name__ == "__main__":
- test_softmax()
diff --git a/tests/test_lazy/test_models.py b/tests/test_lazy/test_models.py
index ee50e5b61009..d0c4cd0a7c48 100644
--- a/tests/test_lazy/test_models.py
+++ b/tests/test_lazy/test_models.py
@@ -1,14 +1,19 @@
import pytest
from lazy_init_utils import SUPPORT_LAZY, check_lazy_init
-from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
+from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
@pytest.mark.skipif(not SUPPORT_LAZY, reason="requires torch >= 1.12.0")
-@pytest.mark.parametrize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "diffusers", "timm", "transformers", "torchaudio", "deepfm", "dlrm"])
+@pytest.mark.parametrize(
+ "subset",
+ [COMMON_MODELS]
+ if IS_FAST_TEST
+ else ["torchvision", "diffusers", "timm", "transformers", "torchaudio", "deepfm", "dlrm"],
+)
@pytest.mark.parametrize("default_device", ["cpu", "cuda"])
def test_torchvision_models_lazy_init(subset, default_device):
- sub_model_zoo = model_zoo.get_sub_registry(subset)
+ sub_model_zoo = model_zoo.get_sub_registry(subset, allow_empty=True)
for name, entry in sub_model_zoo.items():
# TODO(ver217): lazy init does not support weight norm, skip these models
if name in ("torchaudio_wav2vec2_base", "torchaudio_hubert_base") or name.startswith(
From ce924a19ad8bb7c13c2f5f7ba66ce742be70bf66 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Wed, 17 Jan 2024 13:38:55 +0800
Subject: [PATCH 13/33] [ci] fix test_hybrid_parallel_plugin_checkpoint_io.py
(#5276)
* fix ci
* fix test
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
* fix
---------
Co-authored-by: Wenhao Chen
---
...st_hybrid_parallel_plugin_checkpoint_io.py | 24 ++++++++++---------
1 file changed, 13 insertions(+), 11 deletions(-)
diff --git a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
index 865262cae623..a42b550cd6fc 100644
--- a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
@@ -104,30 +104,32 @@ def _preprocess_data(data):
# Check whether the loaded model & optimizer works smoothly.
model.train()
new_model.train()
+ data_for_shard = data_gen_fn()
+ data_for_origin = data_gen_fn()
if booster.plugin.stage_manager is not None:
booster.execute_pipeline(
- _preprocess_data(data), model, _criterion, optimizer, return_loss=True, return_outputs=False
+ _preprocess_data(data_for_shard), model, _criterion, optimizer, return_loss=True, return_outputs=False
)
booster.execute_pipeline(
- _preprocess_data(data), new_model, _criterion, new_optimizer, return_loss=True, return_outputs=False
+ _preprocess_data(data_for_origin),
+ new_model,
+ _criterion,
+ new_optimizer,
+ return_loss=True,
+ return_outputs=False,
)
else:
- old_model_loss = criterion(model(**_preprocess_data(data)))
+ old_model_loss = criterion(model(**_preprocess_data(data_for_shard)))
optimizer.backward(old_model_loss)
- new_model_loss = criterion(new_model(**_preprocess_data(data)))
+ new_model_loss = criterion(new_model(**_preprocess_data(data_for_origin)))
new_optimizer.backward(new_model_loss)
optimizer.step()
new_optimizer.step()
# Check updated weights.
- stage_manager = booster.plugin.stage_manager
-
- if stage_manager is None or stage_manager.is_first_stage():
- assert_close_loose(model.unwrap().wte.weight.data, new_model.unwrap().wte.weight.data, atol=5e-3, rtol=5e-3)
- assert_close_loose(
- model.unwrap().h[0].mlp.c_fc.weight.data, new_model.unwrap().h[0].mlp.c_fc.weight.data, atol=5e-3, rtol=5e-3
- )
+ for p1, p2 in zip(model.unwrap().parameters(), new_model.unwrap().parameters()):
+ assert_close_loose(p1, p2, atol=5e-3, rtol=5e-3)
dist.barrier()
Randomizer.reset_index()
From e7ca755124f2395e2952487566fca220abd0753a Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Wed, 17 Jan 2024 15:22:33 +0800
Subject: [PATCH 14/33] [shardformer] HybridParallelPlugin supports gradient
 accumulation. (#5246)
* support gradient accumulation
* fix
---
.../booster/plugin/hybrid_parallel_plugin.py | 20 ++-
.../test_plugin/test_3d_plugin.py | 162 ++++++++++++++++++
2 files changed, 174 insertions(+), 8 deletions(-)
diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 8ee1e97c6ce3..e1593cf6b26c 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -165,7 +165,6 @@ def sync_sp_grads(self, grads: Optional[List[torch.Tensor]] = None):
Returns:
None
"""
-
if self.tp_group.size() > 1 and self.shard_config.enable_sequence_parallelism:
if grads is not None:
# Synchronize provided gradient tensors across the tensor parallelism group.
@@ -487,7 +486,6 @@ def backward(self, loss: Tensor, *args, **kwargs):
Returns:
None
"""
-
# Call the superclass backward method to compute gradients.
super().backward(loss, *args, **kwargs)
@@ -513,7 +511,6 @@ def backward_by_grad(self, tensor: Tensor, grad: Tensor):
Returns:
None
"""
-
# Call the superclass backward method to compute gradients.
super().backward_by_grad(tensor, grad)
@@ -674,7 +671,6 @@ def sync_dp_grads(self):
Returns:
None
"""
-
# Call the superclass `_sync_grad` method to synchronize gradients.
super()._sync_grad()
@@ -1081,7 +1077,7 @@ def control_precision(self) -> bool:
return True
def support_no_sync(self) -> bool:
- return False
+ return True
def control_checkpoint_io(self) -> bool:
return True
@@ -1175,9 +1171,14 @@ def execute_pipeline(
model, data_iter, criterion, optimizer, return_loss, return_outputs
)
+        # run with gradient accumulation
+        if not model.require_grad_sync or (
+            isinstance(optimizer, HybridParallelZeroOptimizer) and not optimizer.require_grad_sync
+        ):
+            return outputs
+
# Synchronize the grads of shared parameters of the model.
model.sync_shared_params()
-
# Synchronize sequence parallelism gradients of the model.
model.sync_sp_grads()
@@ -1241,5 +1242,8 @@ def seed_worker(worker_id):
def get_checkpoint_io(self) -> CheckpointIO:
return HybridParallelCheckpointIO(self.dp_group, self.pp_group, self.tp_group, self.zero_stage)
- def no_sync(self, model: Module) -> Iterator[None]:
- raise NotImplementedError
+ def no_sync(self, model: Module, optimizer: OptimizerWrapper) -> Iterator[None]:
+        assert (
+            self.zero_stage != 2
+        ), "ZeRO-2 is not compatible with no_sync; run gradient accumulation with gradient synchronization enabled."
+ return optimizer.no_sync() if isinstance(optimizer, HybridParallelZeroOptimizer) else model.no_sync()
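The new `no_sync` API is what makes gradient accumulation work with HybridParallelPlugin: gradient synchronization is suppressed on intermediate micro-steps and only runs on the boundary step. A minimal sketch of the intended call pattern, assuming `booster`, `model`, `optimizer`, and `dataloader` come from `booster.boost(...)` with a plugin whose `zero_stage != 2` (names and the accumulation step count are illustrative):

from contextlib import nullcontext

def train_with_grad_accum(booster, model, optimizer, dataloader, grad_accu_step=2):
    # `booster`, `model`, `optimizer`, `dataloader` are assumed to come from
    # booster.boost(...) with a HybridParallelPlugin whose zero_stage != 2.
    for step, batch in enumerate(dataloader):
        sync_step = (step + 1) % grad_accu_step == 0
        # suppress gradient synchronization on intermediate micro-steps
        ctx = nullcontext() if sync_step else booster.no_sync(model, optimizer)
        with ctx:
            loss = model(**batch)[0] / grad_accu_step
            booster.backward(loss, optimizer)
        if sync_step:
            optimizer.step()
            optimizer.zero_grad()

The `run_grad_acc_test` added in the test diff below follows this same pattern.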
diff --git a/tests/test_booster/test_plugin/test_3d_plugin.py b/tests/test_booster/test_plugin/test_3d_plugin.py
index e724d7359c54..6f2fc104fc07 100644
--- a/tests/test_booster/test_plugin/test_3d_plugin.py
+++ b/tests/test_booster/test_plugin/test_3d_plugin.py
@@ -1,8 +1,11 @@
+import copy
from contextlib import nullcontext
from typing import Optional
import torch
import torch.distributed as dist
+from torch.testing import assert_close
+from torch.utils.data import Dataset
import colossalai
from colossalai.booster import Booster
@@ -11,9 +14,33 @@
from colossalai.lazy.lazy_init import LazyInitContext
from colossalai.nn.optimizer import HybridAdam
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.utils import get_current_device, set_seed
from tests.kit.model_zoo import model_zoo
+class RandomDataset(Dataset):
+ def __init__(self, num_samples: int = 100, max_length: int = 512, vocab_size: int = 32000):
+ self.num_samples = num_samples
+ self.max_length = max_length
+ set_seed(42)
+ self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
+ self.attention_mask = torch.ones_like(self.input_ids)
+
+ def __len__(self):
+ return self.num_samples
+
+ def __getitem__(self, idx):
+ return {
+ "input_ids": self.input_ids[idx],
+ "attention_mask": self.attention_mask[idx],
+ "labels": self.input_ids[idx],
+ }
+
+
+def move_to_cuda(batch):
+ return {k: v.cuda() for k, v in batch.items()}
+
+
@clear_cache_before_run()
def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]:
try:
@@ -85,10 +112,145 @@ def check_3d_plugin(init_method: str = "none", early_stop: bool = True):
assert len(failed_info) == 0, "\n".join([f"{k}: {v}" for k, v in failed_info.items()])
+@parameterize(
+ "test_args",
+ [
+ {
+ "batch_size": 8,
+ "num_steps": 4,
+ "tp": 2,
+ "pp": 2,
+ "pp_style": "1f1b",
+ "num_model_chunks": 1,
+ "num_microbatches": 4,
+ "zero": 0,
+ "precision": "fp16",
+ "initial_scale": 1,
+ "max_length": 512,
+ "gradient_accumulation_step": 2,
+ },
+ {
+ "batch_size": 8,
+ "num_steps": 4,
+ "tp": 1,
+ "pp": 2,
+ "pp_style": "1f1b",
+ "num_model_chunks": 1,
+ "num_microbatches": 4,
+ "zero": 1,
+ "precision": "fp16",
+ "initial_scale": 1,
+ "max_length": 512,
+ "gradient_accumulation_step": 2,
+ },
+ {
+ "batch_size": 1,
+ "num_steps": 4,
+ "tp": 2,
+ "pp": 1,
+ "pp_style": "1f1b",
+ "num_model_chunks": 1,
+ "num_microbatches": 1,
+ "zero": 2,
+ "precision": "fp16",
+ "initial_scale": 1,
+ "max_length": 512,
+ "gradient_accumulation_step": 2,
+ },
+ {
+ "batch_size": 1,
+ "num_steps": 4,
+ "tp": 2,
+ "pp": 1,
+ "pp_style": "1f1b",
+ "num_model_chunks": 1,
+ "num_microbatches": 1,
+ "zero": 0,
+ "precision": "fp16",
+ "initial_scale": 1,
+ "max_length": 512,
+ "gradient_accumulation_step": 2,
+ },
+ ],
+)
+def run_grad_acc_test(test_args):
+ model_fn, *_ = next(iter(model_zoo.get_sub_registry("transformers_gpt_lm").values()))
+ model = model_fn()
+ optimizer = HybridAdam(model.parameters())
+ origin_model = copy.deepcopy(model).cuda()
+ origin_optimizer = HybridAdam(origin_model.parameters())
+
+ plugin = HybridParallelPlugin(
+ tp_size=test_args["tp"],
+ pp_size=test_args["pp"],
+ pp_style=test_args["pp_style"],
+ zero_stage=test_args["zero"],
+ num_model_chunks=test_args["num_model_chunks"],
+ enable_fused_normalization=True,
+ num_microbatches=test_args["num_microbatches"],
+ precision=test_args["precision"],
+ )
+ booster = Booster(plugin=plugin)
+
+ dataset = RandomDataset(
+ num_samples=test_args["batch_size"] * test_args["num_steps"] * plugin.dp_size,
+ max_length=test_args["max_length"],
+ vocab_size=model.config.vocab_size,
+ )
+ dataloader = plugin.prepare_dataloader(dataset, batch_size=test_args["batch_size"], shuffle=True, drop_last=True)
+
+ model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
+
+ grad_accu_step = test_args["gradient_accumulation_step"]
+ for step, batch in enumerate(dataloader):
+ batch = move_to_cuda(batch)
+ # train origin model
+ origin_output = origin_model(**batch)
+ origin_loss = origin_output[0] / grad_accu_step
+ origin_loss.backward()
+
+ if (step + 1) % grad_accu_step != 0 and test_args["zero"] != 2:
+ ctx = booster.no_sync(model, optimizer)
+ else:
+ ctx = nullcontext()
+
+ with ctx:
+ if plugin.stage_manager is not None:
+ batch = iter([batch])
+ booster.execute_pipeline(
+ batch,
+ model,
+ criterion=lambda outputs, inputs: outputs[0] / grad_accu_step,
+ optimizer=optimizer,
+ return_loss=False,
+ )
+ else:
+ outputs = model(**batch)
+ loss = outputs[0] / grad_accu_step
+ booster.backward(loss, optimizer)
+
+ if (step + 1) % grad_accu_step == 0:
+ # update origin model weight
+ origin_optimizer.step()
+ origin_optimizer.zero_grad()
+
+ # update sharded model
+ optimizer.step()
+ optimizer.zero_grad()
+
+    # tricky code here: shard the origin model in order to check the parameters in the same stage.
+ origin_model, origin_optimizer, _, dataloader, _ = booster.boost(
+ origin_model, origin_optimizer, dataloader=dataloader
+ )
+ for p1, p2 in zip(model.unwrap().parameters(), origin_model.unwrap().parameters()):
+ assert_close(p1.to(p2.dtype), p2, atol=1e-2, rtol=1e-2)
+
+
def run_dist(rank, world_size, port, early_stop: bool = True):
# init dist env
colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost")
check_3d_plugin(early_stop=early_stop)
+ run_grad_acc_test()
@rerun_if_address_is_in_use()
From ef535038a6116cb4660428acbd040369b8571976 Mon Sep 17 00:00:00 2001
From: Zhongkai Zhao
Date: Wed, 17 Jan 2024 17:42:29 +0800
Subject: [PATCH 15/33] [hotfix] Fix ShardFormer test execution path when using
sequence parallelism (#5230)
---
tests/test_shardformer/test_model/_utils.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_shardformer/test_model/_utils.py b/tests/test_shardformer/test_model/_utils.py
index 87e6618023d3..62d4d1bf3c7c 100644
--- a/tests/test_shardformer/test_model/_utils.py
+++ b/tests/test_shardformer/test_model/_utils.py
@@ -154,7 +154,7 @@ def _criterion(outputs, inputs):
data = data_gen_fn()
- if booster.plugin.enable_sequence_parallelism and booster.plugin.tp_size != 0:
+ if booster.plugin.shard_config.enable_sequence_parallelism and booster.plugin.tp_size != 0:
seq_len = data["input_ids"].shape[-1]
lcm = booster.plugin.tp_size * seq_len // math.gcd(booster.plugin.tp_size, seq_len)
times = lcm // seq_len
From 85860e7a813a2170fec8229b37c8558f9d39fd33 Mon Sep 17 00:00:00 2001
From: Michelle <97082656+MichelleMa8@users.noreply.github.com>
Date: Thu, 18 Jan 2024 14:08:29 +0800
Subject: [PATCH 16/33] fix auto loading gpt2 tokenizer (#5279)
---
applications/ColossalQA/colossalqa/local/llm.py | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/applications/ColossalQA/colossalqa/local/llm.py b/applications/ColossalQA/colossalqa/local/llm.py
index ff7346adcf61..0aa383e9d0b9 100644
--- a/applications/ColossalQA/colossalqa/local/llm.py
+++ b/applications/ColossalQA/colossalqa/local/llm.py
@@ -136,6 +136,19 @@ def _identifying_params(self) -> Mapping[str, int]:
"""Get the identifying parameters."""
return {"n": self.n}
+ def get_token_ids(self, text: str) -> List[int]:
+ """Return the ordered ids of the tokens in a text.
+
+ Args:
+ text: The string input to tokenize.
+
+ Returns:
+            A list of ids corresponding to the tokens in the text, in the order they occur
+ in the text.
+ """
+ # use the colossal llm's tokenizer instead of langchain's cached GPT2 tokenizer
+ return self.api.tokenizer.encode(text)
+
class VllmLLM(LLM):
"""
From b40cc068ef8cc5f5e06ac53f96348917dd63b564 Mon Sep 17 00:00:00 2001
From: Desperado-Jia <502205863@qq.com>
Date: Fri, 19 Jan 2024 16:04:08 +0800
Subject: [PATCH 17/33] [doc] add llama2-13B display (#5285)
* Update README.md
* fix 13b typo
---------
Co-authored-by: binmakeswell
---
README.md | 39 ++++++++++++++++++++-------------------
1 file changed, 20 insertions(+), 19 deletions(-)
diff --git a/README.md b/README.md
index 971f4375a289..13757eece7db 100644
--- a/README.md
+++ b/README.md
@@ -141,25 +141,26 @@ distributed training and inference in a few lines.
[[HuggingFace model weights]](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-13b-base)
[[Modelscope model weights]](https://www.modelscope.cn/models/colossalai/Colossal-LLaMA-2-13b-base/summary)
-| Model | Backbone | Tokens Consumed | MMLU (5-shot) | CMMLU (5-shot)| AGIEval (5-shot) | GAOKAO (0-shot) | CEval (5-shot) |
-| :----------------------------: | :--------: | :-------------: | :------------------: | :-----------: | :--------------: | :-------------: | :-------------: |
-| Baichuan-7B | - | 1.2T | 42.32 (42.30) | 44.53 (44.02) | 38.72 | 36.74 | 42.80 |
-| Baichuan-13B-Base | - | 1.4T | 50.51 (51.60) | 55.73 (55.30) | 47.20 | 51.41 | 53.60 |
-| Baichuan2-7B-Base | - | 2.6T | 46.97 (54.16) | 57.67 (57.07) | 45.76 | 52.60 | 54.00 |
-| Baichuan2-13B-Base | - | 2.6T | 54.84 (59.17) | 62.62 (61.97) | 52.08 | 58.25 | 58.10 |
-| ChatGLM-6B | - | 1.0T | 39.67 (40.63) | 41.17 (-) | 40.10 | 36.53 | 38.90 |
-| ChatGLM2-6B | - | 1.4T | 44.74 (45.46) | 49.40 (-) | 46.36 | 45.49 | 51.70 |
-| InternLM-7B | - | 1.6T | 46.70 (51.00) | 52.00 (-) | 44.77 | 61.64 | 52.80 |
-| Qwen-7B | - | 2.2T | 54.29 (56.70) | 56.03 (58.80) | 52.47 | 56.42 | 59.60 |
-| Llama-2-7B | - | 2.0T | 44.47 (45.30) | 32.97 (-) | 32.60 | 25.46 | - |
-| Linly-AI/Chinese-LLaMA-2-7B-hf | Llama-2-7B | 1.0T | 37.43 | 29.92 | 32.00 | 27.57 | - |
-| wenge-research/yayi-7b-llama2 | Llama-2-7B | - | 38.56 | 31.52 | 30.99 | 25.95 | - |
-| ziqingyang/chinese-llama-2-7b | Llama-2-7B | - | 33.86 | 34.69 | 34.52 | 25.18 | 34.2 |
-| TigerResearch/tigerbot-7b-base | Llama-2-7B | 0.3T | 43.73 | 42.04 | 37.64 | 30.61 | - |
-| LinkSoul/Chinese-Llama-2-7b | Llama-2-7B | - | 48.41 | 38.31 | 38.45 | 27.72 | - |
-| FlagAlpha/Atom-7B | Llama-2-7B | 0.1T | 49.96 | 41.10 | 39.83 | 33.00 | - |
-| IDEA-CCNL/Ziya-LLaMA-13B-v1.1 | Llama-13B | 0.11T | 50.25 | 40.99 | 40.04 | 30.54 | - |
-| **Colossal-LLaMA-2-7b-base** | Llama-2-7B | **0.0085T** | 53.06 | 49.89 | 51.48 | 58.82 | 50.2 |
+| Model | Backbone | Tokens Consumed | MMLU (5-shot) | CMMLU (5-shot)| AGIEval (5-shot) | GAOKAO (0-shot) | CEval (5-shot) |
+| :-----------------------------: | :--------: | :-------------: | :------------------: | :-----------: | :--------------: | :-------------: | :-------------: |
+| Baichuan-7B | - | 1.2T | 42.32 (42.30) | 44.53 (44.02) | 38.72 | 36.74 | 42.80 |
+| Baichuan-13B-Base | - | 1.4T | 50.51 (51.60) | 55.73 (55.30) | 47.20 | 51.41 | 53.60 |
+| Baichuan2-7B-Base | - | 2.6T | 46.97 (54.16) | 57.67 (57.07) | 45.76 | 52.60 | 54.00 |
+| Baichuan2-13B-Base | - | 2.6T | 54.84 (59.17) | 62.62 (61.97) | 52.08 | 58.25 | 58.10 |
+| ChatGLM-6B | - | 1.0T | 39.67 (40.63) | 41.17 (-) | 40.10 | 36.53 | 38.90 |
+| ChatGLM2-6B | - | 1.4T | 44.74 (45.46) | 49.40 (-) | 46.36 | 45.49 | 51.70 |
+| InternLM-7B | - | 1.6T | 46.70 (51.00) | 52.00 (-) | 44.77 | 61.64 | 52.80 |
+| Qwen-7B | - | 2.2T | 54.29 (56.70) | 56.03 (58.80) | 52.47 | 56.42 | 59.60 |
+| Llama-2-7B | - | 2.0T | 44.47 (45.30) | 32.97 (-) | 32.60 | 25.46 | - |
+| Linly-AI/Chinese-LLaMA-2-7B-hf | Llama-2-7B | 1.0T | 37.43 | 29.92 | 32.00 | 27.57 | - |
+| wenge-research/yayi-7b-llama2 | Llama-2-7B | - | 38.56 | 31.52 | 30.99 | 25.95 | - |
+| ziqingyang/chinese-llama-2-7b | Llama-2-7B | - | 33.86 | 34.69 | 34.52 | 25.18 | 34.2 |
+| TigerResearch/tigerbot-7b-base | Llama-2-7B | 0.3T | 43.73 | 42.04 | 37.64 | 30.61 | - |
+| LinkSoul/Chinese-Llama-2-7b | Llama-2-7B | - | 48.41 | 38.31 | 38.45 | 27.72 | - |
+| FlagAlpha/Atom-7B | Llama-2-7B | 0.1T | 49.96 | 41.10 | 39.83 | 33.00 | - |
+| IDEA-CCNL/Ziya-LLaMA-13B-v1.1 | Llama-13B | 0.11T | 50.25 | 40.99 | 40.04 | 30.54 | - |
+| **Colossal-LLaMA-2-7b-base** | Llama-2-7B | **0.0085T** | 53.06 | 49.89 | 51.48 | 58.82 | 50.2 |
+| **Colossal-LLaMA-2-13b-base** | Llama-2-13B | **0.025T** | 56.42 | 61.80 | 54.69 | 69.53 | 60.3 |
### ColossalChat
From 94223514334c9ce5522fa09aedd02072abff58cb Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 19 Jan 2024 17:49:02 +0800
Subject: [PATCH 18/33] fix llama pretrain (#5287)
---
examples/language/llama2/pretrain.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/examples/language/llama2/pretrain.py b/examples/language/llama2/pretrain.py
index bb10f7a00e8a..8d5b7c8db05d 100644
--- a/examples/language/llama2/pretrain.py
+++ b/examples/language/llama2/pretrain.py
@@ -273,11 +273,10 @@ def main():
dataloader.sampler.set_start_index(sampler_start_idx)
for epoch in range(start_epoch, args.num_epochs):
dataloader.sampler.set_epoch(epoch)
- step_nums = num_steps_per_epoch - start_step
dataloader_iter = iter(dataloader)
with tqdm(
- range(step_nums),
+ range(start_step, num_steps_per_epoch),
desc=f"Epoch {epoch}",
disable=not print_flag,
total=num_steps_per_epoch,
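Iterating over `range(start_step, num_steps_per_epoch)` instead of `range(step_nums)` keeps `step` equal to the global step index after resuming mid-epoch, so the progress display and any step-keyed checkpointing stay aligned. A minimal sketch with assumed values:

# Assumed values for illustration; only the range pattern matters. `initial`
# is added here so the displayed count also starts at the resume offset (an
# addition beyond the patch itself).
from tqdm import tqdm

num_steps_per_epoch, start_step = 100, 40
for step in tqdm(range(start_step, num_steps_per_epoch), initial=start_step, total=num_steps_per_epoch):
    pass  # `step` runs 40..99, matching the pre-resume global step index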
From 977888b587073a44cd3f2a10c72c889a0c7f3eb0 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 22 Jan 2024 15:42:43 +0800
Subject: [PATCH 19/33] fix
---
colossalai/shardformer/policies/gpt2.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py
index dc659500892b..f428c3cd0d08 100644
--- a/colossalai/shardformer/policies/gpt2.py
+++ b/colossalai/shardformer/policies/gpt2.py
@@ -232,8 +232,6 @@ def module_policy(self):
module_policy = super().module_policy()
- setattr(self.shard_config, "causal_lm", True)
-
if self.shard_config.enable_tensor_parallelism:
addon_module = {
GPT2LMHeadModel: ModulePolicyDescription(
From f556e1dde0ad3525786d61448d014333dd9b1a37 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 22 Jan 2024 17:20:14 +0800
Subject: [PATCH 20/33] fix
---
.../gpt/hybridparallelism/benchmark.py | 217 ++++++++++++++++++
1 file changed, 217 insertions(+)
create mode 100644 examples/language/gpt/hybridparallelism/benchmark.py
diff --git a/examples/language/gpt/hybridparallelism/benchmark.py b/examples/language/gpt/hybridparallelism/benchmark.py
new file mode 100644
index 000000000000..2815faa4f104
--- /dev/null
+++ b/examples/language/gpt/hybridparallelism/benchmark.py
@@ -0,0 +1,217 @@
+import argparse
+import resource
+from contextlib import nullcontext
+
+import torch
+from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision
+from torch.optim import Adam
+from tqdm import tqdm
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
+
+import colossalai
+import colossalai.utils.device as device_utils
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, TorchFSDPPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
+from colossalai.utils import get_current_device
+from examples.language.data_utils import RandomDataset
+from examples.language.model_utils import format_numel_str, get_model_numel
+from examples.language.performance_evaluator import PerformanceEvaluator
+
+# ==============================
+# Constants
+# ==============================
+
+
+def main():
+ # ==============================
+ # Parse Arguments
+ # ==============================
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-c", "--config", type=str, default="7b", help="Model configuration")
+ parser.add_argument(
+ "-p",
+ "--plugin",
+ choices=["gemini", "gemini_auto", "fsdp", "fsdp_cpu", "3d", "3d_cpu"],
+ default="gemini",
+ help="Choose which plugin to use",
+ )
+ parser.add_argument("-b", "--batch_size", type=int, default=2, help="Batch size")
+ parser.add_argument("-s", "--num_steps", type=int, default=200, help="Number of steps to run")
+ parser.add_argument("-i", "--ignore_steps", type=int, default=1, help="Number of steps to ignore")
+ parser.add_argument("-g", "--grad_checkpoint", action="store_true", help="Use gradient checkpointing")
+ parser.add_argument("-l", "--max_length", type=int, default=1024, help="Max sequence length")
+ parser.add_argument(
+ "-w", "--warmup_ratio", type=float, default=0.8, help="warm up ratio of non-model data. Only for gemini-auto"
+ )
+ parser.add_argument("-m", "--memory_limit", type=int, help="Gemini memory limit in mb")
+ parser.add_argument("--shard_param_frac", type=float, default=1.0, help="Shard param fraction. Only for gemini")
+ parser.add_argument("--offload_optim_frac", type=float, default=0.0, help="Offload optim fraction. Only for gemini")
+ parser.add_argument("--offload_param_frac", type=float, default=0.0, help="Offload param fraction. Only for gemini")
+ parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size")
+ parser.add_argument("--extra_dp", type=int, default=1, help="Extra data parallel size, used for Gemini")
+ parser.add_argument("--pp", type=int, default=1, help="Pipeline parallel size")
+ parser.add_argument("--mbs", type=int, default=1)
+ parser.add_argument("--zero", type=int, default=0)
+ args = parser.parse_args()
+
+ colossalai.launch_from_torch({})
+ coordinator = DistCoordinator()
+
+ def empty_init():
+ pass
+
+ # ==============================
+ # Initialize Booster
+ # ==============================
+ use_empty_init = True
+ if args.plugin == "gemini":
+ plugin = GeminiPlugin(
+ precision="bf16",
+ shard_param_frac=args.shard_param_frac,
+ offload_optim_frac=args.offload_optim_frac,
+ offload_param_frac=args.offload_param_frac,
+ tp_size=args.tp,
+ extra_dp_size=args.extra_dp,
+ )
+ elif args.plugin == "gemini_auto":
+ plugin = GeminiPlugin(
+ placement_policy="auto",
+ precision="bf16",
+ warmup_non_model_data_ratio=args.warmup_ratio,
+ tp_size=args.tp,
+ extra_dp_size=args.extra_dp,
+ )
+ elif args.plugin == "fsdp":
+ if use_empty_init:
+ plugin = TorchFSDPPlugin(
+ mixed_precision=MixedPrecision(
+ param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16
+ ),
+ param_init_fn=empty_init(),
+ )
+ else:
+ plugin = TorchFSDPPlugin(
+ mixed_precision=MixedPrecision(
+ param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16
+ )
+ )
+ elif args.plugin == "fsdp_cpu":
+ if use_empty_init:
+ plugin = TorchFSDPPlugin(
+ mixed_precision=MixedPrecision(
+ param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16
+ ),
+ cpu_offload=CPUOffload(offload_params=True),
+ param_init_fn=empty_init(),
+ )
+ else:
+ plugin = TorchFSDPPlugin(
+ mixed_precision=MixedPrecision(
+ param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16
+ ),
+ cpu_offload=CPUOffload(offload_params=True),
+ )
+ elif args.plugin == "3d":
+ plugin = HybridParallelPlugin(
+ tp_size=args.tp,
+ pp_size=args.pp,
+ pp_style="interleaved",
+ zero_stage=args.zero,
+ num_model_chunks=2,
+ enable_all_optimization=True,
+ num_microbatches=args.mbs,
+ precision="bf16",
+ )
+ elif args.plugin == "3d_cpu":
+ plugin = HybridParallelPlugin(
+ tp_size=args.tp,
+ pp_size=args.pp,
+ zero_stage=args.zero,
+ cpu_offload=True,
+ enable_fused_normalization=torch.cuda.is_available(),
+ num_microbatches=args.mbs,
+ initial_scale=2**8,
+ precision="bf16",
+ )
+ else:
+ raise ValueError(f"Unknown plugin {args.plugin}")
+
+ booster = Booster(plugin=plugin)
+
+ # ==============================
+ # Initialize Dataset and Dataloader
+ # ==============================
+ dp_size = plugin.dp_size if isinstance(plugin, HybridParallelPlugin) else coordinator.world_size
+
+ config = GPT2Config(n_layer=24, n_embd=1024, n_head=16, n_positions=1024)
+ dataset = RandomDataset(
+ num_samples=args.batch_size * args.num_steps * dp_size, max_length=args.max_length, vocab_size=config.vocab_size
+ )
+ dataloader = plugin.prepare_dataloader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True)
+
+ # ==============================
+ # Initialize Model and Optimizer
+ # ==============================
+ init_ctx = (
+ LazyInitContext(default_device=get_current_device())
+ if isinstance(plugin, (GeminiPlugin, HybridParallelPlugin))
+ else nullcontext()
+ )
+
+ with init_ctx:
+ model = GPT2LMHeadModel(config)
+
+ if args.grad_checkpoint:
+ model.gradient_checkpointing_enable()
+
+ model_numel = get_model_numel(model)
+ coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")
+ performance_evaluator = PerformanceEvaluator(
+ model_numel,
+ model.config.n_layer,
+ model.config.n_embd,
+ model.config.vocab_size,
+ args.grad_checkpoint,
+ args.ignore_steps,
+ dp_world_size=dp_size,
+ )
+
+ optimizer = Adam(model.parameters())
+ torch.set_default_dtype(torch.bfloat16)
+ model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
+ torch.set_default_dtype(torch.float)
+ coordinator.print_on_master(f"Booster init max CUDA memory: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
+ coordinator.print_on_master(
+ f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024:.2f} MB"
+ )
+
+ if isinstance(plugin, HybridParallelPlugin) and args.pp > 1:
+ data_iter = iter(dataloader)
+ for step in tqdm(range(len(dataloader)), desc="Step", disable=not coordinator.is_master()):
+ performance_evaluator.on_step_start(step)
+ booster.execute_pipeline(
+ data_iter, model, criterion=lambda outputs, inputs: outputs[0], optimizer=optimizer, return_loss=False
+ )
+ optimizer.step()
+ optimizer.zero_grad()
+ performance_evaluator.on_step_end(input_ids=torch.empty(args.batch_size, args.max_length))
+ else:
+ for step, batch in enumerate(tqdm(dataloader, desc="Step", disable=not coordinator.is_master())):
+ performance_evaluator.on_step_start(step)
+ outputs = model(**batch)
+ loss = outputs[0]
+ booster.backward(loss, optimizer)
+ optimizer.step()
+ optimizer.zero_grad()
+ performance_evaluator.on_step_end(**batch)
+ coordinator.print_on_master(f"Max CUDA memory usage: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
+
+ performance_evaluator.on_fit_end()
+ coordinator.print_on_master(f"Max CUDA memory usage: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
+
+
+if __name__ == "__main__":
+ main()
From 2da389de8a0af2b46557a6d7a4d3e08b9e3647ae Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 22 Jan 2024 17:20:14 +0800
Subject: [PATCH 21/33] fix
---
.../gpt/hybridparallelism/benchmark.py | 217 ++++++++++++++++++
1 file changed, 217 insertions(+)
create mode 100644 examples/language/gpt/hybridparallelism/benchmark.py
diff --git a/examples/language/gpt/hybridparallelism/benchmark.py b/examples/language/gpt/hybridparallelism/benchmark.py
new file mode 100644
index 000000000000..1cf8aed14a59
--- /dev/null
+++ b/examples/language/gpt/hybridparallelism/benchmark.py
@@ -0,0 +1,217 @@
+import argparse
+import resource
+from contextlib import nullcontext
+
+import torch
+from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision
+from torch.optim import Adam
+from tqdm import tqdm
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
+
+import colossalai
+import colossalai.utils.device as device_utils
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, TorchFSDPPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
+from colossalai.utils import get_current_device
+from examples.language.data_utils import RandomDataset
+from examples.language.model_utils import format_numel_str, get_model_numel
+from examples.language.performance_evaluator import PerformanceEvaluator
+
+# ==============================
+# Constants
+# ==============================
+
+
+def main():
+ # ==============================
+ # Parse Arguments
+ # ==============================
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-c", "--config", type=str, default="7b", help="Model configuration")
+ parser.add_argument(
+ "-p",
+ "--plugin",
+ choices=["gemini", "gemini_auto", "fsdp", "fsdp_cpu", "3d", "3d_cpu"],
+ default="gemini",
+ help="Choose which plugin to use",
+ )
+ parser.add_argument("-b", "--batch_size", type=int, default=2, help="Batch size")
+ parser.add_argument("-s", "--num_steps", type=int, default=200, help="Number of steps to run")
+ parser.add_argument("-i", "--ignore_steps", type=int, default=1, help="Number of steps to ignore")
+ parser.add_argument("-g", "--grad_checkpoint", action="store_true", help="Use gradient checkpointing")
+ parser.add_argument("-l", "--max_length", type=int, default=1024, help="Max sequence length")
+ parser.add_argument(
+ "-w", "--warmup_ratio", type=float, default=0.8, help="warm up ratio of non-model data. Only for gemini-auto"
+ )
+ parser.add_argument("-m", "--memory_limit", type=int, help="Gemini memory limit in mb")
+ parser.add_argument("--shard_param_frac", type=float, default=1.0, help="Shard param fraction. Only for gemini")
+ parser.add_argument("--offload_optim_frac", type=float, default=0.0, help="Offload optim fraction. Only for gemini")
+ parser.add_argument("--offload_param_frac", type=float, default=0.0, help="Offload param fraction. Only for gemini")
+ parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size")
+ parser.add_argument("--extra_dp", type=int, default=1, help="Extra data parallel size, used for Gemini")
+ parser.add_argument("--pp", type=int, default=1, help="Pipeline parallel size")
+ parser.add_argument("--mbs", type=int, default=1)
+ parser.add_argument("--zero", type=int, default=0)
+ args = parser.parse_args()
+
+ colossalai.launch_from_torch({})
+ coordinator = DistCoordinator()
+
+ def empty_init():
+ pass
+
+ # ==============================
+ # Initialize Booster
+ # ==============================
+ use_empty_init = True
+ if args.plugin == "gemini":
+ plugin = GeminiPlugin(
+ precision="bf16",
+ shard_param_frac=args.shard_param_frac,
+ offload_optim_frac=args.offload_optim_frac,
+ offload_param_frac=args.offload_param_frac,
+ tp_size=args.tp,
+ extra_dp_size=args.extra_dp,
+ )
+ elif args.plugin == "gemini_auto":
+ plugin = GeminiPlugin(
+ placement_policy="auto",
+ precision="bf16",
+ warmup_non_model_data_ratio=args.warmup_ratio,
+ tp_size=args.tp,
+ extra_dp_size=args.extra_dp,
+ )
+ elif args.plugin == "fsdp":
+ if use_empty_init:
+ plugin = TorchFSDPPlugin(
+ mixed_precision=MixedPrecision(
+ param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16
+ ),
+ param_init_fn=empty_init(),
+ )
+ else:
+ plugin = TorchFSDPPlugin(
+ mixed_precision=MixedPrecision(
+ param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16
+ )
+ )
+ elif args.plugin == "fsdp_cpu":
+ if use_empty_init:
+ plugin = TorchFSDPPlugin(
+ mixed_precision=MixedPrecision(
+ param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16
+ ),
+ cpu_offload=CPUOffload(offload_params=True),
+ param_init_fn=empty_init(),
+ )
+ else:
+ plugin = TorchFSDPPlugin(
+ mixed_precision=MixedPrecision(
+ param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16
+ ),
+ cpu_offload=CPUOffload(offload_params=True),
+ )
+ elif args.plugin == "3d":
+ plugin = HybridParallelPlugin(
+ tp_size=args.tp,
+ pp_size=args.pp,
+ pp_style="1f1b",
+ zero_stage=args.zero,
+ num_model_chunks=1,
+ enable_all_optimization=True,
+ num_microbatches=args.mbs,
+ precision="bf16",
+ )
+ elif args.plugin == "3d_cpu":
+ plugin = HybridParallelPlugin(
+ tp_size=args.tp,
+ pp_size=args.pp,
+ zero_stage=args.zero,
+ cpu_offload=True,
+ enable_fused_normalization=torch.cuda.is_available(),
+ num_microbatches=args.mbs,
+ initial_scale=2**8,
+ precision="bf16",
+ )
+ else:
+ raise ValueError(f"Unknown plugin {args.plugin}")
+
+ booster = Booster(plugin=plugin)
+
+ # ==============================
+ # Initialize Dataset and Dataloader
+ # ==============================
+ dp_size = plugin.dp_size if isinstance(plugin, HybridParallelPlugin) else coordinator.world_size
+
+ config = GPT2Config(n_layer=24, n_embd=1024, n_head=16, n_positions=1024)
+ dataset = RandomDataset(
+ num_samples=args.batch_size * args.num_steps * dp_size, max_length=args.max_length, vocab_size=config.vocab_size
+ )
+ dataloader = plugin.prepare_dataloader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True)
+
+ # ==============================
+ # Initialize Model and Optimizer
+ # ==============================
+ init_ctx = (
+ LazyInitContext(default_device=get_current_device())
+ if isinstance(plugin, (GeminiPlugin, HybridParallelPlugin))
+ else nullcontext()
+ )
+
+ with init_ctx:
+ model = GPT2LMHeadModel(config)
+
+ if args.grad_checkpoint:
+ model.gradient_checkpointing_enable()
+
+ model_numel = get_model_numel(model)
+ coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")
+ performance_evaluator = PerformanceEvaluator(
+ model_numel,
+ model.config.n_layer,
+ model.config.n_embd,
+ model.config.vocab_size,
+ args.grad_checkpoint,
+ args.ignore_steps,
+ dp_world_size=dp_size,
+ )
+
+ optimizer = Adam(model.parameters())
+ torch.set_default_dtype(torch.bfloat16)
+ model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
+ torch.set_default_dtype(torch.float)
+ coordinator.print_on_master(f"Booster init max CUDA memory: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
+ coordinator.print_on_master(
+ f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024:.2f} MB"
+ )
+
+ if isinstance(plugin, HybridParallelPlugin) and args.pp > 1:
+ data_iter = iter(dataloader)
+ for step in tqdm(range(len(dataloader)), desc="Step", disable=not coordinator.is_master()):
+ performance_evaluator.on_step_start(step)
+ booster.execute_pipeline(
+ data_iter, model, criterion=lambda outputs, inputs: outputs[0], optimizer=optimizer, return_loss=False
+ )
+ optimizer.step()
+ optimizer.zero_grad()
+ performance_evaluator.on_step_end(input_ids=torch.empty(args.batch_size, args.max_length))
+ else:
+ for step, batch in enumerate(tqdm(dataloader, desc="Step", disable=not coordinator.is_master())):
+ performance_evaluator.on_step_start(step)
+ outputs = model(**batch)
+ loss = outputs[0]
+ booster.backward(loss, optimizer)
+ optimizer.step()
+ optimizer.zero_grad()
+ performance_evaluator.on_step_end(**batch)
+ coordinator.print_on_master(f"Max CUDA memory usage: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
+
+ performance_evaluator.on_fit_end()
+ coordinator.print_on_master(f"Max CUDA memory usage: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
+
+
+if __name__ == "__main__":
+ main()
From 46f4c87f14c5ae7b0808f6e19def73bdb648bc24 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 22 Jan 2024 17:20:14 +0800
Subject: [PATCH 22/33] fix
---
.../gpt/hybridparallelism/benchmark.py | 216 ++++++++++++++++++
1 file changed, 216 insertions(+)
create mode 100644 examples/language/gpt/hybridparallelism/benchmark.py
diff --git a/examples/language/gpt/hybridparallelism/benchmark.py b/examples/language/gpt/hybridparallelism/benchmark.py
new file mode 100644
index 000000000000..7893b13f3eb3
--- /dev/null
+++ b/examples/language/gpt/hybridparallelism/benchmark.py
@@ -0,0 +1,216 @@
+import argparse
+import resource
+from contextlib import nullcontext
+
+import torch
+from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision
+from torch.optim import Adam
+from tqdm import tqdm
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
+
+import colossalai
+import colossalai.utils.device as device_utils
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, TorchFSDPPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
+from colossalai.utils import get_current_device
+from examples.language.data_utils import RandomDataset
+from examples.language.model_utils import format_numel_str, get_model_numel
+from examples.language.performance_evaluator import PerformanceEvaluator
+
+# ==============================
+# Constants
+# ==============================
+
+
+def main():
+ # ==============================
+ # Parse Arguments
+ # ==============================
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-p",
+ "--plugin",
+ choices=["gemini", "gemini_auto", "fsdp", "fsdp_cpu", "3d", "3d_cpu"],
+ default="gemini",
+ help="Choose which plugin to use",
+ )
+ parser.add_argument("-b", "--batch_size", type=int, default=2, help="Batch size")
+ parser.add_argument("-s", "--num_steps", type=int, default=200, help="Number of steps to run")
+ parser.add_argument("-i", "--ignore_steps", type=int, default=1, help="Number of steps to ignore")
+ parser.add_argument("-g", "--grad_checkpoint", action="store_true", help="Use gradient checkpointing")
+ parser.add_argument("-l", "--max_length", type=int, default=1024, help="Max sequence length")
+ parser.add_argument(
+ "-w", "--warmup_ratio", type=float, default=0.8, help="warm up ratio of non-model data. Only for gemini-auto"
+ )
+ parser.add_argument("-m", "--memory_limit", type=int, help="Gemini memory limit in mb")
+ parser.add_argument("--shard_param_frac", type=float, default=1.0, help="Shard param fraction. Only for gemini")
+ parser.add_argument("--offload_optim_frac", type=float, default=0.0, help="Offload optim fraction. Only for gemini")
+ parser.add_argument("--offload_param_frac", type=float, default=0.0, help="Offload param fraction. Only for gemini")
+ parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size")
+ parser.add_argument("--extra_dp", type=int, default=1, help="Extra data parallel size, used for Gemini")
+ parser.add_argument("--pp", type=int, default=1, help="Pipeline parallel size")
+ parser.add_argument("--mbs", type=int, default=1)
+ parser.add_argument("--zero", type=int, default=0)
+ args = parser.parse_args()
+
+ colossalai.launch_from_torch({})
+ coordinator = DistCoordinator()
+
+ def empty_init():
+ pass
+
+ # ==============================
+ # Initialize Booster
+ # ==============================
+ use_empty_init = True
+ if args.plugin == "gemini":
+ plugin = GeminiPlugin(
+ precision="bf16",
+ shard_param_frac=args.shard_param_frac,
+ offload_optim_frac=args.offload_optim_frac,
+ offload_param_frac=args.offload_param_frac,
+ tp_size=args.tp,
+ extra_dp_size=args.extra_dp,
+ )
+ elif args.plugin == "gemini_auto":
+ plugin = GeminiPlugin(
+ placement_policy="auto",
+ precision="bf16",
+ warmup_non_model_data_ratio=args.warmup_ratio,
+ tp_size=args.tp,
+ extra_dp_size=args.extra_dp,
+ )
+ elif args.plugin == "fsdp":
+ if use_empty_init:
+ plugin = TorchFSDPPlugin(
+ mixed_precision=MixedPrecision(
+ param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16
+ ),
+ param_init_fn=empty_init(),
+ )
+ else:
+ plugin = TorchFSDPPlugin(
+ mixed_precision=MixedPrecision(
+ param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16
+ )
+ )
+ elif args.plugin == "fsdp_cpu":
+ if use_empty_init:
+ plugin = TorchFSDPPlugin(
+ mixed_precision=MixedPrecision(
+ param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16
+ ),
+ cpu_offload=CPUOffload(offload_params=True),
+ param_init_fn=empty_init(),
+ )
+ else:
+ plugin = TorchFSDPPlugin(
+ mixed_precision=MixedPrecision(
+ param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16
+ ),
+ cpu_offload=CPUOffload(offload_params=True),
+ )
+ elif args.plugin == "3d":
+ plugin = HybridParallelPlugin(
+ tp_size=args.tp,
+ pp_size=args.pp,
+ pp_style="1f1b",
+ zero_stage=args.zero,
+ num_model_chunks=1,
+ enable_all_optimization=True,
+ num_microbatches=args.mbs,
+ precision="bf16",
+ )
+ elif args.plugin == "3d_cpu":
+ plugin = HybridParallelPlugin(
+ tp_size=args.tp,
+ pp_size=args.pp,
+ zero_stage=args.zero,
+ cpu_offload=True,
+ enable_fused_normalization=torch.cuda.is_available(),
+ num_microbatches=args.mbs,
+ initial_scale=2**8,
+ precision="bf16",
+ )
+ else:
+ raise ValueError(f"Unknown plugin {args.plugin}")
+
+ booster = Booster(plugin=plugin)
+
+ # ==============================
+ # Initialize Dataset and Dataloader
+ # ==============================
+ dp_size = plugin.dp_size if isinstance(plugin, HybridParallelPlugin) else coordinator.world_size
+
+ config = GPT2Config(n_layer=24, n_embd=1024, n_head=16, n_positions=1024, activation_function="gelu")
+ dataset = RandomDataset(
+ num_samples=args.batch_size * args.num_steps * dp_size, max_length=args.max_length, vocab_size=config.vocab_size
+ )
+ dataloader = plugin.prepare_dataloader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True)
+
+ # ==============================
+ # Initialize Model and Optimizer
+ # ==============================
+ init_ctx = (
+ LazyInitContext(default_device=get_current_device())
+ if isinstance(plugin, (GeminiPlugin, HybridParallelPlugin))
+ else nullcontext()
+ )
+
+ with init_ctx:
+ model = GPT2LMHeadModel(config)
+
+ if args.grad_checkpoint:
+ model.gradient_checkpointing_enable()
+
+ model_numel = get_model_numel(model)
+ coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")
+ performance_evaluator = PerformanceEvaluator(
+ model_numel,
+ model.config.n_layer,
+ model.config.n_embd,
+ model.config.vocab_size,
+ args.grad_checkpoint,
+ args.ignore_steps,
+ dp_world_size=dp_size,
+ )
+
+ optimizer = Adam(model.parameters())
+ torch.set_default_dtype(torch.bfloat16)
+ model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
+ torch.set_default_dtype(torch.float)
+ coordinator.print_on_master(f"Booster init max CUDA memory: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
+ coordinator.print_on_master(
+ f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024:.2f} MB"
+ )
+
+ if isinstance(plugin, HybridParallelPlugin) and args.pp > 1:
+ data_iter = iter(dataloader)
+ for step in tqdm(range(len(dataloader)), desc="Step", disable=not coordinator.is_master()):
+ performance_evaluator.on_step_start(step)
+ booster.execute_pipeline(
+ data_iter, model, criterion=lambda outputs, inputs: outputs[0], optimizer=optimizer, return_loss=False
+ )
+ optimizer.step()
+ optimizer.zero_grad()
+ performance_evaluator.on_step_end(input_ids=torch.empty(args.batch_size, args.max_length))
+ else:
+ for step, batch in enumerate(tqdm(dataloader, desc="Step", disable=not coordinator.is_master())):
+ performance_evaluator.on_step_start(step)
+ outputs = model(**batch)
+ loss = outputs[0]
+ booster.backward(loss, optimizer)
+ optimizer.step()
+ optimizer.zero_grad()
+ performance_evaluator.on_step_end(**batch)
+ coordinator.print_on_master(f"Max CUDA memory usage: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
+
+ performance_evaluator.on_fit_end()
+ coordinator.print_on_master(f"Max CUDA memory usage: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
+
+
+if __name__ == "__main__":
+ main()
From d2593b86beee05bbc29b820dc02577e37cf9d5d7 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Sun, 28 Jan 2024 22:42:43 +0800
Subject: [PATCH 23/33] fix
---
colossalai/shardformer/modeling/gpt2.py | 5 --
colossalai/shardformer/policies/gpt2.py | 66 ++++++++++++++-----
.../gpt/hybridparallelism/benchmark.py | 13 +++-
3 files changed, 60 insertions(+), 24 deletions(-)
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index fd9b4c908bba..ed321f4ce908 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -1065,11 +1065,6 @@ def forward(
)
hidden_states = transformer_outputs[0]
- # Set device for model parallelism
- if self.model_parallel:
- torch.cuda.set_device(self.transformer.first_device)
- hidden_states = hidden_states.to(self.lm_head.weight.device)
-
lm_logits = self.lm_head(hidden_states)
loss = None
diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py
index f428c3cd0d08..a0774aee6d3a 100644
--- a/colossalai/shardformer/policies/gpt2.py
+++ b/colossalai/shardformer/policies/gpt2.py
@@ -172,15 +172,35 @@ def get_held_layers(self) -> List[nn.Module]:
stage_manager = self.pipeline_stage_manager
held_layers = []
- layers_per_stage = self.distribute_layers(len(module.h), stage_manager.num_stages)
- if stage_manager.is_first_stage():
- held_layers.append(module.wte)
- held_layers.append(module.wpe)
- held_layers.append(module.drop)
- start_idx, end_idx = self.get_stage_index(layers_per_stage, stage_manager.stage)
- held_layers.extend(module.h[start_idx:end_idx])
- if stage_manager.is_last_stage():
- held_layers.append(module.ln_f)
+ if stage_manager.is_interleave:
+ assert stage_manager.num_model_chunks is not None
+ layers_per_stage = self.distribute_layers(
+ len(module.h), stage_manager.num_stages * stage_manager.num_model_chunks
+ )
+ stage_indices = Policy.get_stage_index(
+ layers_per_stage,
+ stage_manager.stage,
+ num_model_chunks=stage_manager.num_model_chunks,
+ num_stages=stage_manager.num_stages,
+ )
+ if stage_manager.is_first_stage(ignore_chunk=True):
+ held_layers.append(module.wte)
+ held_layers.append(module.wpe)
+ held_layers.append(module.drop)
+ for start_idx, end_idx in stage_indices:
+ held_layers.extend(module.h[start_idx:end_idx])
+ if stage_manager.is_last_stage(ignore_chunk=True):
+ held_layers.append(module.ln_f)
+ else:
+ layers_per_stage = self.distribute_layers(len(module.h), stage_manager.num_stages)
+ if stage_manager.is_first_stage():
+ held_layers.append(module.wte)
+ held_layers.append(module.wpe)
+ held_layers.append(module.drop)
+ start_idx, end_idx = self.get_stage_index(layers_per_stage, stage_manager.stage)
+ held_layers.extend(module.h[start_idx:end_idx])
+ if stage_manager.is_last_stage():
+ held_layers.append(module.ln_f)
return held_layers
def set_pipeline_forward(self, model_cls: nn.Module, new_forward: Callable, policy: Dict) -> None:
@@ -194,13 +214,27 @@ def set_pipeline_forward(self, model_cls: nn.Module, new_forward: Callable, poli
else:
module = self.model.transformer
- layers_per_stage = Policy.distribute_layers(len(module.h), stage_manager.num_stages)
- stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)
- method_replacement = {
- "forward": partial(
- new_forward, stage_manager=stage_manager, stage_index=stage_index, shard_config=self.shard_config
+ if stage_manager.is_interleave:
+ layers_per_stage = self.distribute_layers(
+ len(module.h), stage_manager.num_stages * stage_manager.num_model_chunks
)
- }
+ stage_manager.stage_indices = Policy.get_stage_index(
+ layers_per_stage,
+ stage_manager.stage,
+ num_model_chunks=stage_manager.num_model_chunks,
+ num_stages=stage_manager.num_stages,
+ )
+ method_replacement = {
+ "forward": partial(new_forward, stage_manager=stage_manager, shard_config=self.shard_config)
+ }
+ else:
+ layers_per_stage = Policy.distribute_layers(len(module.h), stage_manager.num_stages)
+ stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)
+ method_replacement = {
+ "forward": partial(
+ new_forward, stage_manager=stage_manager, stage_index=stage_index, shard_config=self.shard_config
+ )
+ }
self.append_or_create_method_replacement(description=method_replacement, policy=policy, target_key=model_cls)
@@ -255,7 +289,7 @@ def module_policy(self):
def get_held_layers(self) -> List[nn.Module]:
held_layers = super().get_held_layers()
- if self.pipeline_stage_manager.is_last_stage():
+ if self.pipeline_stage_manager.is_last_stage(ignore_chunk=True):
held_layers.append(self.model.lm_head)
return held_layers
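With interleaved scheduling, each pipeline stage holds `num_model_chunks` disjoint ranges of blocks, which is why the layer count is divided over `num_stages * num_model_chunks` partitions and the stage index becomes a list of `(start, end)` pairs. A simplified sketch of that mapping; ColossalAI's own `Policy.distribute_layers` may place remainder layers differently, so treat this as an illustration of the assignment rule only:

# Simplified interleaved layer assignment: chunk c of stage s owns partition
# index c * num_stages + s. Not ColossalAI's exact implementation.
from typing import List, Tuple

def distribute_layers(num_layers: int, num_parts: int) -> List[int]:
    base, rem = divmod(num_layers, num_parts)
    return [base + (1 if i < rem else 0) for i in range(num_parts)]

def stage_indices(layers_per_part: List[int], stage: int, num_stages: int,
                  num_model_chunks: int) -> List[Tuple[int, int]]:
    starts = [0]
    for n in layers_per_part:
        starts.append(starts[-1] + n)
    return [
        (starts[c * num_stages + stage], starts[c * num_stages + stage + 1])
        for c in range(num_model_chunks)
    ]

# e.g. 24 GPT-2 blocks, 2 stages, 2 chunks -> stage 0 holds h[0:6] and h[12:18]
print(stage_indices(distribute_layers(24, 4), stage=0, num_stages=2, num_model_chunks=2))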
diff --git a/examples/language/gpt/hybridparallelism/benchmark.py b/examples/language/gpt/hybridparallelism/benchmark.py
index 7893b13f3eb3..7916bbdfb231 100644
--- a/examples/language/gpt/hybridparallelism/benchmark.py
+++ b/examples/language/gpt/hybridparallelism/benchmark.py
@@ -24,12 +24,19 @@
# Constants
# ==============================
+MODEL_CONFIGS = {
+ "small": GPT2Config(),
+    "medium": GPT2Config(n_embd=1024, n_head=16, n_layer=24, activation_function="gelu"),
+ "large": GPT2Config(n_embd=1280, n_head=20, n_layer=36),
+}
+
def main():
# ==============================
# Parse Arguments
# ==============================
parser = argparse.ArgumentParser()
+ parser.add_argument("-c", "--config", type=str, default="medium", help="Model configuration")
parser.add_argument(
"-p",
"--plugin",
@@ -117,9 +124,9 @@ def empty_init():
plugin = HybridParallelPlugin(
tp_size=args.tp,
pp_size=args.pp,
- pp_style="1f1b",
+ pp_style="interleaved",
zero_stage=args.zero,
- num_model_chunks=1,
+ num_model_chunks=2,
enable_all_optimization=True,
num_microbatches=args.mbs,
precision="bf16",
@@ -145,7 +152,7 @@ def empty_init():
# ==============================
dp_size = plugin.dp_size if isinstance(plugin, HybridParallelPlugin) else coordinator.world_size
- config = GPT2Config(n_layer=24, n_embd=1024, n_head=16, n_positions=1024, activation_function="gelu")
+ config = MODEL_CONFIGS[args.config]
dataset = RandomDataset(
num_samples=args.batch_size * args.num_steps * dp_size, max_length=args.max_length, vocab_size=config.vocab_size
)
From 30ffe1029e18d2777af5db082f3ba28658051b80 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 12 Jan 2024 18:42:02 +0800
Subject: [PATCH 24/33] benchmark gpt2
---
.../pipeline/schedule/interleaved_pp.py | 2 +-
colossalai/pipeline/schedule/one_f_one_b.py | 2 +-
colossalai/shardformer/modeling/gpt2.py | 106 +++++++++++++++++-
colossalai/shardformer/policies/gpt2.py | 16 ++-
examples/language/llama2/benchmark.py | 2 +-
.../test_model/test_shard_bert.py | 13 ++-
.../test_model/test_shard_t5.py | 10 +-
7 files changed, 130 insertions(+), 21 deletions(-)
diff --git a/colossalai/pipeline/schedule/interleaved_pp.py b/colossalai/pipeline/schedule/interleaved_pp.py
index a4ace5e1baad..6d2a6ce4717f 100644
--- a/colossalai/pipeline/schedule/interleaved_pp.py
+++ b/colossalai/pipeline/schedule/interleaved_pp.py
@@ -23,7 +23,7 @@ def __init__(
num_model_chunks: int,
num_microbatch: Optional[int] = None,
microbatch_size: Optional[int] = None,
- enable_metadata_cache: bool = True,
+ enable_metadata_cache: bool = False,
) -> None:
super().__init__(stage_manager)
assert (
diff --git a/colossalai/pipeline/schedule/one_f_one_b.py b/colossalai/pipeline/schedule/one_f_one_b.py
index bf2f01b10e9b..03cca8feb5e6 100644
--- a/colossalai/pipeline/schedule/one_f_one_b.py
+++ b/colossalai/pipeline/schedule/one_f_one_b.py
@@ -31,7 +31,7 @@ def __init__(
stage_manager: PipelineStageManager,
num_microbatches: Optional[int] = None,
microbatch_size: Optional[int] = None,
- enable_metadata_cache: bool = True,
+ enable_metadata_cache: bool = False,
) -> None:
"""1F1B pipeline schedule.
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index 055e3096d794..ca806713c046 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -24,6 +24,8 @@
from colossalai.shardformer.layer._operation import gather_forward_split_backward, split_forward_gather_backward
from colossalai.shardformer.shard import ShardConfig
+from ..layer import cross_entropy_1d
+
class GPT2PipelineForwards:
"""
@@ -326,7 +328,15 @@ def gpt2_lmhead_model_forward(
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
- loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+ shift_logits = shift_logits.view(-1, shift_logits.size(-1))
+ shift_labels = shift_labels.view(-1)
+ if shard_config.enable_tensor_parallelism:
+ loss = cross_entropy_1d(
+ shift_logits, shift_labels, process_group=shard_config.tensor_parallel_process_group
+ )
+ else:
+ loss = loss_fct(shift_logits, shift_labels)
+
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
@@ -716,7 +726,7 @@ def gpt2_for_sequence_classification_forward(
)
-def get_gpt2_flash_attention_forward():
+def get_gpt2_flash_attention_forward(shard_config: ShardConfig):
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention
from colossalai.nn.layer.colo_attention import AttnMaskType, ColoAttention
@@ -767,10 +777,12 @@ def forward(
else:
present = None
+ flash_attention_mask = None
if not self.is_cross_attention:
attn_mask_type = AttnMaskType.causal
- flash_attention_mask = None
- if attention_mask != None:
+ else:
+ attn_mask_type = None
+        if not getattr(shard_config, "causal_lm", False) and attention_mask is not None:
flash_attention_mask = ~(attention_mask[:, :, -1].squeeze(1).to(torch.bool)).contiguous()
if not torch.all(flash_attention_mask):
if attn_mask_type == AttnMaskType.causal:
@@ -1006,3 +1018,89 @@ def custom_forward(*inputs):
)
return forward
+
+
+def get_lm_forward_with_dist_cross_entropy(shard_config: ShardConfig):
+ from transformers import GPT2LMHeadModel
+
+ def forward(
+ self: GPT2LMHeadModel,
+ input_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
+            are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.transformer(
+ input_ids,
+ past_key_values=past_key_values,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+
+ # Set device for model parallelism
+ if self.model_parallel:
+ torch.cuda.set_device(self.transformer.first_device)
+ hidden_states = hidden_states.to(self.lm_head.weight.device)
+
+ lm_logits = self.lm_head(hidden_states)
+
+ loss = None
+ if labels is not None:
+ # move labels to correct device to enable model parallelism
+ labels = labels.to(lm_logits.device)
+ # Shift so that tokens < n predict n
+ shift_logits = lm_logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, shift_logits.size(-1))
+ shift_labels = shift_labels.view(-1)
+ if shard_config.enable_tensor_parallelism:
+ loss = cross_entropy_1d(
+ shift_logits, shift_labels, process_group=shard_config.tensor_parallel_process_group
+ )
+ else:
+ loss = loss_fct(shift_logits, shift_labels)
+
+ if not return_dict:
+ output = (lm_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return CausalLMOutputWithCrossAttentions(
+ loss=loss,
+ logits=lm_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ cross_attentions=transformer_outputs.cross_attentions,
+ )
+
+ return forward
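
Context for the forward above: with the LM head left column-sharded (the policy hunk below sets gather_output=False), cross_entropy_1d computes the loss without ever materializing the full [tokens, vocab] logits on one rank. A minimal, forward-only sketch of such a vocab-parallel loss, assuming the usual formulation; the real kernel also implements the backward pass and ignore-index masking, both omitted here:

import torch
import torch.distributed as dist

def vocab_parallel_cross_entropy(local_logits, labels, process_group=None):
    # local_logits: this rank's [N, vocab/tp] shard; labels: [N] global token ids.
    rank = dist.get_rank(process_group)
    local_vocab = local_logits.size(-1)
    vocab_start = rank * local_vocab

    # Stabilize the softmax with the max over the full (sharded) vocab dim.
    logits_max = local_logits.max(dim=-1).values
    dist.all_reduce(logits_max, op=dist.ReduceOp.MAX, group=process_group)
    shifted = local_logits - logits_max.unsqueeze(-1)

    # Denominator: every rank contributes the exp-sum over its vocab shard.
    sum_exp = shifted.exp().sum(dim=-1)
    dist.all_reduce(sum_exp, op=dist.ReduceOp.SUM, group=process_group)

    # Numerator: only the rank that owns a label's vocab slice contributes.
    in_shard = (labels >= vocab_start) & (labels < vocab_start + local_vocab)
    idx = (labels - vocab_start).clamp(0, local_vocab - 1).unsqueeze(-1)
    target_logit = shifted.gather(-1, idx).squeeze(-1) * in_shard
    dist.all_reduce(target_logit, op=dist.ReduceOp.SUM, group=process_group)

    return (sum_exp.log() - target_logit).mean()

Only per-token scalars are all-reduced, so the all-gather of logits that gather_output=True would have required is avoided entirely.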
diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py
index 022e6ff5b32c..dc659500892b 100644
--- a/colossalai/shardformer/policies/gpt2.py
+++ b/colossalai/shardformer/policies/gpt2.py
@@ -5,7 +5,12 @@
import colossalai.shardformer.layer as col_nn
-from ..modeling.gpt2 import GPT2PipelineForwards, get_gpt2_flash_attention_forward, gpt2_sequence_parallel_forward_fn
+from ..modeling.gpt2 import (
+ GPT2PipelineForwards,
+ get_gpt2_flash_attention_forward,
+ get_lm_forward_with_dist_cross_entropy,
+ gpt2_sequence_parallel_forward_fn,
+)
from .base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
__all__ = [
@@ -142,7 +147,7 @@ def module_policy(self):
if self.shard_config.enable_flash_attention:
self.append_or_create_method_replacement(
description={
- "forward": get_gpt2_flash_attention_forward(),
+ "forward": get_gpt2_flash_attention_forward(self.shard_config),
},
policy=policy,
target_key=GPT2Attention,
@@ -227,14 +232,17 @@ def module_policy(self):
module_policy = super().module_policy()
+ setattr(self.shard_config, "causal_lm", True)
+
if self.shard_config.enable_tensor_parallelism:
addon_module = {
GPT2LMHeadModel: ModulePolicyDescription(
sub_module_replacement=[
SubModuleReplacementDescription(
- suffix="lm_head", target_module=col_nn.Linear1D_Col, kwargs={"gather_output": True}
+ suffix="lm_head", target_module=col_nn.Linear1D_Col, kwargs={"gather_output": False}
)
- ]
+ ],
+ method_replacement={"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)},
)
}
module_policy.update(addon_module)
diff --git a/examples/language/llama2/benchmark.py b/examples/language/llama2/benchmark.py
index b8f70ce9c9d8..017b4f263943 100644
--- a/examples/language/llama2/benchmark.py
+++ b/examples/language/llama2/benchmark.py
@@ -239,4 +239,4 @@ def empty_init():
if __name__ == "__main__":
- main()
+ main()
\ No newline at end of file
diff --git a/tests/test_shardformer/test_model/test_shard_bert.py b/tests/test_shardformer/test_model/test_shard_bert.py
index 768bd95bdb42..5a1d8c5727ea 100644
--- a/tests/test_shardformer/test_model/test_shard_bert.py
+++ b/tests/test_shardformer/test_model/test_shard_bert.py
@@ -1,6 +1,8 @@
import pytest
import torch
+torch.cuda.set_per_process_memory_fraction(0.125, 0)
+
import colossalai
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer.layer.utils import Randomizer
@@ -158,6 +160,7 @@ def run_bert_test(test_config):
for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
+ print(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
clear_layout_converter()
Randomizer.reset_index()
torch.cuda.empty_cache()
@@ -227,11 +230,11 @@ def test_bert():
spawn(check_bert, 4)
-@pytest.mark.largedist
-@rerun_if_address_is_in_use()
-@clear_cache_before_run()
-def test_bert_3d():
- spawn(check_bert_3d, 8)
+# @pytest.mark.largedist
+# @rerun_if_address_is_in_use()
+# @clear_cache_before_run()
+# def test_bert_3d():
+# spawn(check_bert_3d, 8)
if __name__ == "__main__":
diff --git a/tests/test_shardformer/test_model/test_shard_t5.py b/tests/test_shardformer/test_model/test_shard_t5.py
index 22c201458ad4..05cf958fa675 100644
--- a/tests/test_shardformer/test_model/test_shard_t5.py
+++ b/tests/test_shardformer/test_model/test_shard_t5.py
@@ -211,11 +211,11 @@ def test_t5():
spawn(check_t5, 4)
-@pytest.mark.largedist
-@rerun_if_address_is_in_use()
-@clear_cache_before_run()
-def test_t5_3d():
- spawn(check_t5_3d, 8)
+# @pytest.mark.largedist
+# @rerun_if_address_is_in_use()
+# @clear_cache_before_run()
+# def test_t5_3d():
+# spawn(check_t5_3d, 8)
if __name__ == "__main__":
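
One more note on this patch: it also flips enable_metadata_cache to False on both pipeline schedules (patch 25 below restores True). The flag governs whether p2p sends re-transmit tensor metadata (shape, dtype) for every microbatch or cache it after the first one; a rough sketch of the idea, with all names illustrative rather than the library's API:

import torch

class P2PMetaCache:
    # Remember (shape, dtype) after the first send; later sends ship raw bytes only.
    def __init__(self, enable: bool = True):
        self.enable = enable
        self._meta = None

    def needs_metadata(self, t: torch.Tensor) -> bool:
        meta = (tuple(t.shape), t.dtype)
        if self.enable and self._meta == meta:
            return False
        self._meta = meta if self.enable else None
        return True

Caching is only safe when every microbatch carries identically shaped tensors, which is presumably why it is toggled off while debugging the benchmark.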
From 1149884bcacdcfecad38a1766bc5974c94a7ccc0 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 22 Jan 2024 15:24:35 +0800
Subject: [PATCH 25/33] fix
fix
fix
fix
---
colossalai/pipeline/schedule/interleaved_pp.py | 2 +-
colossalai/pipeline/schedule/one_f_one_b.py | 2 +-
examples/__init__.py | 0
examples/language/__init__.py | 0
examples/language/{llama2 => }/data_utils.py | 2 +-
examples/language/{llama2 => }/model_utils.py | 0
.../language/{llama2 => }/performance_evaluator.py | 0
.../test_shardformer/test_model/test_shard_bert.py | 13 +++++--------
tests/test_shardformer/test_model/test_shard_t5.py | 10 +++++-----
9 files changed, 13 insertions(+), 16 deletions(-)
create mode 100644 examples/__init__.py
create mode 100644 examples/language/__init__.py
rename examples/language/{llama2 => }/data_utils.py (99%)
rename examples/language/{llama2 => }/model_utils.py (100%)
rename examples/language/{llama2 => }/performance_evaluator.py (100%)
diff --git a/colossalai/pipeline/schedule/interleaved_pp.py b/colossalai/pipeline/schedule/interleaved_pp.py
index 6d2a6ce4717f..a4ace5e1baad 100644
--- a/colossalai/pipeline/schedule/interleaved_pp.py
+++ b/colossalai/pipeline/schedule/interleaved_pp.py
@@ -23,7 +23,7 @@ def __init__(
num_model_chunks: int,
num_microbatch: Optional[int] = None,
microbatch_size: Optional[int] = None,
- enable_metadata_cache: bool = False,
+ enable_metadata_cache: bool = True,
) -> None:
super().__init__(stage_manager)
assert (
diff --git a/colossalai/pipeline/schedule/one_f_one_b.py b/colossalai/pipeline/schedule/one_f_one_b.py
index 03cca8feb5e6..bf2f01b10e9b 100644
--- a/colossalai/pipeline/schedule/one_f_one_b.py
+++ b/colossalai/pipeline/schedule/one_f_one_b.py
@@ -31,7 +31,7 @@ def __init__(
stage_manager: PipelineStageManager,
num_microbatches: Optional[int] = None,
microbatch_size: Optional[int] = None,
- enable_metadata_cache: bool = False,
+ enable_metadata_cache: bool = True,
) -> None:
"""1F1B pipeline schedule.
diff --git a/examples/__init__.py b/examples/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/examples/language/__init__.py b/examples/language/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/examples/language/llama2/data_utils.py b/examples/language/data_utils.py
similarity index 99%
rename from examples/language/llama2/data_utils.py
rename to examples/language/data_utils.py
index 6b9e8ef28eb7..ec849ef9d1eb 100644
--- a/examples/language/llama2/data_utils.py
+++ b/examples/language/data_utils.py
@@ -121,4 +121,4 @@ def __getitem__(self, idx):
"input_ids": self.input_ids[idx],
"attention_mask": self.attention_mask[idx],
"labels": self.input_ids[idx],
- }
+ }
\ No newline at end of file
diff --git a/examples/language/llama2/model_utils.py b/examples/language/model_utils.py
similarity index 100%
rename from examples/language/llama2/model_utils.py
rename to examples/language/model_utils.py
diff --git a/examples/language/llama2/performance_evaluator.py b/examples/language/performance_evaluator.py
similarity index 100%
rename from examples/language/llama2/performance_evaluator.py
rename to examples/language/performance_evaluator.py
diff --git a/tests/test_shardformer/test_model/test_shard_bert.py b/tests/test_shardformer/test_model/test_shard_bert.py
index 5a1d8c5727ea..768bd95bdb42 100644
--- a/tests/test_shardformer/test_model/test_shard_bert.py
+++ b/tests/test_shardformer/test_model/test_shard_bert.py
@@ -1,8 +1,6 @@
import pytest
import torch
-torch.cuda.set_per_process_memory_fraction(0.125, 0)
-
import colossalai
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer.layer.utils import Randomizer
@@ -160,7 +158,6 @@ def run_bert_test(test_config):
for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
- print(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
clear_layout_converter()
Randomizer.reset_index()
torch.cuda.empty_cache()
@@ -230,11 +227,11 @@ def test_bert():
spawn(check_bert, 4)
-# @pytest.mark.largedist
-# @rerun_if_address_is_in_use()
-# @clear_cache_before_run()
-# def test_bert_3d():
-# spawn(check_bert_3d, 8)
+@pytest.mark.largedist
+@rerun_if_address_is_in_use()
+@clear_cache_before_run()
+def test_bert_3d():
+ spawn(check_bert_3d, 8)
if __name__ == "__main__":
diff --git a/tests/test_shardformer/test_model/test_shard_t5.py b/tests/test_shardformer/test_model/test_shard_t5.py
index 05cf958fa675..22c201458ad4 100644
--- a/tests/test_shardformer/test_model/test_shard_t5.py
+++ b/tests/test_shardformer/test_model/test_shard_t5.py
@@ -211,11 +211,11 @@ def test_t5():
spawn(check_t5, 4)
-# @pytest.mark.largedist
-# @rerun_if_address_is_in_use()
-# @clear_cache_before_run()
-# def test_t5_3d():
-# spawn(check_t5_3d, 8)
+@pytest.mark.largedist
+@rerun_if_address_is_in_use()
+@clear_cache_before_run()
+def test_t5_3d():
+ spawn(check_t5_3d, 8)
if __name__ == "__main__":
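
For context on the file moves above: data_utils.py, model_utils.py and performance_evaluator.py are hoisted out of examples/language/llama2/ so multiple benchmarks can share them, and the new __init__.py files let them resolve as a package. A minimal sketch of the RandomDataset the benchmark consumes; the __getitem__ body matches the hunk above, while the constructor shown here is an assumption:

import torch
from torch.utils.data import Dataset

class RandomDataset(Dataset):
    def __init__(self, num_samples: int, max_length: int, vocab_size: int):
        # Synthetic token ids are enough for throughput benchmarking.
        self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length))
        self.attention_mask = torch.ones_like(self.input_ids)

    def __len__(self):
        return self.input_ids.size(0)

    def __getitem__(self, idx):
        # labels mirror input_ids; the LM forward shifts them internally.
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.input_ids[idx],
        }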
From e5a33da8bad9e0c39c68dfbee2838ca42b3d5aaf Mon Sep 17 00:00:00 2001
From: Frank Lee
Date: Wed, 10 Jan 2024 22:34:16 +0800
Subject: [PATCH 26/33] [workflow] fixed build CI (#5240)
* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish
---
.github/workflows/build_on_pr.yml | 2 +-
.github/workflows/build_on_schedule.yml | 2 +-
tests/kit/model_zoo/registry.py | 2 +-
.../test_plugin/test_gemini_plugin.py | 2 +-
.../test_plugin/test_low_level_zero_plugin.py | 163 ++++++++----------
.../test_plugin/test_torch_ddp_plugin.py | 127 +++++---------
.../test_plugin/test_torch_fsdp_plugin.py | 153 +++++++++-------
.../test_gemini_checkpoint_io.py | 2 +-
tests/test_lazy/test_models.py | 2 +-
9 files changed, 205 insertions(+), 250 deletions(-)
diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 2cad504f3391..b01d15490e0f 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -201,4 +201,4 @@ jobs:
uses: actions/upload-artifact@v3
with:
name: report
- path: report/
+ path: report/
\ No newline at end of file
diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index ae1a5275e5da..510665b46f4b 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -83,4 +83,4 @@ jobs:
SERVER_URL: ${{github.server_url }}
REPO: ${{ github.repository }}
RUN_ID: ${{ github.run_id }}
- WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
+ WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
\ No newline at end of file
diff --git a/tests/kit/model_zoo/registry.py b/tests/kit/model_zoo/registry.py
index a16b16ad6af7..fce81ab52c2b 100644
--- a/tests/kit/model_zoo/registry.py
+++ b/tests/kit/model_zoo/registry.py
@@ -102,4 +102,4 @@ def get_sub_registry(
return new_dict
-model_zoo = ModelZooRegistry()
+model_zoo = ModelZooRegistry()
\ No newline at end of file
diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py
index 17dfa3a1860d..0f72d2bcd3e4 100644
--- a/tests/test_booster/test_plugin/test_gemini_plugin.py
+++ b/tests/test_booster/test_plugin/test_gemini_plugin.py
@@ -185,4 +185,4 @@ def test_gemini_plugin_3d(early_stop: bool = True):
if __name__ == "__main__":
- test_gemini_plugin(early_stop=False)
+ test_gemini_plugin(early_stop=False)
\ No newline at end of file
diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
index 861fa0131397..439d7778575a 100644
--- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
+++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
@@ -1,106 +1,85 @@
-from typing import Optional
-
import torch
import torch.distributed as dist
-from torch.optim import Adam
+from torchvision.models import resnet18
+from utils import shared_tempdir
import colossalai
-from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import LowLevelZeroPlugin
-
-# from colossalai.nn.optimizer import HybridAdam
-from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
-
-# These models are not compatible with AMP
-_AMP_ERR_MODELS = ["timm_convit", "deepfm_interactionarch"]
-# These models have no parameters
-_LOW_LEVEL_ZERO_ERR_MODELS = ["dlrm_interactionarch"]
-# These models get stuck; to be fixed
-_STUCK_MODELS = ["transformers_albert_for_multiple_choice"]
-
-
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.testing import (
+ check_state_dict_equal,
+ clear_cache_before_run,
+ parameterize,
+ rerun_if_address_is_in_use,
+ spawn,
+)
+from colossalai.zero import LowLevelZeroOptimizer
+
+
+# stage 1 and 2 process the optimizer/model the same way,
+# so testing stage 2 alone is enough
@clear_cache_before_run()
-def run_fn(stage, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]:
- device = get_accelerator().get_current_device()
- try:
- plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=2**5)
- booster = Booster(plugin=plugin)
- model = model_fn()
- optimizer = Adam(model.parameters(), lr=1e-3)
- criterion = lambda x: x.mean()
- data = data_gen_fn()
-
- data = {
- k: v.to(device) if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()
- }
-
- model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
-
- output = model(**data)
- output = output_transform_fn(output)
- output_key = list(output.keys())[0]
- loss = criterion(output[output_key])
-
- booster.backward(loss, optimizer)
- optimizer.step()
-
- except Exception as e:
- return repr(e)
-
-
@parameterize("stage", [2])
-def check_low_level_zero_plugin(stage: int, early_stop: bool = True):
- """check low level zero plugin over model zoo
-
- Args:
-        stage (int): stage of the low level zero plugin
- early_stop (bool, optional): Whether to stop when getting the first error. Defaults to True.
- """
- passed_models = []
- failed_info = {} # (model_name, error) pair
- ignore_models = _AMP_ERR_MODELS + _LOW_LEVEL_ZERO_ERR_MODELS + _STUCK_MODELS
- skipped_models = []
-
- if IS_FAST_TEST:
- registry = model_zoo.get_sub_registry(COMMON_MODELS)
- else:
- registry = model_zoo
-
- for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
- # FIXME(ver217): fix these models
- if name in ignore_models:
- skipped_models.append(name)
- continue
- err = run_fn(stage, model_fn, data_gen_fn, output_transform_fn)
-
- get_accelerator().empty_cache()
-
- if err is None:
- passed_models.append(name)
- else:
- failed_info[name] = err
- if early_stop:
- break
-
- if dist.get_rank() == 0:
- print(f"Passed models({len(passed_models)}): {passed_models}\n\n")
- print(f"Failed models({len(failed_info)}): {list(failed_info.keys())}\n\n")
- print(f"Skipped models({len(skipped_models)}): {skipped_models}\n\n")
- assert len(failed_info) == 0, "\n".join([f"{k}: {v}" for k, v in failed_info.items()])
-
-
-def run_dist(rank, world_size, port, early_stop: bool = True):
- # init dist env
- colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost")
- check_low_level_zero_plugin(early_stop=early_stop)
+@parameterize("shard", [True, False])
+@parameterize("offload", [False, True])
+def check_low_level_zero_checkpointIO(stage: int, shard: bool, offload: bool):
+ plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=32, cpu_offload=offload)
+ booster = Booster(plugin=plugin)
+ model = resnet18()
+ criterion = lambda x: x.mean()
+ optimizer = HybridAdam((model.parameters()), lr=0.001)
+ model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
+
+ x = torch.randn(1, 3, 224, 224, device="cuda")
+ output = model(x)
+ loss = criterion(output)
+ booster.backward(loss, optimizer)
+ optimizer.step()
+ with shared_tempdir() as tempdir:
+ model_ckpt_path = f"{tempdir}/model"
+ optimizer_ckpt_path = f"{tempdir}/optimizer"
+        # lr scheduler is tested in test_torch_ddp_checkpoint_io.py and low level zero does not change it, so we can skip it here
+ booster.save_model(model, model_ckpt_path, shard=shard)
+ booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=shard)
+
+ dist.barrier()
+
+ new_model = resnet18()
+ new_optimizer = HybridAdam((new_model.parameters()), lr=0.001)
+ new_model, new_optimizer, _, _, _ = booster.boost(new_model, new_optimizer)
+
+ booster.load_model(new_model, model_ckpt_path)
+ check_state_dict_equal(model.state_dict(), new_model.state_dict(), False)
+ # check master weight
+ assert isinstance(new_optimizer, LowLevelZeroOptimizer)
+ working_param_id_set = set(id(p) for p in new_model.parameters())
+ for p_id, master_param in new_optimizer._param_store.working_to_master_param.items():
+ assert p_id in working_param_id_set
+ working_param = new_optimizer._param_store.master_to_working_param[id(master_param)]
+ padding = new_optimizer._param_store.get_param_padding_size(working_param)
+ padded_param = torch.nn.functional.pad(working_param.data.view(-1), (0, padding))
+ working_shard = padded_param.chunk(dist.get_world_size())[dist.get_rank()]
+ assert torch.equal(
+ working_shard, master_param.data.view(-1).to(dtype=padded_param.dtype, device=padded_param.device)
+ )
+
+ booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
+ check_state_dict_equal(optimizer.optim.state_dict(), new_optimizer.optim.state_dict(), False)
+ torch.cuda.empty_cache()
+
+
+def run_dist(rank, world_size, port):
+ colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host="localhost")
+ check_low_level_zero_checkpointIO()
+ torch.cuda.empty_cache()
@rerun_if_address_is_in_use()
-def test_low_level_zero_plugin(early_stop: bool = True):
- spawn(run_dist, 2, early_stop=early_stop)
+@clear_cache_before_run()
+def test_low_level_zero_checkpointIO():
+ spawn(run_dist, 2)
if __name__ == "__main__":
- test_low_level_zero_plugin(early_stop=False)
+ test_low_level_zero_checkpointIO()
\ No newline at end of file
diff --git a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
index e785843fb053..87d2adf6b5b6 100644
--- a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
+++ b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
@@ -1,119 +1,70 @@
-from contextlib import nullcontext
-
import torch
import torch.distributed as dist
-import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import SGD
+from torchvision.models import resnet18
+from utils import shared_tempdir
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin
from colossalai.interface import OptimizerWrapper
-from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
+from colossalai.testing import check_state_dict_equal, parameterize, rerun_if_address_is_in_use, spawn
-@clear_cache_before_run()
-def run_fn(model_fn, data_gen_fn, output_transform_fn):
+@parameterize("shard", [True, False])
+@parameterize("size_per_shard", [16, 128])
+def check_torch_ddp_checkpointIO(shard: bool, size_per_shard: int):
plugin = TorchDDPPlugin()
booster = Booster(plugin=plugin)
- model = model_fn()
- optimizer = SGD(model.parameters(), lr=1e-3)
+ model = resnet18()
criterion = lambda x: x.mean()
- data = data_gen_fn()
-
- data = {k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()}
-
- model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
+ optimizer = SGD((model.parameters()), lr=0.001)
+ scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
+ model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion, lr_scheduler=scheduler)
assert isinstance(model.module, DDP)
assert isinstance(optimizer, OptimizerWrapper)
- output = model(**data)
- output = output_transform_fn(output)
- output_key = list(output.keys())[0]
- loss = criterion(output[output_key])
-
+ x = torch.randn(4, 3, 224, 224)
+ x = x.to("cuda")
+ output = model(x)
+ loss = criterion(output)
booster.backward(loss, optimizer)
optimizer.clip_grad_by_norm(1.0)
optimizer.step()
+ scheduler.step()
+ with shared_tempdir() as tempdir:
+ model_ckpt_path = f"{tempdir}/model"
+ optimizer_ckpt_path = f"{tempdir}/optimizer"
+ lr_scheduler_ckpt_path = f"{tempdir}/lr_scheduler"
+ booster.save_model(model, model_ckpt_path, shard=shard, size_per_shard=size_per_shard)
+ booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=shard, size_per_shard=size_per_shard)
+ booster.save_lr_scheduler(scheduler, lr_scheduler_ckpt_path)
+ dist.barrier()
-def check_torch_ddp_plugin():
- if IS_FAST_TEST:
- registry = model_zoo.get_sub_registry(COMMON_MODELS)
- else:
- registry = model_zoo
-
- for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
- if name == "dlrm_interactionarch":
- continue
- run_fn(model_fn, data_gen_fn, output_transform_fn)
- torch.cuda.empty_cache()
-
-
-class DummyModel(nn.Module):
- def __init__(self):
- super().__init__()
- self.weight = nn.Parameter(torch.rand(1))
-
- def forward(self, x):
- return self.weight * x
-
-
-def check_torch_ddp_no_sync():
- plugin = TorchDDPPlugin()
- booster = Booster(plugin=plugin)
-
- model = DummyModel()
- criterion = lambda x: x.mean()
- optimizer = SGD(model.parameters(), lr=1e-3)
-    # create a custom dataset with integers 0 to 9
- dataset = torch.arange(0, 10)
- train_dataloader = plugin.prepare_dataloader(dataset, batch_size=2)
- model, optimizer, criterion, train_dataloader, _ = booster.boost(
- model, optimizer, criterion, dataloader=train_dataloader
- )
-
- def fwd_bwd():
- output = model(batch.cuda())
- loss = criterion(output)
- booster.backward(loss, optimizer)
+ new_model = resnet18()
+ new_optimizer = SGD((new_model.parameters()), lr=0.001)
+ new_scheduler = torch.optim.lr_scheduler.StepLR(new_optimizer, step_size=1, gamma=0.1)
+ new_model, new_optimizer, _, _, new_scheduler = booster.boost(
+ new_model, new_optimizer, lr_scheduler=new_scheduler
+ )
- def get_grad_set_over_all_ranks():
- for p in model.parameters():
- # grad shape is (1, )
- assert p.grad.shape == (1,)
- grad_list = [torch.empty_like(p.grad) for _ in range(dist.get_world_size())]
- dist.all_gather(grad_list, p.grad)
- # get grad set of all ranks
- grad_set = set([grad.item() for grad in grad_list])
- # as the model only has one parameter, we can return here
- return grad_set
+ booster.load_model(new_model, model_ckpt_path)
+ check_state_dict_equal(model.state_dict(), new_model.state_dict(), False)
- for i, batch in enumerate(train_dataloader):
- if i > 1:
- # only check the first two batches
- break
- # no_sync for the first batch, sync for the second batch
- ctx = booster.no_sync(model) if i == 0 else nullcontext()
- with ctx:
- fwd_bwd()
- grad_set = get_grad_set_over_all_ranks()
- # for the first batch, all ranks should have different grads
-        # for the second batch, as grads are synchronized, all ranks should have the same grads
- target_num_different_grad = dist.get_world_size() if i == 0 else 1
- assert len(grad_set) == target_num_different_grad
+ booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
+ check_state_dict_equal(optimizer.state_dict(), new_optimizer.state_dict(), False)
+ booster.load_lr_scheduler(new_scheduler, lr_scheduler_ckpt_path)
+ check_state_dict_equal(scheduler.state_dict(), new_scheduler.state_dict(), False)
def run_dist(rank, world_size, port):
- # init dist env
- colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost")
- check_torch_ddp_plugin()
- check_torch_ddp_no_sync()
+ colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host="localhost")
+ check_torch_ddp_checkpointIO()
@rerun_if_address_is_in_use()
-def test_torch_ddp_plugin():
- spawn(run_dist, 2)
+def test_torch_ddp_checkpointIO():
+ spawn(run_dist, 2)
\ No newline at end of file
diff --git a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
index f698070465d6..2badd914a19e 100644
--- a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
+++ b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
@@ -2,86 +2,111 @@
import torch
from packaging import version
from torch.optim import SGD
+from torchvision.models import resnet18
+from utils import shared_tempdir
import colossalai
from colossalai.booster import Booster
if version.parse(torch.__version__) >= version.parse("1.12.0"):
- from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from colossalai.booster.plugin import TorchFSDPPlugin
-from colossalai.interface import OptimizerWrapper
-from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
-
-
-# test basic fsdp function
-@clear_cache_before_run()
-def run_fn(model_fn, data_gen_fn, output_transform_fn):
+from colossalai.testing import rerun_if_address_is_in_use, spawn
+
+
+def compare_nested_dict(dict1, dict2):
+ for key in dict1:
+ if key in dict2:
+ if type(dict1[key]) is dict:
+ assert type(dict2[key]) is dict
+ diff = compare_nested_dict(dict1[key], dict2[key])
+ if not diff:
+ return diff
+ elif type(dict1[key]) is list:
+ assert type(dict2[key]) is list
+ for i, val in enumerate(dict1[key]):
+ if isinstance(val, torch.Tensor):
+ if not torch.equal(dict1[key][i], dict2[key][i]):
+ return False
+ elif val != dict2[key][i]:
+ return False
+ elif type(dict1[key]) is torch.Tensor:
+ assert type(dict2[key]) is torch.Tensor
+ if not torch.equal(dict1[key], dict2[key]):
+ return False
+ else:
+ if dict1[key] != dict2[key]:
+ return False
+ else:
+ return False
+ return True
+
+
+def check_torch_fsdp_ckpt():
+ model = resnet18()
plugin = TorchFSDPPlugin()
booster = Booster(plugin=plugin)
- model = model_fn()
- optimizer = SGD(model.parameters(), lr=1e-3)
+ optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9)
criterion = lambda x: x.mean()
- data = data_gen_fn()
-
- data = {k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()}
-
- model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
-
- assert isinstance(model.module, FSDP)
- assert isinstance(optimizer, OptimizerWrapper)
-
- output = model(**data)
- output = output_transform_fn(output)
- output_key = list(output.keys())[0]
- loss = criterion(output[output_key])
-
- booster.backward(loss, optimizer)
- optimizer.clip_grad_by_norm(1.0)
- optimizer.step()
-
- del model
- del optimizer
- del criterion
- del booster
- del plugin
-
-
-def check_torch_fsdp_plugin():
- if IS_FAST_TEST:
- registry = model_zoo.get_sub_registry(COMMON_MODELS)
- else:
- registry = model_zoo.get_sub_registry("transformers_gptj")
-
- for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
- if any(
- element in name
- for element in [
- "diffusers",
- "deepfm_sparsearch",
- "dlrm_interactionarch",
- "torchvision_googlenet",
- "torchvision_inception_v3",
- ]
- ):
- continue
- print(name)
- run_fn(model_fn, data_gen_fn, output_transform_fn)
- torch.cuda.empty_cache()
+ fsdp_model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
+
+ inputs = torch.randn(4, 3, 224, 224)
+ outputs = None
+
+ def run_model():
+ nonlocal outputs
+ outputs = fsdp_model(inputs)
+ optimizer.zero_grad()
+ criterion(outputs).backward()
+ optimizer.step()
+
+ with shared_tempdir() as tempdir:
+ model_ckpt_path = f"{tempdir}/model"
+ optim_ckpt_path = f"{tempdir}/optimizer"
+
+ run_model()
+
+ booster.save_model(fsdp_model, model_ckpt_path, shard=False)
+ booster.save_optimizer(optimizer, optim_ckpt_path, shard=False)
+
+ full_msd = fsdp_model.state_dict()
+ # full_osd = FSDP.full_optim_state_dict(fsdp_model, optimizer)
+ sharded_osd = optimizer.state_dict()
+ import copy
+
+ sharded_osd = copy.deepcopy(sharded_osd)
+
+ run_model()
+
+ full_msd_updated = fsdp_model.state_dict()
+ # full_osd_updated = FSDP.full_optim_state_dict(fsdp_model, optimizer, rank0_only=True)
+ sharded_osd_updated = optimizer.state_dict()
+
+ assert not compare_nested_dict(sharded_osd, sharded_osd_updated)
+ assert not compare_nested_dict(full_msd_updated, full_msd)
+ outputs_first = fsdp_model(inputs)
+ assert criterion(outputs_first) != criterion(outputs)
+
+ booster.load_model(fsdp_model, model_ckpt_path)
+ booster.load_optimizer(optimizer, optim_ckpt_path)
+
+ full_msd_restore = fsdp_model.state_dict()
+ # full_osd_restore = FSDP.full_optim_state_dict(fsdp_model, optimizer, rank0_only=True)
+ sharded_osd_restore = optimizer.state_dict()
+
+ assert compare_nested_dict(sharded_osd, sharded_osd_restore)
+ assert compare_nested_dict(full_msd_restore, full_msd)
+ outputs_sec = fsdp_model(inputs)
+ assert criterion(outputs_sec) == criterion(outputs)
def run_dist(rank, world_size, port):
# init dist env
colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost")
- check_torch_fsdp_plugin()
+ check_torch_fsdp_ckpt()
@pytest.mark.skipif(version.parse(torch.__version__) < version.parse("1.12.0"), reason="requires torch1.12 or higher")
@rerun_if_address_is_in_use()
-def test_torch_fsdp_plugin():
- spawn(run_dist, 2)
-
-
-if __name__ == "__main__":
- test_torch_fsdp_plugin()
+def test_torch_fsdp_ckpt():
+ spawn(run_dist, 2)
\ No newline at end of file
diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
index 708a1906b118..894d88b0f86b 100644
--- a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
@@ -182,4 +182,4 @@ def test_gemini_ckpIO_3d():
if __name__ == "__main__":
- test_gemini_ckpIO()
+ test_gemini_ckpIO()
\ No newline at end of file
diff --git a/tests/test_lazy/test_models.py b/tests/test_lazy/test_models.py
index d0c4cd0a7c48..aeca5f21dc1d 100644
--- a/tests/test_lazy/test_models.py
+++ b/tests/test_lazy/test_models.py
@@ -24,4 +24,4 @@ def test_torchvision_models_lazy_init(subset, default_device):
if __name__ == "__main__":
- test_torchvision_models_lazy_init("transformers", "cpu")
+ test_torchvision_models_lazy_init("transformers", "cpu")
\ No newline at end of file
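
The rewritten tests above import shared_tempdir from a sibling utils module that is not part of this diff. A plausible minimal implementation, stated as an assumption rather than the repository's code: rank 0 owns a temporary directory and broadcasts its path so every rank saves and loads checkpoints in the same place.

import tempfile
from contextlib import contextmanager

import torch.distributed as dist

@contextmanager
def shared_tempdir():
    # Rank 0 creates the directory; every rank receives the same path.
    tmp = tempfile.TemporaryDirectory() if dist.get_rank() == 0 else None
    holder = [tmp.name if tmp is not None else None]
    dist.broadcast_object_list(holder, src=0)
    try:
        yield holder[0]
    finally:
        dist.barrier()  # keep the dir alive until all ranks finish loading
        if tmp is not None:
            tmp.cleanup()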
From c15223b54a054162655bfbc1f118a5c7c342b7ff Mon Sep 17 00:00:00 2001
From: Frank Lee
Date: Thu, 11 Jan 2024 16:04:45 +0800
Subject: [PATCH 27/33] [ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
---
tests/test_booster/test_plugin/test_3d_plugin.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_booster/test_plugin/test_3d_plugin.py b/tests/test_booster/test_plugin/test_3d_plugin.py
index 67b0bef50594..9afd098b83ad 100644
--- a/tests/test_booster/test_plugin/test_3d_plugin.py
+++ b/tests/test_booster/test_plugin/test_3d_plugin.py
@@ -262,4 +262,4 @@ def test_gemini_plugin(early_stop: bool = True):
if __name__ == "__main__":
- test_gemini_plugin(early_stop=False)
+ test_gemini_plugin(early_stop=False)
\ No newline at end of file
From cc2fac8b6f90c943e1d969e223524884ad5c5361 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 29 Jan 2024 19:24:27 +0800
Subject: [PATCH 28/33] fix
fix
---
colossalai/shardformer/policies/gpt2.py | 2 -
.../gpt/hybridparallelism/benchmark.py | 2 +-
.../test_plugin/test_low_level_zero_plugin.py | 163 ++++++++++--------
.../test_plugin/test_torch_ddp_plugin.py | 127 +++++++++-----
.../test_plugin/test_torch_fsdp_plugin.py | 153 +++++++---------
5 files changed, 245 insertions(+), 202 deletions(-)
diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py
index 47e1395a36c8..a0774aee6d3a 100644
--- a/colossalai/shardformer/policies/gpt2.py
+++ b/colossalai/shardformer/policies/gpt2.py
@@ -266,8 +266,6 @@ def module_policy(self):
module_policy = super().module_policy()
- setattr(self.shard_config, "causal_lm", True)
-
if self.shard_config.enable_tensor_parallelism:
addon_module = {
GPT2LMHeadModel: ModulePolicyDescription(
diff --git a/examples/language/gpt/hybridparallelism/benchmark.py b/examples/language/gpt/hybridparallelism/benchmark.py
index 7916bbdfb231..61324a8dc25c 100644
--- a/examples/language/gpt/hybridparallelism/benchmark.py
+++ b/examples/language/gpt/hybridparallelism/benchmark.py
@@ -26,7 +26,7 @@
MODEL_CONFIGS = {
"small": GPT2Config(),
- "medium": GPT2Config(n_head=16, n_layer=24, activation_function="gelu"),
+ "medium": GPT2Config(n_embd=1024, n_head=16, n_layer=24, activation_function="gelu"),
"large": GPT2Config(n_embd=1280, n_head=20, n_layer=36),
}
diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
index 439d7778575a..861fa0131397 100644
--- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
+++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
@@ -1,85 +1,106 @@
+from typing import Optional
+
import torch
import torch.distributed as dist
-from torchvision.models import resnet18
-from utils import shared_tempdir
+from torch.optim import Adam
import colossalai
+from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import LowLevelZeroPlugin
-from colossalai.nn.optimizer import HybridAdam
-from colossalai.testing import (
- check_state_dict_equal,
- clear_cache_before_run,
- parameterize,
- rerun_if_address_is_in_use,
- spawn,
-)
-from colossalai.zero import LowLevelZeroOptimizer
-
-
-# stage 1 and 2 process the optimizer/model the same way,
-# so testing stage 2 alone is enough
+
+# from colossalai.nn.optimizer import HybridAdam
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
+from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
+
+# These models are not compatible with AMP
+_AMP_ERR_MODELS = ["timm_convit", "deepfm_interactionarch"]
+# These models have no parameters
+_LOW_LEVEL_ZERO_ERR_MODELS = ["dlrm_interactionarch"]
+# These models get stuck; to be fixed
+_STUCK_MODELS = ["transformers_albert_for_multiple_choice"]
+
+
@clear_cache_before_run()
+def run_fn(stage, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]:
+ device = get_accelerator().get_current_device()
+ try:
+ plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=2**5)
+ booster = Booster(plugin=plugin)
+ model = model_fn()
+ optimizer = Adam(model.parameters(), lr=1e-3)
+ criterion = lambda x: x.mean()
+ data = data_gen_fn()
+
+ data = {
+ k: v.to(device) if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()
+ }
+
+ model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
+
+ output = model(**data)
+ output = output_transform_fn(output)
+ output_key = list(output.keys())[0]
+ loss = criterion(output[output_key])
+
+ booster.backward(loss, optimizer)
+ optimizer.step()
+
+ except Exception as e:
+ return repr(e)
+
+
@parameterize("stage", [2])
-@parameterize("shard", [True, False])
-@parameterize("offload", [False, True])
-def check_low_level_zero_checkpointIO(stage: int, shard: bool, offload: bool):
- plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=32, cpu_offload=offload)
- booster = Booster(plugin=plugin)
- model = resnet18()
- criterion = lambda x: x.mean()
- optimizer = HybridAdam((model.parameters()), lr=0.001)
- model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
-
- x = torch.randn(1, 3, 224, 224, device="cuda")
- output = model(x)
- loss = criterion(output)
- booster.backward(loss, optimizer)
- optimizer.step()
- with shared_tempdir() as tempdir:
- model_ckpt_path = f"{tempdir}/model"
- optimizer_ckpt_path = f"{tempdir}/optimizer"
-        # lr scheduler is tested in test_torch_ddp_checkpoint_io.py and low level zero does not change it, so we can skip it here
- booster.save_model(model, model_ckpt_path, shard=shard)
- booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=shard)
-
- dist.barrier()
-
- new_model = resnet18()
- new_optimizer = HybridAdam((new_model.parameters()), lr=0.001)
- new_model, new_optimizer, _, _, _ = booster.boost(new_model, new_optimizer)
-
- booster.load_model(new_model, model_ckpt_path)
- check_state_dict_equal(model.state_dict(), new_model.state_dict(), False)
- # check master weight
- assert isinstance(new_optimizer, LowLevelZeroOptimizer)
- working_param_id_set = set(id(p) for p in new_model.parameters())
- for p_id, master_param in new_optimizer._param_store.working_to_master_param.items():
- assert p_id in working_param_id_set
- working_param = new_optimizer._param_store.master_to_working_param[id(master_param)]
- padding = new_optimizer._param_store.get_param_padding_size(working_param)
- padded_param = torch.nn.functional.pad(working_param.data.view(-1), (0, padding))
- working_shard = padded_param.chunk(dist.get_world_size())[dist.get_rank()]
- assert torch.equal(
- working_shard, master_param.data.view(-1).to(dtype=padded_param.dtype, device=padded_param.device)
- )
-
- booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
- check_state_dict_equal(optimizer.optim.state_dict(), new_optimizer.optim.state_dict(), False)
- torch.cuda.empty_cache()
-
-
-def run_dist(rank, world_size, port):
- colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host="localhost")
- check_low_level_zero_checkpointIO()
- torch.cuda.empty_cache()
+def check_low_level_zero_plugin(stage: int, early_stop: bool = True):
+ """check low level zero plugin over model zoo
+
+ Args:
+        stage (int): stage of the low level zero plugin
+ early_stop (bool, optional): Whether to stop when getting the first error. Defaults to True.
+ """
+ passed_models = []
+ failed_info = {} # (model_name, error) pair
+ ignore_models = _AMP_ERR_MODELS + _LOW_LEVEL_ZERO_ERR_MODELS + _STUCK_MODELS
+ skipped_models = []
+
+ if IS_FAST_TEST:
+ registry = model_zoo.get_sub_registry(COMMON_MODELS)
+ else:
+ registry = model_zoo
+
+ for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
+ # FIXME(ver217): fix these models
+ if name in ignore_models:
+ skipped_models.append(name)
+ continue
+ err = run_fn(stage, model_fn, data_gen_fn, output_transform_fn)
+
+ get_accelerator().empty_cache()
+
+ if err is None:
+ passed_models.append(name)
+ else:
+ failed_info[name] = err
+ if early_stop:
+ break
+
+ if dist.get_rank() == 0:
+ print(f"Passed models({len(passed_models)}): {passed_models}\n\n")
+ print(f"Failed models({len(failed_info)}): {list(failed_info.keys())}\n\n")
+ print(f"Skipped models({len(skipped_models)}): {skipped_models}\n\n")
+ assert len(failed_info) == 0, "\n".join([f"{k}: {v}" for k, v in failed_info.items()])
+
+
+def run_dist(rank, world_size, port, early_stop: bool = True):
+ # init dist env
+ colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost")
+ check_low_level_zero_plugin(early_stop=early_stop)
@rerun_if_address_is_in_use()
-@clear_cache_before_run()
-def test_low_level_zero_checkpointIO():
- spawn(run_dist, 2)
+def test_low_level_zero_plugin(early_stop: bool = True):
+ spawn(run_dist, 2, early_stop=early_stop)
if __name__ == "__main__":
- test_low_level_zero_checkpointIO()
\ No newline at end of file
+ test_low_level_zero_plugin(early_stop=False)
diff --git a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
index 87d2adf6b5b6..e785843fb053 100644
--- a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
+++ b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
@@ -1,70 +1,119 @@
+from contextlib import nullcontext
+
import torch
import torch.distributed as dist
+import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import SGD
-from torchvision.models import resnet18
-from utils import shared_tempdir
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin
from colossalai.interface import OptimizerWrapper
-from colossalai.testing import check_state_dict_equal, parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
+from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
-@parameterize("shard", [True, False])
-@parameterize("size_per_shard", [16, 128])
-def check_torch_ddp_checkpointIO(shard: bool, size_per_shard: int):
+@clear_cache_before_run()
+def run_fn(model_fn, data_gen_fn, output_transform_fn):
plugin = TorchDDPPlugin()
booster = Booster(plugin=plugin)
- model = resnet18()
+ model = model_fn()
+ optimizer = SGD(model.parameters(), lr=1e-3)
criterion = lambda x: x.mean()
- optimizer = SGD((model.parameters()), lr=0.001)
- scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
- model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion, lr_scheduler=scheduler)
+ data = data_gen_fn()
+
+ data = {k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()}
+
+ model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
assert isinstance(model.module, DDP)
assert isinstance(optimizer, OptimizerWrapper)
- x = torch.randn(4, 3, 224, 224)
- x = x.to("cuda")
- output = model(x)
- loss = criterion(output)
+ output = model(**data)
+ output = output_transform_fn(output)
+ output_key = list(output.keys())[0]
+ loss = criterion(output[output_key])
+
booster.backward(loss, optimizer)
optimizer.clip_grad_by_norm(1.0)
optimizer.step()
- scheduler.step()
- with shared_tempdir() as tempdir:
- model_ckpt_path = f"{tempdir}/model"
- optimizer_ckpt_path = f"{tempdir}/optimizer"
- lr_scheduler_ckpt_path = f"{tempdir}/lr_scheduler"
- booster.save_model(model, model_ckpt_path, shard=shard, size_per_shard=size_per_shard)
- booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=shard, size_per_shard=size_per_shard)
- booster.save_lr_scheduler(scheduler, lr_scheduler_ckpt_path)
- dist.barrier()
- new_model = resnet18()
- new_optimizer = SGD((new_model.parameters()), lr=0.001)
- new_scheduler = torch.optim.lr_scheduler.StepLR(new_optimizer, step_size=1, gamma=0.1)
- new_model, new_optimizer, _, _, new_scheduler = booster.boost(
- new_model, new_optimizer, lr_scheduler=new_scheduler
- )
+def check_torch_ddp_plugin():
+ if IS_FAST_TEST:
+ registry = model_zoo.get_sub_registry(COMMON_MODELS)
+ else:
+ registry = model_zoo
+
+ for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
+ if name == "dlrm_interactionarch":
+ continue
+ run_fn(model_fn, data_gen_fn, output_transform_fn)
+ torch.cuda.empty_cache()
+
+
+class DummyModel(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.weight = nn.Parameter(torch.rand(1))
+
+ def forward(self, x):
+ return self.weight * x
+
+
+def check_torch_ddp_no_sync():
+ plugin = TorchDDPPlugin()
+ booster = Booster(plugin=plugin)
+
+ model = DummyModel()
+ criterion = lambda x: x.mean()
+ optimizer = SGD(model.parameters(), lr=1e-3)
+    # create a custom dataset with integers 0 to 9
+ dataset = torch.arange(0, 10)
+ train_dataloader = plugin.prepare_dataloader(dataset, batch_size=2)
+ model, optimizer, criterion, train_dataloader, _ = booster.boost(
+ model, optimizer, criterion, dataloader=train_dataloader
+ )
+
+ def fwd_bwd():
+ output = model(batch.cuda())
+ loss = criterion(output)
+ booster.backward(loss, optimizer)
- booster.load_model(new_model, model_ckpt_path)
- check_state_dict_equal(model.state_dict(), new_model.state_dict(), False)
+ def get_grad_set_over_all_ranks():
+ for p in model.parameters():
+ # grad shape is (1, )
+ assert p.grad.shape == (1,)
+ grad_list = [torch.empty_like(p.grad) for _ in range(dist.get_world_size())]
+ dist.all_gather(grad_list, p.grad)
+ # get grad set of all ranks
+ grad_set = set([grad.item() for grad in grad_list])
+ # as the model only has one parameter, we can return here
+ return grad_set
- booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
- check_state_dict_equal(optimizer.state_dict(), new_optimizer.state_dict(), False)
- booster.load_lr_scheduler(new_scheduler, lr_scheduler_ckpt_path)
- check_state_dict_equal(scheduler.state_dict(), new_scheduler.state_dict(), False)
+ for i, batch in enumerate(train_dataloader):
+ if i > 1:
+ # only check the first two batches
+ break
+ # no_sync for the first batch, sync for the second batch
+ ctx = booster.no_sync(model) if i == 0 else nullcontext()
+ with ctx:
+ fwd_bwd()
+ grad_set = get_grad_set_over_all_ranks()
+ # for the first batch, all ranks should have different grads
+        # for the second batch, as grads are synchronized, all ranks should have the same grads
+ target_num_different_grad = dist.get_world_size() if i == 0 else 1
+ assert len(grad_set) == target_num_different_grad
def run_dist(rank, world_size, port):
- colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host="localhost")
- check_torch_ddp_checkpointIO()
+ # init dist env
+ colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost")
+ check_torch_ddp_plugin()
+ check_torch_ddp_no_sync()
@rerun_if_address_is_in_use()
-def test_torch_ddp_checkpointIO():
- spawn(run_dist, 2)
\ No newline at end of file
+def test_torch_ddp_plugin():
+ spawn(run_dist, 2)
diff --git a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
index 2badd914a19e..f698070465d6 100644
--- a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
+++ b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
@@ -2,111 +2,86 @@
import torch
from packaging import version
from torch.optim import SGD
-from torchvision.models import resnet18
-from utils import shared_tempdir
import colossalai
from colossalai.booster import Booster
if version.parse(torch.__version__) >= version.parse("1.12.0"):
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from colossalai.booster.plugin import TorchFSDPPlugin
-from colossalai.testing import rerun_if_address_is_in_use, spawn
-
-
-def compare_nested_dict(dict1, dict2):
- for key in dict1:
- if key in dict2:
- if type(dict1[key]) is dict:
- assert type(dict2[key]) is dict
- diff = compare_nested_dict(dict1[key], dict2[key])
- if not diff:
- return diff
- elif type(dict1[key]) is list:
- assert type(dict2[key]) is list
- for i, val in enumerate(dict1[key]):
- if isinstance(val, torch.Tensor):
- if not torch.equal(dict1[key][i], dict2[key][i]):
- return False
- elif val != dict2[key][i]:
- return False
- elif type(dict1[key]) is torch.Tensor:
- assert type(dict2[key]) is torch.Tensor
- if not torch.equal(dict1[key], dict2[key]):
- return False
- else:
- if dict1[key] != dict2[key]:
- return False
- else:
- return False
- return True
-
-
-def check_torch_fsdp_ckpt():
- model = resnet18()
+from colossalai.interface import OptimizerWrapper
+from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
+from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
+
+
+# test basic fsdp function
+@clear_cache_before_run()
+def run_fn(model_fn, data_gen_fn, output_transform_fn):
plugin = TorchFSDPPlugin()
booster = Booster(plugin=plugin)
- optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9)
+ model = model_fn()
+ optimizer = SGD(model.parameters(), lr=1e-3)
criterion = lambda x: x.mean()
- fsdp_model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
-
- inputs = torch.randn(4, 3, 224, 224)
- outputs = None
-
- def run_model():
- nonlocal outputs
- outputs = fsdp_model(inputs)
- optimizer.zero_grad()
- criterion(outputs).backward()
- optimizer.step()
-
- with shared_tempdir() as tempdir:
- model_ckpt_path = f"{tempdir}/model"
- optim_ckpt_path = f"{tempdir}/optimizer"
-
- run_model()
-
- booster.save_model(fsdp_model, model_ckpt_path, shard=False)
- booster.save_optimizer(optimizer, optim_ckpt_path, shard=False)
-
- full_msd = fsdp_model.state_dict()
- # full_osd = FSDP.full_optim_state_dict(fsdp_model, optimizer)
- sharded_osd = optimizer.state_dict()
- import copy
-
- sharded_osd = copy.deepcopy(sharded_osd)
-
- run_model()
-
- full_msd_updated = fsdp_model.state_dict()
- # full_osd_updated = FSDP.full_optim_state_dict(fsdp_model, optimizer, rank0_only=True)
- sharded_osd_updated = optimizer.state_dict()
-
- assert not compare_nested_dict(sharded_osd, sharded_osd_updated)
- assert not compare_nested_dict(full_msd_updated, full_msd)
- outputs_first = fsdp_model(inputs)
- assert criterion(outputs_first) != criterion(outputs)
-
- booster.load_model(fsdp_model, model_ckpt_path)
- booster.load_optimizer(optimizer, optim_ckpt_path)
-
- full_msd_restore = fsdp_model.state_dict()
- # full_osd_restore = FSDP.full_optim_state_dict(fsdp_model, optimizer, rank0_only=True)
- sharded_osd_restore = optimizer.state_dict()
-
- assert compare_nested_dict(sharded_osd, sharded_osd_restore)
- assert compare_nested_dict(full_msd_restore, full_msd)
- outputs_sec = fsdp_model(inputs)
- assert criterion(outputs_sec) == criterion(outputs)
+ data = data_gen_fn()
+
+ data = {k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()}
+
+ model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
+
+ assert isinstance(model.module, FSDP)
+ assert isinstance(optimizer, OptimizerWrapper)
+
+ output = model(**data)
+ output = output_transform_fn(output)
+ output_key = list(output.keys())[0]
+ loss = criterion(output[output_key])
+
+ booster.backward(loss, optimizer)
+ optimizer.clip_grad_by_norm(1.0)
+ optimizer.step()
+
+ del model
+ del optimizer
+ del criterion
+ del booster
+ del plugin
+
+
+def check_torch_fsdp_plugin():
+ if IS_FAST_TEST:
+ registry = model_zoo.get_sub_registry(COMMON_MODELS)
+ else:
+ registry = model_zoo.get_sub_registry("transformers_gptj")
+
+ for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
+ if any(
+ element in name
+ for element in [
+ "diffusers",
+ "deepfm_sparsearch",
+ "dlrm_interactionarch",
+ "torchvision_googlenet",
+ "torchvision_inception_v3",
+ ]
+ ):
+ continue
+ print(name)
+ run_fn(model_fn, data_gen_fn, output_transform_fn)
+ torch.cuda.empty_cache()
def run_dist(rank, world_size, port):
# init dist env
colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost")
- check_torch_fsdp_ckpt()
+ check_torch_fsdp_plugin()
@pytest.mark.skipif(version.parse(torch.__version__) < version.parse("1.12.0"), reason="requires torch1.12 or higher")
@rerun_if_address_is_in_use()
-def test_torch_fsdp_ckpt():
- spawn(run_dist, 2)
\ No newline at end of file
+def test_torch_fsdp_plugin():
+ spawn(run_dist, 2)
+
+
+if __name__ == "__main__":
+ test_torch_fsdp_plugin()
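
Besides reverting the test rewrites, this patch fixes the "medium" benchmark config: GPT2Config defaults to n_embd=768, so the earlier "medium" entry silently built a 24-layer model at GPT-2-small width. A quick, illustrative sanity check of the parameter counts:

from transformers import GPT2Config, GPT2LMHeadModel

old = GPT2Config(n_head=16, n_layer=24, activation_function="gelu")  # n_embd stays 768
new = GPT2Config(n_embd=1024, n_head=16, n_layer=24, activation_function="gelu")
for name, cfg in [("old medium", old), ("fixed medium", new)]:
    n_params = sum(p.numel() for p in GPT2LMHeadModel(cfg).parameters())
    print(f"{name}: {n_params / 1e6:.0f}M parameters")
# The fixed config lands near GPT-2 medium's ~355M parameters;
# the old one comes out around 210M.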
From 9249a5c053bf48783747525d60adb7d459e6b826 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 29 Jan 2024 19:24:27 +0800
Subject: [PATCH 29/33] fix
fix
fix
---
colossalai/shardformer/modeling/gpt2.py | 5 -
colossalai/shardformer/policies/gpt2.py | 2 -
.../gpt/hybridparallelism/benchmark.py | 2 +-
.../test_plugin/test_low_level_zero_plugin.py | 163 ++++++++++--------
.../test_plugin/test_torch_ddp_plugin.py | 127 +++++++++-----
.../test_plugin/test_torch_fsdp_plugin.py | 153 +++++++---------
6 files changed, 245 insertions(+), 207 deletions(-)
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index ca806713c046..306bd58bf687 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -1065,11 +1065,6 @@ def forward(
)
hidden_states = transformer_outputs[0]
- # Set device for model parallelism
- if self.model_parallel:
- torch.cuda.set_device(self.transformer.first_device)
- hidden_states = hidden_states.to(self.lm_head.weight.device)
-
lm_logits = self.lm_head(hidden_states)
loss = None
diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py
index 47e1395a36c8..a0774aee6d3a 100644
--- a/colossalai/shardformer/policies/gpt2.py
+++ b/colossalai/shardformer/policies/gpt2.py
@@ -266,8 +266,6 @@ def module_policy(self):
module_policy = super().module_policy()
- setattr(self.shard_config, "causal_lm", True)
-
if self.shard_config.enable_tensor_parallelism:
addon_module = {
GPT2LMHeadModel: ModulePolicyDescription(
diff --git a/examples/language/gpt/hybridparallelism/benchmark.py b/examples/language/gpt/hybridparallelism/benchmark.py
index 7916bbdfb231..61324a8dc25c 100644
--- a/examples/language/gpt/hybridparallelism/benchmark.py
+++ b/examples/language/gpt/hybridparallelism/benchmark.py
@@ -26,7 +26,7 @@
MODEL_CONFIGS = {
"small": GPT2Config(),
- "medium": GPT2Config(n_head=16, n_layer=24, activation_function="gelu"),
+ "medium": GPT2Config(n_embd=1024, n_head=16, n_layer=24, activation_function="gelu"),
"large": GPT2Config(n_embd=1280, n_head=20, n_layer=36),
}
diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
index 439d7778575a..861fa0131397 100644
--- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
+++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
@@ -1,85 +1,106 @@
+from typing import Optional
+
import torch
import torch.distributed as dist
-from torchvision.models import resnet18
-from utils import shared_tempdir
+from torch.optim import Adam
import colossalai
+from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import LowLevelZeroPlugin
-from colossalai.nn.optimizer import HybridAdam
-from colossalai.testing import (
- check_state_dict_equal,
- clear_cache_before_run,
- parameterize,
- rerun_if_address_is_in_use,
- spawn,
-)
-from colossalai.zero import LowLevelZeroOptimizer
-
-
-# stage 1 and 2 process the optimizer/mode the same way
-# only test 2 is fine
+
+# from colossalai.nn.optimizer import HybridAdam
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
+from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
+
+# These models are not compatible with AMP
+_AMP_ERR_MODELS = ["timm_convit", "deepfm_interactionarch"]
+# These models have no parameters
+_LOW_LEVEL_ZERO_ERR_MODELS = ["dlrm_interactionarch"]
+# These models cause the test to hang, to be fixed
+_STUCK_MODELS = ["transformers_albert_for_multiple_choice"]
+
+
@clear_cache_before_run()
+def run_fn(stage, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]:
+ device = get_accelerator().get_current_device()
+ try:
+ plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=2**5)
+ booster = Booster(plugin=plugin)
+ model = model_fn()
+ optimizer = Adam(model.parameters(), lr=1e-3)
+ criterion = lambda x: x.mean()
+ data = data_gen_fn()
+
+ data = {
+ k: v.to(device) if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()
+ }
+
+ model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
+
+ output = model(**data)
+ output = output_transform_fn(output)
+ output_key = list(output.keys())[0]
+ loss = criterion(output[output_key])
+
+ booster.backward(loss, optimizer)
+ optimizer.step()
+
+ except Exception as e:
+ return repr(e)
+
+
@parameterize("stage", [2])
-@parameterize("shard", [True, False])
-@parameterize("offload", [False, True])
-def check_low_level_zero_checkpointIO(stage: int, shard: bool, offload: bool):
- plugin = LowLevelZeroPlugin(stage=stage, max_norm=1.0, initial_scale=32, cpu_offload=offload)
- booster = Booster(plugin=plugin)
- model = resnet18()
- criterion = lambda x: x.mean()
- optimizer = HybridAdam((model.parameters()), lr=0.001)
- model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
-
- x = torch.randn(1, 3, 224, 224, device="cuda")
- output = model(x)
- loss = criterion(output)
- booster.backward(loss, optimizer)
- optimizer.step()
- with shared_tempdir() as tempdir:
- model_ckpt_path = f"{tempdir}/model"
- optimizer_ckpt_path = f"{tempdir}/optimizer"
- # lr scheduler is tested in test_torch_ddp_checkpoint_io.py and low level zero does not change it, we can skip it here
- booster.save_model(model, model_ckpt_path, shard=shard)
- booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=shard)
-
- dist.barrier()
-
- new_model = resnet18()
- new_optimizer = HybridAdam((new_model.parameters()), lr=0.001)
- new_model, new_optimizer, _, _, _ = booster.boost(new_model, new_optimizer)
-
- booster.load_model(new_model, model_ckpt_path)
- check_state_dict_equal(model.state_dict(), new_model.state_dict(), False)
- # check master weight
- assert isinstance(new_optimizer, LowLevelZeroOptimizer)
- working_param_id_set = set(id(p) for p in new_model.parameters())
- for p_id, master_param in new_optimizer._param_store.working_to_master_param.items():
- assert p_id in working_param_id_set
- working_param = new_optimizer._param_store.master_to_working_param[id(master_param)]
- padding = new_optimizer._param_store.get_param_padding_size(working_param)
- padded_param = torch.nn.functional.pad(working_param.data.view(-1), (0, padding))
- working_shard = padded_param.chunk(dist.get_world_size())[dist.get_rank()]
- assert torch.equal(
- working_shard, master_param.data.view(-1).to(dtype=padded_param.dtype, device=padded_param.device)
- )
-
- booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
- check_state_dict_equal(optimizer.optim.state_dict(), new_optimizer.optim.state_dict(), False)
- torch.cuda.empty_cache()
-
-
-def run_dist(rank, world_size, port):
- colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host="localhost")
- check_low_level_zero_checkpointIO()
- torch.cuda.empty_cache()
+def check_low_level_zero_plugin(stage: int, early_stop: bool = True):
+ """check low level zero plugin over model zoo
+
+ Args:
+        stage (int): stage of low level zero plugin
+ early_stop (bool, optional): Whether to stop when getting the first error. Defaults to True.
+ """
+ passed_models = []
+ failed_info = {} # (model_name, error) pair
+ ignore_models = _AMP_ERR_MODELS + _LOW_LEVEL_ZERO_ERR_MODELS + _STUCK_MODELS
+ skipped_models = []
+
+ if IS_FAST_TEST:
+ registry = model_zoo.get_sub_registry(COMMON_MODELS)
+ else:
+ registry = model_zoo
+
+ for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
+ # FIXME(ver217): fix these models
+ if name in ignore_models:
+ skipped_models.append(name)
+ continue
+ err = run_fn(stage, model_fn, data_gen_fn, output_transform_fn)
+
+ get_accelerator().empty_cache()
+
+ if err is None:
+ passed_models.append(name)
+ else:
+ failed_info[name] = err
+ if early_stop:
+ break
+
+ if dist.get_rank() == 0:
+ print(f"Passed models({len(passed_models)}): {passed_models}\n\n")
+ print(f"Failed models({len(failed_info)}): {list(failed_info.keys())}\n\n")
+ print(f"Skipped models({len(skipped_models)}): {skipped_models}\n\n")
+ assert len(failed_info) == 0, "\n".join([f"{k}: {v}" for k, v in failed_info.items()])
+
+
+def run_dist(rank, world_size, port, early_stop: bool = True):
+ # init dist env
+ colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost")
+ check_low_level_zero_plugin(early_stop=early_stop)
@rerun_if_address_is_in_use()
-@clear_cache_before_run()
-def test_low_level_zero_checkpointIO():
- spawn(run_dist, 2)
+def test_low_level_zero_plugin(early_stop: bool = True):
+ spawn(run_dist, 2, early_stop=early_stop)
if __name__ == "__main__":
- test_low_level_zero_checkpointIO()
\ No newline at end of file
+ test_low_level_zero_plugin(early_stop=False)
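
The low level zero test above separates failure detection from reporting: run_fn returns None on success or repr(e) on failure, and check_low_level_zero_plugin aggregates per-model results. Reduced to its core, the contract looks like this sketch (names hypothetical):

    from typing import Callable, Dict, Optional

    def try_run(fn: Callable[[], None]) -> Optional[str]:
        # None on success, repr of the first exception on failure.
        try:
            fn()
            return None
        except Exception as e:
            return repr(e)

    def collect_failures(cases: Dict[str, Callable[[], None]], early_stop: bool = True) -> Dict[str, str]:
        failed: Dict[str, str] = {}
        for name, fn in cases.items():
            err = try_run(fn)
            if err is not None:
                failed[name] = err
                if early_stop:
                    break
        return failed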
diff --git a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
index 87d2adf6b5b6..e785843fb053 100644
--- a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
+++ b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
@@ -1,70 +1,119 @@
+from contextlib import nullcontext
+
import torch
import torch.distributed as dist
+import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import SGD
-from torchvision.models import resnet18
-from utils import shared_tempdir
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin
from colossalai.interface import OptimizerWrapper
-from colossalai.testing import check_state_dict_equal, parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
+from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
-@parameterize("shard", [True, False])
-@parameterize("size_per_shard", [16, 128])
-def check_torch_ddp_checkpointIO(shard: bool, size_per_shard: int):
+@clear_cache_before_run()
+def run_fn(model_fn, data_gen_fn, output_transform_fn):
plugin = TorchDDPPlugin()
booster = Booster(plugin=plugin)
- model = resnet18()
+ model = model_fn()
+ optimizer = SGD(model.parameters(), lr=1e-3)
criterion = lambda x: x.mean()
- optimizer = SGD((model.parameters()), lr=0.001)
- scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
- model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion, lr_scheduler=scheduler)
+ data = data_gen_fn()
+
+ data = {k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()}
+
+ model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
assert isinstance(model.module, DDP)
assert isinstance(optimizer, OptimizerWrapper)
- x = torch.randn(4, 3, 224, 224)
- x = x.to("cuda")
- output = model(x)
- loss = criterion(output)
+ output = model(**data)
+ output = output_transform_fn(output)
+ output_key = list(output.keys())[0]
+ loss = criterion(output[output_key])
+
booster.backward(loss, optimizer)
optimizer.clip_grad_by_norm(1.0)
optimizer.step()
- scheduler.step()
- with shared_tempdir() as tempdir:
- model_ckpt_path = f"{tempdir}/model"
- optimizer_ckpt_path = f"{tempdir}/optimizer"
- lr_scheduler_ckpt_path = f"{tempdir}/lr_scheduler"
- booster.save_model(model, model_ckpt_path, shard=shard, size_per_shard=size_per_shard)
- booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=shard, size_per_shard=size_per_shard)
- booster.save_lr_scheduler(scheduler, lr_scheduler_ckpt_path)
- dist.barrier()
- new_model = resnet18()
- new_optimizer = SGD((new_model.parameters()), lr=0.001)
- new_scheduler = torch.optim.lr_scheduler.StepLR(new_optimizer, step_size=1, gamma=0.1)
- new_model, new_optimizer, _, _, new_scheduler = booster.boost(
- new_model, new_optimizer, lr_scheduler=new_scheduler
- )
+def check_torch_ddp_plugin():
+ if IS_FAST_TEST:
+ registry = model_zoo.get_sub_registry(COMMON_MODELS)
+ else:
+ registry = model_zoo
+
+ for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
+ if name == "dlrm_interactionarch":
+ continue
+ run_fn(model_fn, data_gen_fn, output_transform_fn)
+ torch.cuda.empty_cache()
+
+
+class DummyModel(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.weight = nn.Parameter(torch.rand(1))
+
+ def forward(self, x):
+ return self.weight * x
+
+
+def check_torch_ddp_no_sync():
+ plugin = TorchDDPPlugin()
+ booster = Booster(plugin=plugin)
+
+ model = DummyModel()
+ criterion = lambda x: x.mean()
+ optimizer = SGD(model.parameters(), lr=1e-3)
+    # create a toy dataset with values 0 through 9
+ dataset = torch.arange(0, 10)
+ train_dataloader = plugin.prepare_dataloader(dataset, batch_size=2)
+ model, optimizer, criterion, train_dataloader, _ = booster.boost(
+ model, optimizer, criterion, dataloader=train_dataloader
+ )
+
+ def fwd_bwd():
+ output = model(batch.cuda())
+ loss = criterion(output)
+ booster.backward(loss, optimizer)
- booster.load_model(new_model, model_ckpt_path)
- check_state_dict_equal(model.state_dict(), new_model.state_dict(), False)
+ def get_grad_set_over_all_ranks():
+ for p in model.parameters():
+ # grad shape is (1, )
+ assert p.grad.shape == (1,)
+ grad_list = [torch.empty_like(p.grad) for _ in range(dist.get_world_size())]
+ dist.all_gather(grad_list, p.grad)
+ # get grad set of all ranks
+ grad_set = set([grad.item() for grad in grad_list])
+ # as the model only has one parameter, we can return here
+ return grad_set
- booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
- check_state_dict_equal(optimizer.state_dict(), new_optimizer.state_dict(), False)
- booster.load_lr_scheduler(new_scheduler, lr_scheduler_ckpt_path)
- check_state_dict_equal(scheduler.state_dict(), new_scheduler.state_dict(), False)
+ for i, batch in enumerate(train_dataloader):
+ if i > 1:
+ # only check the first two batches
+ break
+ # no_sync for the first batch, sync for the second batch
+ ctx = booster.no_sync(model) if i == 0 else nullcontext()
+ with ctx:
+ fwd_bwd()
+ grad_set = get_grad_set_over_all_ranks()
+ # for the first batch, all ranks should have different grads
+        # for the second batch, as grads are synchronized, all ranks should have the same grads
+ target_num_different_grad = dist.get_world_size() if i == 0 else 1
+ assert len(grad_set) == target_num_different_grad
def run_dist(rank, world_size, port):
- colossalai.launch(config=(dict()), rank=rank, world_size=world_size, port=port, host="localhost")
- check_torch_ddp_checkpointIO()
+ # init dist env
+ colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost")
+ check_torch_ddp_plugin()
+ check_torch_ddp_no_sync()
@rerun_if_address_is_in_use()
-def test_torch_ddp_checkpointIO():
- spawn(run_dist, 2)
\ No newline at end of file
+def test_torch_ddp_plugin():
+ spawn(run_dist, 2)
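
check_torch_ddp_no_sync above verifies stock DDP gradient-accumulation semantics: inside no_sync the gradient all-reduce is skipped, so each rank keeps its local gradient, and the next synchronized step reduces them across ranks. A minimal sketch of the same pattern with plain DDP (booster.no_sync presumably wraps this mechanism):

    from contextlib import nullcontext

    def ddp_grad_accumulation(model, criterion, micro_batches):
        # Skip gradient all-reduce for every micro-batch except the last;
        # only the final backward() synchronizes grads across ranks.
        for i, batch in enumerate(micro_batches):
            ctx = model.no_sync() if i < len(micro_batches) - 1 else nullcontext()
            with ctx:
                criterion(model(batch)).backward()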
diff --git a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
index 2badd914a19e..f698070465d6 100644
--- a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
+++ b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
@@ -2,111 +2,86 @@
import torch
from packaging import version
from torch.optim import SGD
-from torchvision.models import resnet18
-from utils import shared_tempdir
import colossalai
from colossalai.booster import Booster
if version.parse(torch.__version__) >= version.parse("1.12.0"):
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from colossalai.booster.plugin import TorchFSDPPlugin
-from colossalai.testing import rerun_if_address_is_in_use, spawn
-
-
-def compare_nested_dict(dict1, dict2):
- for key in dict1:
- if key in dict2:
- if type(dict1[key]) is dict:
- assert type(dict2[key]) is dict
- diff = compare_nested_dict(dict1[key], dict2[key])
- if not diff:
- return diff
- elif type(dict1[key]) is list:
- assert type(dict2[key]) is list
- for i, val in enumerate(dict1[key]):
- if isinstance(val, torch.Tensor):
- if not torch.equal(dict1[key][i], dict2[key][i]):
- return False
- elif val != dict2[key][i]:
- return False
- elif type(dict1[key]) is torch.Tensor:
- assert type(dict2[key]) is torch.Tensor
- if not torch.equal(dict1[key], dict2[key]):
- return False
- else:
- if dict1[key] != dict2[key]:
- return False
- else:
- return False
- return True
-
-
-def check_torch_fsdp_ckpt():
- model = resnet18()
+from colossalai.interface import OptimizerWrapper
+from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
+from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
+
+
+# test basic fsdp function
+@clear_cache_before_run()
+def run_fn(model_fn, data_gen_fn, output_transform_fn):
plugin = TorchFSDPPlugin()
booster = Booster(plugin=plugin)
- optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9)
+ model = model_fn()
+ optimizer = SGD(model.parameters(), lr=1e-3)
criterion = lambda x: x.mean()
- fsdp_model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
-
- inputs = torch.randn(4, 3, 224, 224)
- outputs = None
-
- def run_model():
- nonlocal outputs
- outputs = fsdp_model(inputs)
- optimizer.zero_grad()
- criterion(outputs).backward()
- optimizer.step()
-
- with shared_tempdir() as tempdir:
- model_ckpt_path = f"{tempdir}/model"
- optim_ckpt_path = f"{tempdir}/optimizer"
-
- run_model()
-
- booster.save_model(fsdp_model, model_ckpt_path, shard=False)
- booster.save_optimizer(optimizer, optim_ckpt_path, shard=False)
-
- full_msd = fsdp_model.state_dict()
- # full_osd = FSDP.full_optim_state_dict(fsdp_model, optimizer)
- sharded_osd = optimizer.state_dict()
- import copy
-
- sharded_osd = copy.deepcopy(sharded_osd)
-
- run_model()
-
- full_msd_updated = fsdp_model.state_dict()
- # full_osd_updated = FSDP.full_optim_state_dict(fsdp_model, optimizer, rank0_only=True)
- sharded_osd_updated = optimizer.state_dict()
-
- assert not compare_nested_dict(sharded_osd, sharded_osd_updated)
- assert not compare_nested_dict(full_msd_updated, full_msd)
- outputs_first = fsdp_model(inputs)
- assert criterion(outputs_first) != criterion(outputs)
-
- booster.load_model(fsdp_model, model_ckpt_path)
- booster.load_optimizer(optimizer, optim_ckpt_path)
-
- full_msd_restore = fsdp_model.state_dict()
- # full_osd_restore = FSDP.full_optim_state_dict(fsdp_model, optimizer, rank0_only=True)
- sharded_osd_restore = optimizer.state_dict()
-
- assert compare_nested_dict(sharded_osd, sharded_osd_restore)
- assert compare_nested_dict(full_msd_restore, full_msd)
- outputs_sec = fsdp_model(inputs)
- assert criterion(outputs_sec) == criterion(outputs)
+ data = data_gen_fn()
+
+ data = {k: v.to("cuda") if torch.is_tensor(v) or "Tensor" in v.__class__.__name__ else v for k, v in data.items()}
+
+ model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
+
+ assert isinstance(model.module, FSDP)
+ assert isinstance(optimizer, OptimizerWrapper)
+
+ output = model(**data)
+ output = output_transform_fn(output)
+ output_key = list(output.keys())[0]
+ loss = criterion(output[output_key])
+
+ booster.backward(loss, optimizer)
+ optimizer.clip_grad_by_norm(1.0)
+ optimizer.step()
+
+ del model
+ del optimizer
+ del criterion
+ del booster
+ del plugin
+
+
+def check_torch_fsdp_plugin():
+ if IS_FAST_TEST:
+ registry = model_zoo.get_sub_registry(COMMON_MODELS)
+ else:
+ registry = model_zoo.get_sub_registry("transformers_gptj")
+
+ for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
+ if any(
+ element in name
+ for element in [
+ "diffusers",
+ "deepfm_sparsearch",
+ "dlrm_interactionarch",
+ "torchvision_googlenet",
+ "torchvision_inception_v3",
+ ]
+ ):
+ continue
+ print(name)
+ run_fn(model_fn, data_gen_fn, output_transform_fn)
+ torch.cuda.empty_cache()
def run_dist(rank, world_size, port):
# init dist env
colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost")
- check_torch_fsdp_ckpt()
+ check_torch_fsdp_plugin()
@pytest.mark.skipif(version.parse(torch.__version__) < version.parse("1.12.0"), reason="requires torch1.12 or higher")
@rerun_if_address_is_in_use()
-def test_torch_fsdp_ckpt():
- spawn(run_dist, 2)
\ No newline at end of file
+def test_torch_fsdp_plugin():
+ spawn(run_dist, 2)
+
+
+if __name__ == "__main__":
+ test_torch_fsdp_plugin()
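
For reference, the clipping step in run_fn corresponds to the stock PyTorch call below; optimizer.clip_grad_by_norm(1.0) presumably clips the same parameter set held by the wrapped optimizer (for FSDP the sharded variant FSDP.clip_grad_norm_ applies instead). A sketch of the vanilla equivalent for an unsharded model:

    import torch.nn as nn

    def clip_grads(model: nn.Module, max_norm: float = 1.0) -> None:
        # Vanilla-PyTorch counterpart of OptimizerWrapper.clip_grad_by_norm.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm)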
From e1402fd2a23091194c7e0ef5322ac306b4b2ff0a Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 23 Feb 2024 14:10:17 +0800
Subject: [PATCH 30/33] fix
---
.../booster/plugin/hybrid_parallel_plugin.py | 3 +
colossalai/shardformer/layer/_operation.py | 22 ++++---
colossalai/shardformer/layer/normalization.py | 7 +--
.../shardformer/layer/qkv_fused_linear.py | 17 +++---
colossalai/shardformer/modeling/gpt2.py | 13 ++--
colossalai/shardformer/policies/gpt2.py | 52 ++++++++--------
.../gpt/hybridparallelism/benchmark.py | 61 ++++++++++++++-----
examples/language/performance_evaluator.py | 1 +
extensions/cuda_extension.py | 3 +
9 files changed, 110 insertions(+), 69 deletions(-)
diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 5837156a90cd..405044e6de68 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -1033,6 +1033,7 @@ def __init__(
enable_sequence_parallelism=enable_sequence_parallelism,
enable_sequence_overlap=enable_sequence_overlap,
)
+ print("self.shard_config", self.shard_config)
self.amp_config = dict(
initial_scale=initial_scale,
growth_factor=growth_factor,
@@ -1058,6 +1059,7 @@ def __init__(
overlap_communication=overlap_communication,
cpu_offload=cpu_offload,
partition_grad=(self.zero_stage == 2),
+ forced_dtype=torch.bfloat16,
)
self.max_norm = max_norm
@@ -1099,6 +1101,7 @@ def configure(
param_info = get_param_info(optimizer)
if not isinstance(model, ModelWrapper):
use_ddp = self.dp_size > 1 and self.pp_size == 1 and self.zero_stage == 0
+ print("use_ddp", use_ddp)
model = HybridParallelModule(
model,
precision=self.precision,
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index 4bca335c84d8..44433ea63b45 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -9,6 +9,7 @@
try:
import fused_weight_gradient_mlp_cuda
+
_grad_accum_fusion_available = True
except ImportError:
_grad_accum_fusion_available = False
@@ -77,10 +78,11 @@ def backward(ctx, grad_output):
use_bias = ctx.use_bias
# In order to be hooked into Gemini's '__torch_function__', adding a view operation to weight and bias.
- weight = weight.view(weight.shape)
- bias = bias.view(bias.shape)
+ # weight = weight.view(weight.shape)
+ # bias = bias.view(bias.shape)
total_input = input
+ # print("grad_output.shape", grad_output.shape, "weight.shape", weight.shape)
grad_input = grad_output.matmul(weight.T)
grad_output = grad_output.contiguous()
# Convert the tensor shapes to 2D for execution compatibility
@@ -93,9 +95,10 @@ def backward(ctx, grad_output):
handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True)
# Delay the start of weight gradient computation shortly (3us) to have
# all-reduce scheduled first and have GPU resources allocated
- _ = torch.empty(1, device=grad_output.device) + 1
+ # _ = torch.empty(1, device=grad_output.device) + 1
grad_weight = total_input.t().matmul(grad_output)
+ # print("use_biasuse_biasuse_biasuse_biasuse_bias",use_bias)
grad_bias = grad_output.sum(dim=0) if use_bias else None
if ctx.async_grad_allreduce:
@@ -115,7 +118,6 @@ def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce):
ctx.use_bias = bias is not None
ctx.process_group = process_group
ctx.async_grad_allreduce = async_grad_allreduce
-
if bias is not None:
output = F.linear(input_, weight, bias)
else:
@@ -129,8 +131,8 @@ def backward(ctx, grad_output):
use_bias = ctx.use_bias
# In order to be hooked into Gemini's '__torch_function__', adding a view operation to bias.
- if use_bias:
- bias.view(bias.shape)
+ # if use_bias:
+ # bias.view(bias.shape)
total_input = input
grad_input = grad_output.matmul(weight)
@@ -145,7 +147,7 @@ def backward(ctx, grad_output):
handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True)
# Delay the start of weight gradient computation shortly (3us) to have
# all-reduce scheduled first and have GPU resources allocated
- _ = torch.empty(1, device=grad_output.device) + 1
+ # _ = torch.empty(1, device=grad_output.device) + 1
if _grad_accum_fusion_available and weight.grad is not None:
grad = weight.grad
@@ -205,8 +207,8 @@ def backward(ctx, grad_output):
overlap = ctx.overlap
# In order to be hooked into Gemini's '__torch_function__', adding a view operation to weight and bias. Used in FusedLayerNorm
- if use_bias:
- bias = bias.view(bias.shape)
+ # if use_bias:
+ # bias = bias.view(bias.shape)
if not overlap:
input_parallel = _gather(input_, dim, process_group)
@@ -431,7 +433,7 @@ def backward(ctx, grad_output):
input_parallel = torch.cat(tensor_list, dim=dim).contiguous()
# calculate gradient
if len(input_parallel.shape) > 2:
- input_parallel = input_parallel.view(-1, input_parallel.shape[-1])
+ input_parallel = input_parallel.view(-1, input_parallel.shape[-1])
grad_weight = input_parallel.t().matmul(grad_output)
# wait until reduce-scatter finished
reducescatter_handle.wait()
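
The hunks above toggle the scheduling hint in the column/row-parallel linear backward. The underlying pattern is compute/communication overlap: launch the grad_input all-reduce asynchronously, compute grad_weight while the collective is in flight, and wait before returning. A schematic sketch (shapes and the transpose depend on the concrete layer; the tiny `_ = torch.empty(1, ...) + 1` kernel being commented out was a hint to get the all-reduce issued before the matmul grabs the GPU):

    import torch
    import torch.distributed as dist

    def overlapped_linear_backward(grad_output, total_input, weight, process_group):
        grad_input = grad_output.matmul(weight)
        # Issued asynchronously so the matmul below can run concurrently.
        handle = dist.all_reduce(grad_input, group=process_group, async_op=True)
        grad_weight = total_input.t().matmul(grad_output)  # overlaps with the all-reduce
        handle.wait()  # grad_input is only valid after this point
        return grad_input, grad_weight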
diff --git a/colossalai/shardformer/layer/normalization.py b/colossalai/shardformer/layer/normalization.py
index 4aa281290340..fed488bbfd2c 100644
--- a/colossalai/shardformer/layer/normalization.py
+++ b/colossalai/shardformer/layer/normalization.py
@@ -7,7 +7,6 @@
from colossalai.lazy import LazyInitContext
-from ._operation import hook_paramter_in_backward
from .utils import SeqParallelUtils
__all__ = ["FusedLayerNorm", "FusedRMSNorm", "LayerNorm", "RMSNorm", "BaseLayerNorm"]
@@ -29,7 +28,7 @@ def __init__(self, normalized_shape, eps=0.00001, elementwise_affine=True):
def forward(self, input):
output = super().forward(input)
- output = hook_paramter_in_backward(output, self.weight, self.bias)
+ # output = hook_paramter_in_backward(output, self.weight, self.bias)
return output
class FusedRMSNormWithHook(ApexFusedRMSNorm):
@@ -38,7 +37,7 @@ def __init__(self, normalized_shape, eps=0.00001, elementwise_affine=True):
def forward(self, input):
output = super().forward(input)
- output = hook_paramter_in_backward(output, self.weight)
+ # output = hook_paramter_in_backward(output, self.weight)
return output
except ImportError:
@@ -79,7 +78,7 @@ def __init__(self, hidden_size, eps=0.00001):
def forward(self, input):
output = super().forward(input)
- output = hook_paramter_in_backward(output, self.weight, self.bias)
+ # output = hook_paramter_in_backward(output, self.weight, self.bias)
return output
diff --git a/colossalai/shardformer/layer/qkv_fused_linear.py b/colossalai/shardformer/layer/qkv_fused_linear.py
index 12476d050600..267c1bcce4ac 100644
--- a/colossalai/shardformer/layer/qkv_fused_linear.py
+++ b/colossalai/shardformer/layer/qkv_fused_linear.py
@@ -320,9 +320,8 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
else:
# Set up backprop all-reduce.
input_parallel = reduce_backward(input_, self.process_group)
- output_parallel = matmul_with_async_comm(
- input_parallel, self.weight, bias, self.process_group, self.async_communication
- )
+ input_parallel = input_
+ output_parallel = matmul_with_async_comm(input_parallel, self.weight, bias, self.process_group, False)
if self.gather_output:
# All-gather across the partitions.
@@ -331,7 +330,8 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
output = output_parallel
if self.skip_bias_add:
- return output, self.bias
+ # return output, self.bias
+ return output
else:
return output
@@ -528,7 +528,8 @@ def forward(self, input_: Tensor) -> Tensor:
handle.wait()
output = torch.cat(output_parallel_list, dim=-1)
else:
- output_parallel = torch.matmul(input_, self.weight)
+ # output_parallel = torch.matmul(input_, self.weight)
+ output_parallel = matmul_with_async_comm(input_, self.weight, None, self.process_group, False)
if self.seq_parallel:
output = linear_reducescatter_forward_gather_backward(output_parallel, self.process_group, 1)
else:
@@ -539,7 +540,8 @@ def forward(self, input_: Tensor) -> Tensor:
output = output + self.bias
return output
else:
- return output, self.bias
+ # return output, self.bias
+ return output
# ====================================
@@ -734,6 +736,7 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
output = output_parallel
if self.skip_bias_add:
- return output, self.bias
+ # return output, self.bias
+ return output
else:
return output
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index 306bd58bf687..f506c3ad97bf 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -330,12 +330,13 @@ def gpt2_lmhead_model_forward(
loss_fct = CrossEntropyLoss()
shift_logits = shift_logits.view(-1, shift_logits.size(-1))
shift_labels = shift_labels.view(-1)
- if shard_config.enable_tensor_parallelism:
- loss = cross_entropy_1d(
- shift_logits, shift_labels, process_group=shard_config.tensor_parallel_process_group
- )
- else:
- loss = loss_fct(shift_logits, shift_labels)
+ # if shard_config.enable_tensor_parallelism:
+ # loss = cross_entropy_1d(
+ # shift_logits, shift_labels, process_group=shard_config.tensor_parallel_process_group
+ # )
+ # else:
+ # loss = loss_fct(shift_logits, shift_labels)
+ loss = loss_fct(shift_logits, shift_labels)
if not return_dict:
output = (lm_logits,) + outputs[1:]
diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py
index a0774aee6d3a..18e8bc290aec 100644
--- a/colossalai/shardformer/policies/gpt2.py
+++ b/colossalai/shardformer/policies/gpt2.py
@@ -5,12 +5,7 @@
import colossalai.shardformer.layer as col_nn
-from ..modeling.gpt2 import (
- GPT2PipelineForwards,
- get_gpt2_flash_attention_forward,
- get_lm_forward_with_dist_cross_entropy,
- gpt2_sequence_parallel_forward_fn,
-)
+from ..modeling.gpt2 import GPT2PipelineForwards, get_gpt2_flash_attention_forward, gpt2_sequence_parallel_forward_fn
from .base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
__all__ = [
@@ -58,10 +53,10 @@ def module_policy(self):
suffix="wte",
target_module=col_nn.VocabParallelEmbedding1D,
),
- SubModuleReplacementDescription(
- suffix="drop",
- target_module=col_nn.DropoutForParallelInput,
- ),
+ # SubModuleReplacementDescription(
+ # suffix="drop",
+ # target_module=col_nn.DropoutForParallelInput,
+ # ),
]
)
@@ -87,27 +82,30 @@ def module_policy(self):
SubModuleReplacementDescription(
suffix="mlp.c_fc",
target_module=col_nn.GPT2FusedLinearConv1D_Col,
- kwargs={"n_fused": 1, "seq_parallel": use_sequence_parallel, "overlap": overlap},
- ),
- SubModuleReplacementDescription(
- suffix="mlp.c_proj",
- target_module=col_nn.GPT2FusedLinearConv1D_Row,
kwargs={
+ "n_fused": 1,
"seq_parallel": use_sequence_parallel,
+ "overlap": overlap,
+ "skip_bias_add": True,
},
),
SubModuleReplacementDescription(
- suffix="attn.attn_dropout",
- target_module=col_nn.DropoutForParallelInput,
- ),
- SubModuleReplacementDescription(
- suffix="attn.resid_dropout",
- target_module=col_nn.DropoutForParallelInput,
- ),
- SubModuleReplacementDescription(
- suffix="mlp.dropout",
- target_module=col_nn.DropoutForParallelInput,
+ suffix="mlp.c_proj",
+ target_module=col_nn.GPT2FusedLinearConv1D_Row,
+ kwargs={"seq_parallel": use_sequence_parallel, "skip_bias_add": True},
),
+ # SubModuleReplacementDescription(
+ # suffix="attn.attn_dropout",
+ # target_module=col_nn.DropoutForParallelInput,
+ # ),
+ # SubModuleReplacementDescription(
+ # suffix="attn.resid_dropout",
+ # target_module=col_nn.DropoutForParallelInput,
+ # ),
+ # SubModuleReplacementDescription(
+ # suffix="mlp.dropout",
+ # target_module=col_nn.DropoutForParallelInput,
+ # ),
],
)
@@ -271,10 +269,10 @@ def module_policy(self):
GPT2LMHeadModel: ModulePolicyDescription(
sub_module_replacement=[
SubModuleReplacementDescription(
- suffix="lm_head", target_module=col_nn.Linear1D_Col, kwargs={"gather_output": False}
+ suffix="lm_head", target_module=col_nn.Linear1D_Col, kwargs={"gather_output": True}
)
],
- method_replacement={"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)},
+ # method_replacement={"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)},
)
}
module_policy.update(addon_module)
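
The skip_bias_add=True kwargs added above switch the fused linear layers to the Megatron-style contract sketched below: the layer returns the un-biased output together with its bias so a later kernel can fuse the bias add with dropout or the residual, while this patch temporarily returns the output alone. (Schematic only; the real layers also handle sequence parallelism and async communication.)

    import torch

    def linear_skip_bias_add(x, weight, bias, skip_bias_add: bool):
        out = torch.matmul(x, weight)
        if skip_bias_add:
            # Defer the bias so a downstream fused kernel can apply it.
            return out, bias
        return out + bias if bias is not None else out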
diff --git a/examples/language/gpt/hybridparallelism/benchmark.py b/examples/language/gpt/hybridparallelism/benchmark.py
index 61324a8dc25c..6e353f8bb000 100644
--- a/examples/language/gpt/hybridparallelism/benchmark.py
+++ b/examples/language/gpt/hybridparallelism/benchmark.py
@@ -10,7 +10,8 @@
from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
import colossalai
-import colossalai.utils.device as device_utils
+
+# import colossalai.utils.device as device_utils
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, TorchFSDPPlugin
from colossalai.cluster import DistCoordinator
@@ -23,11 +24,11 @@
# ==============================
# Constants
# ==============================
-
MODEL_CONFIGS = {
- "small": GPT2Config(),
+ "small": GPT2Config(activation_function="gelu"),
"medium": GPT2Config(n_embd=1024, n_head=16, n_layer=24, activation_function="gelu"),
- "large": GPT2Config(n_embd=1280, n_head=20, n_layer=36),
+ "large": GPT2Config(n_embd=1280, n_head=20, n_layer=36, activation_function="gelu"),
+ "default": GPT2Config(n_embd=4096, n_head=32, n_layer=32, n_positions=4096, activation_function="gelu"),
}
@@ -36,7 +37,7 @@ def main():
# Parse Arguments
# ==============================
parser = argparse.ArgumentParser()
- parser.add_argument("-c", "--config", type=str, default="medium", help="Model configuration")
+ parser.add_argument("-c", "--config", type=str, default="default", help="Model configuration")
parser.add_argument(
"-p",
"--plugin",
@@ -46,9 +47,9 @@ def main():
)
parser.add_argument("-b", "--batch_size", type=int, default=2, help="Batch size")
parser.add_argument("-s", "--num_steps", type=int, default=200, help="Number of steps to run")
- parser.add_argument("-i", "--ignore_steps", type=int, default=1, help="Number of steps to ignore")
+ parser.add_argument("-i", "--ignore_steps", type=int, default=3, help="Number of steps to ignore")
parser.add_argument("-g", "--grad_checkpoint", action="store_true", help="Use gradient checkpointing")
- parser.add_argument("-l", "--max_length", type=int, default=1024, help="Max sequence length")
+ parser.add_argument("-l", "--max_length", type=int, default=4096, help="Max sequence length")
parser.add_argument(
"-w", "--warmup_ratio", type=float, default=0.8, help="warm up ratio of non-model data. Only for gemini-auto"
)
@@ -61,6 +62,9 @@ def main():
parser.add_argument("--pp", type=int, default=1, help="Pipeline parallel size")
parser.add_argument("--mbs", type=int, default=1)
parser.add_argument("--zero", type=int, default=0)
+ parser.add_argument("--pp_style", type=str, default="1f1b")
+ parser.add_argument("--num_model_chunks", type=int, default=2)
+ parser.add_argument("--cpu_offload", action="store_true", help="Use gradient checkpointing")
args = parser.parse_args()
colossalai.launch_from_torch({})
@@ -124,11 +128,16 @@ def empty_init():
plugin = HybridParallelPlugin(
tp_size=args.tp,
pp_size=args.pp,
- pp_style="interleaved",
+ pp_style=args.pp_style,
zero_stage=args.zero,
- num_model_chunks=2,
- enable_all_optimization=True,
+ num_model_chunks=args.num_model_chunks,
+ # enable_all_optimization=True,
+ # enable_flash_attention=True,
+ # enable_jit_fused=True,
+ enable_fused_normalization=True,
+ # enable_sequence_parallelism=True,
num_microbatches=args.mbs,
+ cpu_offload=args.cpu_offload,
precision="bf16",
)
elif args.plugin == "3d_cpu":
@@ -167,14 +176,18 @@ def empty_init():
else nullcontext()
)
- with init_ctx:
- model = GPT2LMHeadModel(config)
+ # with init_ctx:
+ # model = GPT2LMHeadModel(config)
+ model = GPT2LMHeadModel(config)
if args.grad_checkpoint:
model.gradient_checkpointing_enable()
model_numel = get_model_numel(model)
coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")
+ # print("args.ignore_steps", args.ignore_steps)
+ # print("args.batch_size", args.batch_size)
+ # print("max_length", args.max_length)
performance_evaluator = PerformanceEvaluator(
model_numel,
model.config.n_layer,
@@ -189,7 +202,7 @@ def empty_init():
torch.set_default_dtype(torch.bfloat16)
model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
torch.set_default_dtype(torch.float)
- coordinator.print_on_master(f"Booster init max CUDA memory: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
+ coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
coordinator.print_on_master(
f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024:.2f} MB"
)
@@ -213,10 +226,28 @@ def empty_init():
optimizer.step()
optimizer.zero_grad()
performance_evaluator.on_step_end(**batch)
- coordinator.print_on_master(f"Max CUDA memory usage: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
+
+ # for step, batch in enumerate(tqdm(dataloader, desc="Step", disable=not coordinator.is_master())):
+ # performance_evaluator.on_step_start(step)
+
+ # with torch.profiler.profile(
+ # activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
+ # schedule=torch.profiler.schedule(wait=1, warmup=2, active=3, repeat=5),
+ # on_trace_ready=torch.profiler.tensorboard_trace_handler("/home/jiangmingyan/workspace/trace/shardformer/GPT2-12-bf16"),
+ # with_stack=True,
+ # record_shapes=True
+ # ) as prof:
+ # for _ in range(0 + 2 + 5):
+ # outputs = model(**batch)
+ # loss = outputs[0]
+ # booster.backward(loss, optimizer)
+ # optimizer.step()
+ # optimizer.zero_grad()
+ # prof.step()
+ coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
performance_evaluator.on_fit_end()
- coordinator.print_on_master(f"Max CUDA memory usage: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
+ coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
if __name__ == "__main__":
diff --git a/examples/language/performance_evaluator.py b/examples/language/performance_evaluator.py
index c2169a730a88..ba52548f97b6 100644
--- a/examples/language/performance_evaluator.py
+++ b/examples/language/performance_evaluator.py
@@ -107,6 +107,7 @@ def on_step_end(self, input_ids: Tensor, **kwargs) -> None:
def on_fit_end(self) -> None:
avg_duration = all_reduce_mean(self.timer.duration, self.coordinator.world_size)
+ # avg_duration = self.timer.duration
avg_throughput = self.num_samples * self.dp_world_size / (avg_duration + 1e-12)
mp_world_size = self.coordinator.world_size // self.dp_world_size
avg_tflops_per_gpu_megatron = self.flop_megatron / 1e12 / (avg_duration + 1e-12) / mp_world_size
diff --git a/extensions/cuda_extension.py b/extensions/cuda_extension.py
index 188e61b60141..328ea4b997b5 100644
--- a/extensions/cuda_extension.py
+++ b/extensions/cuda_extension.py
@@ -1,7 +1,10 @@
import os
+import time
from abc import abstractmethod
+from pathlib import Path
from typing import List
+from .base_extension import _Extension
from .cpp_extension import _CppExtension
from .utils import check_pytorch_version, check_system_pytorch_cuda_match, set_cuda_arch_list
From acccb4bd0653c904598a17eafb6b02a88efb0e8f Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 26 Feb 2024 12:20:10 +0800
Subject: [PATCH 31/33] fix
---
.../booster/plugin/hybrid_parallel_plugin.py | 6 +--
colossalai/shardformer/layer/_operation.py | 23 ++++----
colossalai/shardformer/layer/normalization.py | 7 +--
.../shardformer/layer/qkv_fused_linear.py | 17 +++---
colossalai/shardformer/modeling/gpt2.py | 21 ++++----
colossalai/shardformer/policies/gpt2.py | 54 +++++++++----------
.../gpt/hybridparallelism/benchmark.py | 32 ++---------
examples/language/performance_evaluator.py | 1 -
8 files changed, 64 insertions(+), 97 deletions(-)
diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 405044e6de68..ad1e4401c24c 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -35,6 +35,8 @@
DP_AXIS, PP_AXIS, TP_AXIS = 0, 1, 2
+PRECISION_TORCH_TYPE = {"fp16": torch.float16, "fp32": torch.float32, "bf16": torch.bfloat16}
+
def _convert_floating_point(x, dtype: torch.dtype = torch.float16):
if isinstance(x, torch.Tensor) and torch.is_floating_point(x):
@@ -1033,7 +1035,6 @@ def __init__(
enable_sequence_parallelism=enable_sequence_parallelism,
enable_sequence_overlap=enable_sequence_overlap,
)
- print("self.shard_config", self.shard_config)
self.amp_config = dict(
initial_scale=initial_scale,
growth_factor=growth_factor,
@@ -1059,7 +1060,7 @@ def __init__(
overlap_communication=overlap_communication,
cpu_offload=cpu_offload,
partition_grad=(self.zero_stage == 2),
- forced_dtype=torch.bfloat16,
+ forced_dtype=PRECISION_TORCH_TYPE[precision],
)
self.max_norm = max_norm
@@ -1101,7 +1102,6 @@ def configure(
param_info = get_param_info(optimizer)
if not isinstance(model, ModelWrapper):
use_ddp = self.dp_size > 1 and self.pp_size == 1 and self.zero_stage == 0
- print("use_ddp", use_ddp)
model = HybridParallelModule(
model,
precision=self.precision,
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index 44433ea63b45..d5a27877dc95 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -78,11 +78,11 @@ def backward(ctx, grad_output):
use_bias = ctx.use_bias
# In order to be hooked into Gemini's '__torch_function__', adding a view operation to weight and bias.
- # weight = weight.view(weight.shape)
- # bias = bias.view(bias.shape)
+ weight = weight.view(weight.shape)
+ if bias is not None:
+ bias = bias.view(bias.shape)
total_input = input
- # print("grad_output.shape", grad_output.shape, "weight.shape", weight.shape)
grad_input = grad_output.matmul(weight.T)
grad_output = grad_output.contiguous()
# Convert the tensor shapes to 2D for execution compatibility
@@ -93,12 +93,11 @@ def backward(ctx, grad_output):
if ctx.async_grad_allreduce:
# Asynchronous all-reduce
handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True)
- # Delay the start of weight gradient computation shortly (3us) to have
+            # Rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
# all-reduce scheduled first and have GPU resources allocated
- # _ = torch.empty(1, device=grad_output.device) + 1
+ _ = torch.empty(1, device=grad_output.device) + 1
grad_weight = total_input.t().matmul(grad_output)
- # print("use_biasuse_biasuse_biasuse_biasuse_bias",use_bias)
grad_bias = grad_output.sum(dim=0) if use_bias else None
if ctx.async_grad_allreduce:
@@ -131,8 +130,8 @@ def backward(ctx, grad_output):
use_bias = ctx.use_bias
# In order to be hooked into Gemini's '__torch_function__', adding a view operation to bias.
- # if use_bias:
- # bias.view(bias.shape)
+ if use_bias:
+ bias.view(bias.shape)
total_input = input
grad_input = grad_output.matmul(weight)
@@ -145,9 +144,9 @@ def backward(ctx, grad_output):
if ctx.async_grad_allreduce:
# Asynchronous all-reduce
handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True)
- # Delay the start of weight gradient computation shortly (3us) to have
+            # Rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
# all-reduce scheduled first and have GPU resources allocated
- # _ = torch.empty(1, device=grad_output.device) + 1
+ _ = torch.empty(1, device=grad_output.device) + 1
if _grad_accum_fusion_available and weight.grad is not None:
grad = weight.grad
@@ -207,8 +206,8 @@ def backward(ctx, grad_output):
overlap = ctx.overlap
# In order to be hooked into Gemini's '__torch_function__', adding a view operation to weight and bias. Used in FusedLayerNorm
- # if use_bias:
- # bias = bias.view(bias.shape)
+ if use_bias:
+ bias = bias.view(bias.shape)
if not overlap:
input_parallel = _gather(input_, dim, process_group)
diff --git a/colossalai/shardformer/layer/normalization.py b/colossalai/shardformer/layer/normalization.py
index fed488bbfd2c..4aa281290340 100644
--- a/colossalai/shardformer/layer/normalization.py
+++ b/colossalai/shardformer/layer/normalization.py
@@ -7,6 +7,7 @@
from colossalai.lazy import LazyInitContext
+from ._operation import hook_paramter_in_backward
from .utils import SeqParallelUtils
__all__ = ["FusedLayerNorm", "FusedRMSNorm", "LayerNorm", "RMSNorm", "BaseLayerNorm"]
@@ -28,7 +29,7 @@ def __init__(self, normalized_shape, eps=0.00001, elementwise_affine=True):
def forward(self, input):
output = super().forward(input)
- # output = hook_paramter_in_backward(output, self.weight, self.bias)
+ output = hook_paramter_in_backward(output, self.weight, self.bias)
return output
class FusedRMSNormWithHook(ApexFusedRMSNorm):
@@ -37,7 +38,7 @@ def __init__(self, normalized_shape, eps=0.00001, elementwise_affine=True):
def forward(self, input):
output = super().forward(input)
- # output = hook_paramter_in_backward(output, self.weight)
+ output = hook_paramter_in_backward(output, self.weight)
return output
except ImportError:
@@ -78,7 +79,7 @@ def __init__(self, hidden_size, eps=0.00001):
def forward(self, input):
output = super().forward(input)
- # output = hook_paramter_in_backward(output, self.weight, self.bias)
+ output = hook_paramter_in_backward(output, self.weight, self.bias)
return output
diff --git a/colossalai/shardformer/layer/qkv_fused_linear.py b/colossalai/shardformer/layer/qkv_fused_linear.py
index 267c1bcce4ac..12476d050600 100644
--- a/colossalai/shardformer/layer/qkv_fused_linear.py
+++ b/colossalai/shardformer/layer/qkv_fused_linear.py
@@ -320,8 +320,9 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
else:
# Set up backprop all-reduce.
input_parallel = reduce_backward(input_, self.process_group)
- input_parallel = input_
- output_parallel = matmul_with_async_comm(input_parallel, self.weight, bias, self.process_group, False)
+ output_parallel = matmul_with_async_comm(
+ input_parallel, self.weight, bias, self.process_group, self.async_communication
+ )
if self.gather_output:
# All-gather across the partitions.
@@ -330,8 +331,7 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
output = output_parallel
if self.skip_bias_add:
- # return output, self.bias
- return output
+ return output, self.bias
else:
return output
@@ -528,8 +528,7 @@ def forward(self, input_: Tensor) -> Tensor:
handle.wait()
output = torch.cat(output_parallel_list, dim=-1)
else:
- # output_parallel = torch.matmul(input_, self.weight)
- output_parallel = matmul_with_async_comm(input_, self.weight, None, self.process_group, False)
+ output_parallel = torch.matmul(input_, self.weight)
if self.seq_parallel:
output = linear_reducescatter_forward_gather_backward(output_parallel, self.process_group, 1)
else:
@@ -540,8 +539,7 @@ def forward(self, input_: Tensor) -> Tensor:
output = output + self.bias
return output
else:
- # return output, self.bias
- return output
+ return output, self.bias
# ====================================
@@ -736,7 +734,6 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
output = output_parallel
if self.skip_bias_add:
- # return output, self.bias
- return output
+ return output, self.bias
else:
return output
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index f506c3ad97bf..3e5cc6015adc 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -330,13 +330,12 @@ def gpt2_lmhead_model_forward(
loss_fct = CrossEntropyLoss()
shift_logits = shift_logits.view(-1, shift_logits.size(-1))
shift_labels = shift_labels.view(-1)
- # if shard_config.enable_tensor_parallelism:
- # loss = cross_entropy_1d(
- # shift_logits, shift_labels, process_group=shard_config.tensor_parallel_process_group
- # )
- # else:
- # loss = loss_fct(shift_logits, shift_labels)
- loss = loss_fct(shift_logits, shift_labels)
+ if shard_config.enable_tensor_parallelism:
+ loss = cross_entropy_1d(
+ shift_logits, shift_labels, process_group=shard_config.tensor_parallel_process_group
+ )
+ else:
+ loss = loss_fct(shift_logits, shift_labels)
if not return_dict:
output = (lm_logits,) + outputs[1:]
@@ -727,7 +726,7 @@ def gpt2_for_sequence_classification_forward(
)
-def get_gpt2_flash_attention_forward(shard_config: ShardConfig):
+def get_gpt2_flash_attention_forward():
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention
from colossalai.nn.layer.colo_attention import AttnMaskType, ColoAttention
@@ -778,12 +777,10 @@ def forward(
else:
present = None
- flash_attention_mask = None
if not self.is_cross_attention:
attn_mask_type = AttnMaskType.causal
- else:
- attn_mask_type = None
- if not getattr(shard_config, "causal_lm", False) and attention_mask != None:
+ flash_attention_mask = None
+ if attention_mask != None:
flash_attention_mask = ~(attention_mask[:, :, -1].squeeze(1).to(torch.bool)).contiguous()
if not torch.all(flash_attention_mask):
if attn_mask_type == AttnMaskType.causal:
diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py
index 18e8bc290aec..303766993e3d 100644
--- a/colossalai/shardformer/policies/gpt2.py
+++ b/colossalai/shardformer/policies/gpt2.py
@@ -5,7 +5,12 @@
import colossalai.shardformer.layer as col_nn
-from ..modeling.gpt2 import GPT2PipelineForwards, get_gpt2_flash_attention_forward, gpt2_sequence_parallel_forward_fn
+from ..modeling.gpt2 import (
+ GPT2PipelineForwards,
+ get_gpt2_flash_attention_forward,
+ get_lm_forward_with_dist_cross_entropy,
+ gpt2_sequence_parallel_forward_fn,
+)
from .base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
__all__ = [
@@ -53,10 +58,10 @@ def module_policy(self):
suffix="wte",
target_module=col_nn.VocabParallelEmbedding1D,
),
- # SubModuleReplacementDescription(
- # suffix="drop",
- # target_module=col_nn.DropoutForParallelInput,
- # ),
+ SubModuleReplacementDescription(
+ suffix="drop",
+ target_module=col_nn.DropoutForParallelInput,
+ ),
]
)
@@ -82,30 +87,25 @@ def module_policy(self):
SubModuleReplacementDescription(
suffix="mlp.c_fc",
target_module=col_nn.GPT2FusedLinearConv1D_Col,
- kwargs={
- "n_fused": 1,
- "seq_parallel": use_sequence_parallel,
- "overlap": overlap,
- "skip_bias_add": True,
- },
+ kwargs={"n_fused": 1, "seq_parallel": use_sequence_parallel, "overlap": overlap},
),
SubModuleReplacementDescription(
suffix="mlp.c_proj",
target_module=col_nn.GPT2FusedLinearConv1D_Row,
- kwargs={"seq_parallel": use_sequence_parallel, "skip_bias_add": True},
+ kwargs={"seq_parallel": use_sequence_parallel},
+ ),
+ SubModuleReplacementDescription(
+ suffix="attn.attn_dropout",
+ target_module=col_nn.DropoutForParallelInput,
+ ),
+ SubModuleReplacementDescription(
+ suffix="attn.resid_dropout",
+ target_module=col_nn.DropoutForParallelInput,
+ ),
+ SubModuleReplacementDescription(
+ suffix="mlp.dropout",
+ target_module=col_nn.DropoutForParallelInput,
),
- # SubModuleReplacementDescription(
- # suffix="attn.attn_dropout",
- # target_module=col_nn.DropoutForParallelInput,
- # ),
- # SubModuleReplacementDescription(
- # suffix="attn.resid_dropout",
- # target_module=col_nn.DropoutForParallelInput,
- # ),
- # SubModuleReplacementDescription(
- # suffix="mlp.dropout",
- # target_module=col_nn.DropoutForParallelInput,
- # ),
],
)
@@ -145,7 +145,7 @@ def module_policy(self):
if self.shard_config.enable_flash_attention:
self.append_or_create_method_replacement(
description={
- "forward": get_gpt2_flash_attention_forward(self.shard_config),
+ "forward": get_gpt2_flash_attention_forward(),
},
policy=policy,
target_key=GPT2Attention,
@@ -269,10 +269,10 @@ def module_policy(self):
GPT2LMHeadModel: ModulePolicyDescription(
sub_module_replacement=[
SubModuleReplacementDescription(
- suffix="lm_head", target_module=col_nn.Linear1D_Col, kwargs={"gather_output": True}
+ suffix="lm_head", target_module=col_nn.Linear1D_Col, kwargs={"gather_output": False}
)
],
- # method_replacement={"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)},
+ method_replacement={"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)},
)
}
module_policy.update(addon_module)
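
Restoring gather_output=False together with get_lm_forward_with_dist_cross_entropy couples the two hunks above: with gather_output=False the Linear1D_Col lm_head leaves the logits sharded along the vocabulary dimension, so plain CrossEntropyLoss cannot be applied directly; cross_entropy_1d computes the loss over the vocab-sharded logits with collectives instead of first all-gathering the full-vocab logits. The dispatch, schematically (names as in the modeling hunk above; cross_entropy_1d is the ColossalAI layer import):

    def dist_lm_loss(shift_logits, shift_labels, shard_config, loss_fct):
        # shift_logits: [N, vocab_size / tp_size] per rank; shift_labels: [N]
        if shard_config.enable_tensor_parallelism:
            # loss over vocab-sharded logits; no full-vocab all-gather needed
            return cross_entropy_1d(
                shift_logits, shift_labels, process_group=shard_config.tensor_parallel_process_group
            )
        return loss_fct(shift_logits, shift_labels)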
diff --git a/examples/language/gpt/hybridparallelism/benchmark.py b/examples/language/gpt/hybridparallelism/benchmark.py
index 6e353f8bb000..d7232938ae98 100644
--- a/examples/language/gpt/hybridparallelism/benchmark.py
+++ b/examples/language/gpt/hybridparallelism/benchmark.py
@@ -131,11 +131,7 @@ def empty_init():
pp_style=args.pp_style,
zero_stage=args.zero,
num_model_chunks=args.num_model_chunks,
- # enable_all_optimization=True,
- # enable_flash_attention=True,
- # enable_jit_fused=True,
- enable_fused_normalization=True,
- # enable_sequence_parallelism=True,
+ enable_all_optimization=True,
num_microbatches=args.mbs,
cpu_offload=args.cpu_offload,
precision="bf16",
@@ -176,18 +172,14 @@ def empty_init():
else nullcontext()
)
- # with init_ctx:
- # model = GPT2LMHeadModel(config)
- model = GPT2LMHeadModel(config)
+ with init_ctx:
+ model = GPT2LMHeadModel(config)
if args.grad_checkpoint:
model.gradient_checkpointing_enable()
model_numel = get_model_numel(model)
coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")
- # print("args.ignore_steps", args.ignore_steps)
- # print("args.batch_size", args.batch_size)
- # print("max_length", args.max_length)
performance_evaluator = PerformanceEvaluator(
model_numel,
model.config.n_layer,
@@ -226,24 +218,6 @@ def empty_init():
optimizer.step()
optimizer.zero_grad()
performance_evaluator.on_step_end(**batch)
-
- # for step, batch in enumerate(tqdm(dataloader, desc="Step", disable=not coordinator.is_master())):
- # performance_evaluator.on_step_start(step)
-
- # with torch.profiler.profile(
- # activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
- # schedule=torch.profiler.schedule(wait=1, warmup=2, active=3, repeat=5),
- # on_trace_ready=torch.profiler.tensorboard_trace_handler("/home/jiangmingyan/workspace/trace/shardformer/GPT2-12-bf16"),
- # with_stack=True,
- # record_shapes=True
- # ) as prof:
- # for _ in range(0 + 2 + 5):
- # outputs = model(**batch)
- # loss = outputs[0]
- # booster.backward(loss, optimizer)
- # optimizer.step()
- # optimizer.zero_grad()
- # prof.step()
coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
performance_evaluator.on_fit_end()
diff --git a/examples/language/performance_evaluator.py b/examples/language/performance_evaluator.py
index ba52548f97b6..c2169a730a88 100644
--- a/examples/language/performance_evaluator.py
+++ b/examples/language/performance_evaluator.py
@@ -107,7 +107,6 @@ def on_step_end(self, input_ids: Tensor, **kwargs) -> None:
def on_fit_end(self) -> None:
avg_duration = all_reduce_mean(self.timer.duration, self.coordinator.world_size)
- # avg_duration = self.timer.duration
avg_throughput = self.num_samples * self.dp_world_size / (avg_duration + 1e-12)
mp_world_size = self.coordinator.world_size // self.dp_world_size
avg_tflops_per_gpu_megatron = self.flop_megatron / 1e12 / (avg_duration + 1e-12) / mp_world_size
From 85f0cea135ab4e1bf38c2b4049c04b7214214687 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 26 Feb 2024 19:39:47 +0800
Subject: [PATCH 32/33] fix
---
colossalai/shardformer/layer/_operation.py | 16 ++++++----------
colossalai/shardformer/shard/shardformer.py | 3 +++
.../language/gpt/hybridparallelism/benchmark.py | 10 +++++-----
.../test_layer/test_gpt2_qkv_fused_linear_1d.py | 5 ++++-
.../test_layer/test_linear_1d.py | 3 +++
.../test_layer/test_qkv_fused_linear_1d.py | 5 ++++-
6 files changed, 25 insertions(+), 17 deletions(-)
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index d5a27877dc95..d4960c7e4bde 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -94,8 +94,7 @@ def backward(ctx, grad_output):
# Asynchronous all-reduce
handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True)
             # Rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
- # all-reduce scheduled first and have GPU resources allocated
- _ = torch.empty(1, device=grad_output.device) + 1
+            # all-reduce scheduled first and GPU resources allocated (CUDA_DEVICE_MAX_CONNECTIONS=1 is set in shardformer.py)
grad_weight = total_input.t().matmul(grad_output)
grad_bias = grad_output.sum(dim=0) if use_bias else None
@@ -145,8 +144,7 @@ def backward(ctx, grad_output):
# Asynchronous all-reduce
handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True)
             # Rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
- # all-reduce scheduled first and have GPU resources allocated
- _ = torch.empty(1, device=grad_output.device) + 1
+            # all-reduce scheduled first and GPU resources allocated (CUDA_DEVICE_MAX_CONNECTIONS=1 is set in shardformer.py)
if _grad_accum_fusion_available and weight.grad is not None:
grad = weight.grad
@@ -229,9 +227,8 @@ def backward(ctx, grad_output):
input_.shape, dtype=input_parallel.dtype, device=input_parallel.device
).contiguous()
handle = dist.reduce_scatter(output, input_list, group=process_group, async_op=True)
- # Delay the start of weight gradient computation shortly (3us) to have
- # reduce-scatter scheduled first and have GPU resources allocated
- _ = torch.empty(1, device=grad_output.device) + 1
+            # Rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
+            # reduce-scatter scheduled first and GPU resources allocated (CUDA_DEVICE_MAX_CONNECTIONS=1 is set in shardformer.py)
if _grad_accum_fusion_available and weight.grad is not None:
grad = weight.grad
@@ -395,9 +392,8 @@ def backward(ctx, grad_output):
input_.shape, dtype=input_parallel.dtype, device=input_parallel.device
).contiguous()
handle = dist.reduce_scatter(output, input_list, group=process_group, async_op=True)
- # Delay the start of weight gradient computation shortly (3us) to have
- # reduce-scatter scheduled first and have GPU resources allocated
- _ = torch.empty(1, device=grad_output.device) + 1
+ # Rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
+ # reduce-scatter scheduled first and have GPU resources allocated; the variable is set in shardformer.py
grad_weight = total_input.t().matmul(grad_output)
grad_bias = grad_output.sum(dim=0) if use_bias else None
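The pattern these hunks touch: launch the collective asynchronously, then immediately enqueue the weight-gradient matmul. With CUDA_DEVICE_MAX_CONNECTIONS=1 the device launches kernels in issue order, so the communication kernel gets resources first and the matmul overlaps with it. A minimal sketch, assuming an initialized process group and illustrative names:

    import torch
    import torch.distributed as dist

    def overlapped_backward(grad_output, total_input, grad_input, process_group):
        # Kick off the all-reduce of the input gradient without blocking the host.
        handle = dist.all_reduce(grad_input, group=process_group, async_op=True)
        # With CUDA_DEVICE_MAX_CONNECTIONS=1 this matmul is issued to the GPU after
        # the all-reduce, so the two overlap instead of the matmul racing ahead.
        grad_weight = total_input.t().matmul(grad_output)
        grad_bias = grad_output.sum(dim=0)
        handle.wait()  # make sure grad_input is complete before it is consumed
        return grad_input, grad_weight, grad_bias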
diff --git a/colossalai/shardformer/shard/shardformer.py b/colossalai/shardformer/shard/shardformer.py
index 7a0d75bf2f2a..27c1c147a121 100644
--- a/colossalai/shardformer/shard/shardformer.py
+++ b/colossalai/shardformer/shard/shardformer.py
@@ -1,3 +1,4 @@
+import os
from typing import Dict, List, Tuple
import torch.nn as nn
@@ -9,6 +10,8 @@
from .shard_config import ShardConfig
from .sharder import ModelSharder
+os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
+
class ShardFormer:
"""
diff --git a/examples/language/gpt/hybridparallelism/benchmark.py b/examples/language/gpt/hybridparallelism/benchmark.py
index d7232938ae98..1315deae6eb0 100644
--- a/examples/language/gpt/hybridparallelism/benchmark.py
+++ b/examples/language/gpt/hybridparallelism/benchmark.py
@@ -25,10 +25,10 @@
# Constants
# ==============================
MODEL_CONFIGS = {
- "small": GPT2Config(activation_function="gelu"),
- "medium": GPT2Config(n_embd=1024, n_head=16, n_layer=24, activation_function="gelu"),
- "large": GPT2Config(n_embd=1280, n_head=20, n_layer=36, activation_function="gelu"),
- "default": GPT2Config(n_embd=4096, n_head=32, n_layer=32, n_positions=4096, activation_function="gelu"),
+ "118M": GPT2Config(activation_function="gelu"),
+ "338M": GPT2Config(n_embd=1024, n_head=16, n_layer=24, activation_function="gelu"),
+ "738M": GPT2Config(n_embd=1280, n_head=20, n_layer=36, activation_function="gelu"),
+ "6.21B": GPT2Config(n_embd=4096, n_head=32, n_layer=32, n_positions=4096, activation_function="gelu"),
}
@@ -37,7 +37,7 @@ def main():
# Parse Arguments
# ==============================
parser = argparse.ArgumentParser()
- parser.add_argument("-c", "--config", type=str, default="default", help="Model configuration")
+ parser.add_argument("-c", "--config", type=str, default="6.21B", help="Model configuration")
parser.add_argument(
"-p",
"--plugin",
diff --git a/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py b/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py
index 10ffdcd7138c..e056860ede57 100644
--- a/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py
+++ b/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py
@@ -1,3 +1,4 @@
+import os
from contextlib import nullcontext
import torch
@@ -11,8 +12,10 @@
from colossalai.shardformer.layer.qkv_fused_linear import split_fused_qkv_in_gpt2_style
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-
# This code is copied from https://github.com/huggingface/transformers
+os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
+
+
class Conv1D(nn.Module):
"""
1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).
diff --git a/tests/test_shardformer/test_layer/test_linear_1d.py b/tests/test_shardformer/test_layer/test_linear_1d.py
index 5bacf1865c48..defa4afb919b 100644
--- a/tests/test_shardformer/test_layer/test_linear_1d.py
+++ b/tests/test_shardformer/test_layer/test_linear_1d.py
@@ -1,3 +1,4 @@
+import os
from contextlib import nullcontext
import torch
@@ -11,6 +12,8 @@
from colossalai.tensor.d_tensor import is_distributed_tensor
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
+
def check_linear_1d_col(lazy_init: bool, seq_parallel: bool, overlap: bool):
ctx = LazyInitContext() if lazy_init else nullcontext()
diff --git a/tests/test_shardformer/test_layer/test_qkv_fused_linear_1d.py b/tests/test_shardformer/test_layer/test_qkv_fused_linear_1d.py
index b02d581810cd..5e996d2ba985 100644
--- a/tests/test_shardformer/test_layer/test_qkv_fused_linear_1d.py
+++ b/tests/test_shardformer/test_layer/test_qkv_fused_linear_1d.py
@@ -1,3 +1,4 @@
+import os
from contextlib import nullcontext
import torch
@@ -11,8 +12,10 @@
from colossalai.shardformer.layer.qkv_fused_linear import split_fused_qkv_in_gpt2_style
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-
# This code is copied from https://github.com/huggingface/transformers
+os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
+
+
class Conv1D(nn.Module):
"""
1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).
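For readers without the transformers source at hand, Conv1D is a linear layer whose weight is stored transposed, with shape (nx, nf). A sketch of the upstream definition, reproduced from memory, so treat details as approximate:

    import torch
    import torch.nn as nn

    class Conv1D(nn.Module):
        """Linear layer with a (nx, nf)-shaped weight, as used by GPT-2."""

        def __init__(self, nf: int, nx: int):
            super().__init__()
            self.nf = nf
            self.weight = nn.Parameter(torch.empty(nx, nf))
            self.bias = nn.Parameter(torch.zeros(nf))
            nn.init.normal_(self.weight, std=0.02)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            size_out = x.size()[:-1] + (self.nf,)
            # bias + x @ weight over the flattened leading dimensions
            x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
            return x.view(size_out)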
From 8728c3c51f827f0521f4e9cab581edd6395dba00 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Tue, 27 Feb 2024 13:10:03 +0800
Subject: [PATCH 33/33] Update shardformer.py
---
colossalai/shardformer/shard/shardformer.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/colossalai/shardformer/shard/shardformer.py b/colossalai/shardformer/shard/shardformer.py
index 27c1c147a121..b132f47fd810 100644
--- a/colossalai/shardformer/shard/shardformer.py
+++ b/colossalai/shardformer/shard/shardformer.py
@@ -10,6 +10,7 @@
from .shard_config import ShardConfig
from .sharder import ModelSharder
+# Set CUDA_DEVICE_MAX_CONNECTIONS=1 so that when communication and computation kernels overlap, they are scheduled in the intended order (communication first).
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"