From 0dfb03af98cada5be05f479f7ae93e8cca712f58 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 14 Sep 2023 13:43:13 +0800
Subject: [PATCH 01/23] init policy

---
 .../language/openmoe/model/llama_policy.py    | 468 ++++++++++++++++++
 1 file changed, 468 insertions(+)
 create mode 100644 examples/language/openmoe/model/llama_policy.py

diff --git a/examples/language/openmoe/model/llama_policy.py b/examples/language/openmoe/model/llama_policy.py
new file mode 100644
index 000000000000..c4421de6a36c
--- /dev/null
+++ b/examples/language/openmoe/model/llama_policy.py
@@ -0,0 +1,468 @@
+import warnings
+from functools import partial
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from modeling_openmoe import (
+    OpenMoeAttention,
+    OpenMoeDecoderLayer,
+    OpenMoeForCausalLM,
+    OpenMoeMLP,
+    OpenMoeModel,
+    OpenMoePreTrainedModel,
+)
+from torch import Tensor
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, Module, MSELoss
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    SequenceClassifierOutputWithPast,
+)
+from transformers.utils import logging
+
+from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col, Linear1D_Row, VocabParallelEmbedding1D
+from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
+
+__all__ = ['OpenMoePolicy', 'OpenMoeForCausalLMPolicy']
+
+
+class OpenMoePolicy(Policy):
+
+    def config_sanity_check(self):
+        pass
+
+    def preprocess(self):
+        if self.shard_config.enable_tensor_parallelism:
+            # Resize embedding
+            vocab_size = self.model.config.vocab_size
+            world_size = self.shard_config.tensor_parallel_size
+
+            if vocab_size % world_size != 0:
+                new_vocab_size = vocab_size + world_size - vocab_size % world_size
+                self.model.resize_token_embeddings(new_vocab_size)
+
+        return self.model
+
+    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
+        policy = {}
+
+        if self.shard_config.enable_sequence_parallelism:
+            self.shard_config.enable_sequence_parallelism = False
+            raise NotImplementedError(
+                "openmoe dosen't support sequence parallelism now, will ignore the sequence parallelism flag.")
+
+        if self.shard_config.enable_tensor_parallelism:
+            raise NotImplementedError("Tensor parallelism is not supported for openmoe model now.")
+
+        # optimization configuration
+        if self.shard_config.enable_fused_normalization:
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(
+                    suffix="input_layernorm",
+                    target_module=FusedRMSNorm,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="post_attention_layernorm",
+                    target_module=FusedRMSNorm,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="pre_extra_mlp_layernorm",
+                    target_module=FusedRMSNorm,
+                )
+            ],
+                                                        policy=policy,
+                                                        target_key=OpenMoeDecoderLayer)
+
+            self.append_or_create_submodule_replacement(description=SubModuleReplacementDescription(
+                suffix="norm",
+                target_module=FusedRMSNorm,
+            ),
+                                                        policy=policy,
+                                                        target_key=OpenMoeModel)
+
+        if self.shard_config.enable_flash_attention:
+            raise NotImplementedError("Flash attention has already been replaced in openmoe.")
+
+        return policy
+
+    def postprocess(self):
+        return self.model
+
+    def set_pipeline_forward(self, model_cls: nn.Module, new_forward: Callable, policy: Dict) -> None:
+        """If under pipeline parallel setting, replacing the original forward method of huggingface
+           to customized forward method, and add this changing to policy."""
+        if self.pipeline_stage_manager:
+            stage_manager = self.pipeline_stage_manager
+            if self.model.__class__.__name__ == "LlamaModel":
+                module = self.model
+            else:
+                module = self.model.model
+
+            layers_per_stage = Policy.distribute_layers(len(module.layers), stage_manager.num_stages)
+            stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)
+            method_replacement = {'forward': partial(new_forward, stage_manager=stage_manager, stage_index=stage_index)}
+            self.append_or_create_method_replacement(description=method_replacement,
+                                                     policy=policy,
+                                                     target_key=model_cls)
+
+        return
+
+    def get_held_layers(self) -> List[Module]:
+        """Get pipeline layers for current stage."""
+        assert self.pipeline_stage_manager is not None
+
+        if self.model.__class__.__name__ == 'LlamaModel':
+            module = self.model
+        else:
+            module = self.model.model
+        stage_manager = self.pipeline_stage_manager
+
+        held_layers = []
+        layers_per_stage = self.distribute_layers(len(module.layers), stage_manager.num_stages)
+        if stage_manager.is_first_stage():
+            held_layers.append(module.embed_tokens)
+        start_idx, end_idx = self.get_stage_index(layers_per_stage, stage_manager.stage)
+        held_layers.extend(module.layers[start_idx:end_idx])
+        if stage_manager.is_last_stage():
+            held_layers.append(module.norm)
+
+        return held_layers
+
+
+class OpenMoeModelPolicy(OpenMoePolicy):
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def module_policy(self):
+        policy = super().module_policy()
+        from transformers.models.llama.modeling_llama import LlamaModel
+        if self.pipeline_stage_manager:
+            # set None as default
+            self.set_pipeline_forward(model_cls=LlamaModel,
+                                      new_forward=OpenMoePipelineForwards.llama_model_forward,
+                                      policy=policy)
+        return policy
+
+    def get_held_layers(self) -> List[Module]:
+        """Get pipeline layers for current stage."""
+        held_layers = super().get_held_layers()
+        return held_layers
+
+    def get_shared_params(self) -> List[Dict[int, Tensor]]:
+        """No shared params in llama model"""
+        return []
+
+
+class OpenMoeForCausalLMPolicy(OpenMoePolicy):
+
+    def module_policy(self):
+
+        policy = super().module_policy()
+
+        if self.shard_config.enable_tensor_parallelism:
+            # add a new item for casual lm
+            new_item = {
+                OpenMoeForCausalLM:
+                    ModulePolicyDescription(sub_module_replacement=[
+                        SubModuleReplacementDescription(
+                            suffix="lm_head", target_module=Linear1D_Col, kwargs=dict(gather_output=True))
+                    ])
+            }
+            policy.update(new_item)
+
+        if self.pipeline_stage_manager:
+            # set None as default
+            self.set_pipeline_forward(model_cls=OpenMoeForCausalLM,
+                                      new_forward=OpenMoePipelineForwards.llama_for_causal_lm_forward,
+                                      policy=policy)
+
+        return policy
+
+    def get_held_layers(self) -> List[Module]:
+        """Get pipeline layers for current stage."""
+        stage_manager = self.pipeline_stage_manager
+        held_layers = super().get_held_layers()
+        if stage_manager.is_last_stage():
+            held_layers.append(self.model.lm_head)
+        return held_layers
+
+    def get_shared_params(self) -> List[Dict[int, Tensor]]:
+        llama_model = self.model.model
+        if self.pipeline_stage_manager and self.pipeline_stage_manager.num_stages > 1:
+            if id(llama_model.embed_tokens.weight) == id(
+                    self.model.lm_head.weight) and self.pipeline_stage_manager.num_stages > 1:
+                # tie weights
+                return [{
+                    0: llama_model.embed_tokens.weight,
+                    self.pipeline_stage_manager.num_stages - 1: self.model.lm_head.weight
+                }]
+        return []
+
+
+class OpenMoePipelineForwards:
+    '''
+    This class serves as a micro library for forward function substitution of Llama models
+    under pipeline setting.
+    '''
+
+    @staticmethod
+    def llama_model_forward(
+        self: OpenMoeModel,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        stage_manager: Optional[PipelineStageManager] = None,
+        hidden_states: Optional[torch.FloatTensor] = None,
+        stage_index: Optional[List[int]] = None,
+    ):
+        logger = logging.get_logger(__name__)
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (output_hidden_states
+                                if output_hidden_states is not None else self.config.output_hidden_states)
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if stage_manager.is_first_stage():
+            if input_ids is not None and inputs_embeds is not None:
+                raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+            elif input_ids is not None:
+                batch_size, seq_length = input_ids.shape
+            elif inputs_embeds is not None:
+                batch_size, seq_length, _ = inputs_embeds.shape
+            else:
+                raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            if inputs_embeds is None:
+                inputs_embeds = self.embed_tokens(input_ids)
+            hidden_states = inputs_embeds
+        else:
+            input_shape = hidden_states.shape[:-1]
+            batch_size, seq_length = input_shape
+            device = hidden_states.device
+
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+
+        # TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
+        if output_attentions:
+            logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
+            output_attentions = False
+        if output_hidden_states:
+            logger.warning_once('output_hidden_states=True is not supported for pipeline models at the moment.')
+            output_hidden_states = False
+        if use_cache:
+            logger.warning_once('use_cache=True is not supported for pipeline models at the moment.')
+            use_cache = False
+
+        if past_key_values is not None:
+            past_key_values_length = past_key_values[0][0].shape[2]
+            seq_length_with_past = seq_length_with_past + past_key_values_length
+
+        if position_ids is None:
+            position_ids = torch.arange(past_key_values_length,
+                                        seq_length + past_key_values_length,
+                                        dtype=torch.long,
+                                        device=device)
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+
+        # embed positions, for the first stage, hidden_states is the input embeddings,
+        # for the other stages, hidden_states is the output of the previous stage
+        if attention_mask is None:
+            attention_mask = torch.ones((batch_size, seq_length_with_past),
+                                        dtype=torch.bool,
+                                        device=hidden_states.device)
+        attention_mask = self._prepare_decoder_attention_mask(attention_mask, (batch_size, seq_length), hidden_states,
+                                                              past_key_values_length)
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
+                use_cache = False
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+
+        start_idx, end_idx = stage_index[0], stage_index[1]
+        for idx, decoder_layer in enumerate(self.layers[start_idx:end_idx], start=start_idx):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+            if self.gradient_checkpointing and self.training:
+
+                def create_custom_forward(module):
+
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, output_attentions, None)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    None,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        if stage_manager.is_last_stage():
+            hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = next_decoder_cache if use_cache else None
+        if stage_manager.is_last_stage():
+            if not return_dict:
+                return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+            return BaseModelOutputWithPast(
+                last_hidden_state=hidden_states,
+                past_key_values=next_cache,
+                hidden_states=all_hidden_states,
+                attentions=all_self_attns,
+            )
+        # always return dict for imediate stage
+        return {'hidden_states': hidden_states}
+
+    @staticmethod
+    def llama_for_causal_lm_forward(
+        self: OpenMoeForCausalLM,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        stage_manager: Optional[PipelineStageManager] = None,
+        hidden_states: Optional[torch.FloatTensor] = None,
+        stage_index: Optional[List[int]] = None,
+    ):
+        r"""
+            Args:
+                labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+            Returns:
+
+            Example:
+
+            ```python
+            >>> from transformers import AutoTokenizer, LlamaForCausalLM
+
+            >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+            >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+            >>> prompt = "Hey, are you consciours? Can you talk to me?"
+            >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+            >>> # Generate
+            >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+            >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+            "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
+            ```"""
+        logger = logging.get_logger(__name__)
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (output_hidden_states
+                                if output_hidden_states is not None else self.config.output_hidden_states)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
+        if output_attentions:
+            logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
+            output_attentions = False
+        if output_hidden_states:
+            logger.warning_once('output_hidden_states=True is not supported for pipeline models at the moment.')
+            output_hidden_states = False
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = OpenMoePipelineForwards.llama_model_forward(
+            self.model,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            stage_manager=stage_manager,
+            hidden_states=hidden_states,
+            stage_index=stage_index,
+        )
+        past_key_values = None
+        all_hidden_states = None
+        all_self_attentions = None
+        all_cross_attentions = None
+
+        if stage_manager.is_last_stage():
+            hidden_states = outputs[0]
+            logits = self.lm_head(hidden_states)
+            loss = None
+            if labels is not None:
+                # Shift so that tokens < n predict n
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+                # Flatten the tokens
+                loss_fct = CrossEntropyLoss()
+                shift_logits = shift_logits.view(-1, self.config.vocab_size)
+                shift_labels = shift_labels.view(-1)
+                # Enable model parallelism
+                shift_labels = shift_labels.to(shift_logits.device)
+                loss = loss_fct(shift_logits, shift_labels)
+
+            if not return_dict:
+                output = (logits,) + outputs[1:]
+                return (loss,) + output if loss is not None else output
+
+            return CausalLMOutputWithPast(
+                loss=loss,
+                logits=logits,
+                past_key_values=outputs.past_key_values,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            )
+        else:
+            hidden_states = outputs.get('hidden_states')
+            return {'hidden_states': hidden_states}

From d8e4b8064f894605e15d31d10fd9854f4a444406 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 14 Sep 2023 13:52:49 +0800
Subject: [PATCH 02/23] rename

---
 examples/language/openmoe/model/__init__.py   |  0
 .../{llama_policy.py => openmoe_policy.py}    | 83 ++++++++++++++-----
 2 files changed, 63 insertions(+), 20 deletions(-)
 create mode 100644 examples/language/openmoe/model/__init__.py
 rename examples/language/openmoe/model/{llama_policy.py => openmoe_policy.py} (88%)

diff --git a/examples/language/openmoe/model/__init__.py b/examples/language/openmoe/model/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/examples/language/openmoe/model/llama_policy.py b/examples/language/openmoe/model/openmoe_policy.py
similarity index 88%
rename from examples/language/openmoe/model/llama_policy.py
rename to examples/language/openmoe/model/openmoe_policy.py
index c4421de6a36c..53d4675f14c0 100644
--- a/examples/language/openmoe/model/llama_policy.py
+++ b/examples/language/openmoe/model/openmoe_policy.py
@@ -4,14 +4,7 @@
 
 import torch
 import torch.nn as nn
-from modeling_openmoe import (
-    OpenMoeAttention,
-    OpenMoeDecoderLayer,
-    OpenMoeForCausalLM,
-    OpenMoeMLP,
-    OpenMoeModel,
-    OpenMoePreTrainedModel,
-)
+import torch.nn.functional as F
 from torch import Tensor
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, Module, MSELoss
 from transformers.modeling_outputs import (
@@ -21,10 +14,20 @@
 )
 from transformers.utils import logging
 
+from colossalai.moe.manager import MOE_MANAGER
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col, Linear1D_Row, VocabParallelEmbedding1D
 from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
 
+from .modeling_openmoe import (
+    OpenMoeAttention,
+    OpenMoeDecoderLayer,
+    OpenMoeForCausalLM,
+    OpenMoeMLP,
+    OpenMoeModel,
+    OpenMoePreTrainedModel,
+)
+
 __all__ = ['OpenMoePolicy', 'OpenMoeForCausalLMPolicy']
 
 
@@ -375,6 +378,7 @@ def llama_for_causal_lm_forward(
         stage_manager: Optional[PipelineStageManager] = None,
         hidden_states: Optional[torch.FloatTensor] = None,
         stage_index: Optional[List[int]] = None,
+        chunk_head: Optional[bool] = None,
     ):
         r"""
             Args:
@@ -401,6 +405,9 @@ def llama_for_causal_lm_forward(
             >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
             "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
             ```"""
+        # reset moe loss
+        MOE_MANAGER.reset_loss()
+
         logger = logging.get_logger(__name__)
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (output_hidden_states
@@ -438,19 +445,55 @@ def llama_for_causal_lm_forward(
 
         if stage_manager.is_last_stage():
             hidden_states = outputs[0]
-            logits = self.lm_head(hidden_states)
+            if self.pretraining_tp > 1:
+                lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.pretraining_tp, dim=0)
+                logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.pretraining_tp)]
+                logits = torch.cat(logits, dim=-1)
+
             loss = None
-            if labels is not None:
-                # Shift so that tokens < n predict n
-                shift_logits = logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
-                # Flatten the tokens
-                loss_fct = CrossEntropyLoss()
-                shift_logits = shift_logits.view(-1, self.config.vocab_size)
-                shift_labels = shift_labels.view(-1)
-                # Enable model parallelism
-                shift_labels = shift_labels.to(shift_logits.device)
-                loss = loss_fct(shift_logits, shift_labels)
+            # if no training, just do forward
+            if labels is None:
+                logits = self.lm_head(hidden_states)
+                logits = logits.float()
+            # the vocab size for openmoe is 30w+
+            # which causes great activation memory in training, up to 20G for one sequence
+            # so we use chunk and checkpoint to reduce memory
+            else:
+                if chunk_head == True:
+
+                    def create_custom_forward(module):
+
+                        def custom_forward(*inputs):
+                            logits = module(inputs[0])
+                            logits = logits.float()
+                            # Shift so that tokens < n predict n
+                            shift_logits = logits[..., :-1, :].contiguous().float()
+                            shift_labels = inputs[1][..., 1:].contiguous()
+                            # Flatten the tokens
+                            loss = self._calculate_loss(shift_logits, shift_labels)
+                            return loss
+
+                        return custom_forward
+
+                    aux_loss, z_loss = self._calculate_router_loss()
+                    loss = aux_loss + z_loss
+                    for batch_idx in range(hidden_states.shape[0]):
+                        loss = loss + torch.utils.checkpoint.checkpoint(
+                            create_custom_forward(self.lm_head),
+                            hidden_states[batch_idx:batch_idx + 1, :],
+                            labels[batch_idx:batch_idx + 1, :],
+                        )
+                    logits = None
+                else:
+                    logits = self.lm_head(hidden_states)
+                    logits = logits.float()
+                    # Shift so that tokens < n predict n
+                    shift_logits = logits[..., :-1, :].contiguous()
+                    shift_labels = labels[..., 1:].contiguous()
+                    # Flatten the tokens
+                    aux_loss, z_loss = self._calculate_router_loss()
+                    loss = aux_loss + z_loss
+                    loss = loss + self._calculate_loss(shift_logits, shift_labels)
 
             if not return_dict:
                 output = (logits,) + outputs[1:]

From 7f68f632c252dede6c5fd72be80e966bd8aeebcd Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 14 Sep 2023 17:15:15 +0800
Subject: [PATCH 03/23] update pp

---
 .../booster/plugin/hybrid_parallel_plugin.py  |  13 +-
 .../openmoe/model/modeling_openmoe.py         | 137 ++----------------
 .../language/openmoe/model/openmoe_policy.py  |  21 +--
 examples/language/openmoe/train.py            |  73 +++++++---
 4 files changed, 83 insertions(+), 161 deletions(-)

diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 3fbeebcc4110..d65bd437962e 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -22,6 +22,7 @@
 from colossalai.pipeline.schedule import OneForwardOneBackwardSchedule
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.shardformer import ShardConfig, ShardFormer
+from colossalai.shardformer.policies.base_policy import Policy
 from colossalai.zero.low_level import LowLevelZeroOptimizer
 
 from .pp_plugin_base import PipelinePluginBase
@@ -38,13 +39,15 @@ def _convert_floating_point(x, dtype: torch.dtype = torch.float16):
 class HybridParallelModule(ModelWrapper):
 
     def __init__(self, module: Module, precision: str, shard_config: ShardConfig, dp_group: ProcessGroup, use_ddp: bool,
-                 ddp_config: dict) -> None:
+                 ddp_config: dict, custom_policy: Policy) -> None:
 
         self.stage_manager = shard_config.pipeline_stage_manager
         self.dp_group = dp_group
 
         shardformer = ShardFormer(shard_config)
-        module, self.shared_params = shardformer.optimize(module)
+        if custom_policy is not None:
+            assert isinstance(custom_policy, object)
+        module, self.shared_params = shardformer.optimize(module, policy=custom_policy)
 
         # setting process groups for shared parameters
         self.shared_param_process_groups = []
@@ -302,7 +305,8 @@ def __init__(self,
                  zero_bucket_size_in_m: int = 12,
                  cpu_offload: bool = False,
                  communication_dtype: Optional[torch.dtype] = None,
-                 overlap_communication: bool = True) -> None:
+                 overlap_communication: bool = True,
+                 custom_policy: Policy = None) -> None:
 
         super().__init__()
         assert dist.get_world_size() % (
@@ -326,6 +330,7 @@ def __init__(self,
         self.pg_mesh = ProcessGroupMesh(self.dp_size, self.pp_size, self.tp_size)
         self.stage_manager = None
         self.schedule = None
+        self.custom_policy = custom_policy
         assert zero_stage in (0, 1, 2)
         if self.pp_size > 1:
             assert num_microbatches is not None or microbatch_size is not None, 'num_microbatches or microbatch_size must be specified when using pipeline parallelism'
@@ -405,7 +410,7 @@ def configure(
         if not isinstance(model, ModelWrapper):
             use_ddp = self.dp_size > 1 and self.pp_size == 1 and self.zero_stage == 0
             model = HybridParallelModule(model, self.precision, self.shard_config, self.dp_group, use_ddp,
-                                         self.ddp_config)
+                                         self.ddp_config, self.custom_policy)
         if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
             if self.zero_stage == 0:
                 if self.precision in ['fp16', 'bf16']:
diff --git a/examples/language/openmoe/model/modeling_openmoe.py b/examples/language/openmoe/model/modeling_openmoe.py
index 6ccbf64a60e4..7d95fedce26e 100644
--- a/examples/language/openmoe/model/modeling_openmoe.py
+++ b/examples/language/openmoe/model/modeling_openmoe.py
@@ -145,87 +145,6 @@ def apply_rotary_embedding(q, k, cos, sin, decode=False, rotary_index=None):
     return out_q, out_k
 
 
-class LlamaRotaryEmbedding(torch.nn.Module):
-
-    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
-        super().__init__()
-
-        self.dim = dim
-        self.max_position_embeddings = max_position_embeddings
-        self.base = base
-        inv_freq = 1.0 / (self.base**(torch.arange(0, self.dim, 2).float().to(device) / self.dim))
-        self.inv_freq = inv_freq
-
-        # Build here to make `torch.jit.trace` work.
-        self._set_cos_sin_cache(seq_len=max_position_embeddings,
-                                device=self.inv_freq.device,
-                                dtype=torch.get_default_dtype())
-
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
-
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
-
-    def forward(self, x, seq_len=None):
-        # x: [bs, num_attention_heads, seq_len, head_size]
-        if seq_len > self.max_seq_len_cached:
-            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
-
-        return (
-            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
-            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
-        )
-
-
-class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
-    """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
-
-    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
-        self.scaling_factor = scaling_factor
-        super().__init__(dim, max_position_embeddings, base, device)
-
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
-        t = t / self.scaling_factor
-
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
-
-
-class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
-    """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
-
-    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
-        self.scaling_factor = scaling_factor
-        super().__init__(dim, max_position_embeddings, base, device)
-
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-
-        if seq_len > self.max_position_embeddings:
-            base = self.base * ((self.scaling_factor * seq_len / self.max_position_embeddings) -
-                                (self.scaling_factor - 1))**(self.dim / (self.dim - 2))
-            inv_freq = 1.0 / (base**(torch.arange(0, self.dim, 2).float().to(device) / self.dim))
-            self.register_buffer("inv_freq", inv_freq)
-
-        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
-
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
-
-
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
     x1 = x[..., :x.shape[-1] // 2]
@@ -233,17 +152,6 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)
 
 
-def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
-    # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
-    cos = cos.squeeze(1).squeeze(0)    # [seq_len, dim]
-    sin = sin.squeeze(1).squeeze(0)    # [seq_len, dim]
-    cos = cos[position_ids].unsqueeze(1)    # [bs, 1, seq_len, dim]
-    sin = sin[position_ids].unsqueeze(1)    # [bs, 1, seq_len, dim]
-    q_embed = (q * cos) + (rotate_half(q) * sin)
-    k_embed = (k * cos) + (rotate_half(k) * sin)
-    return q_embed, k_embed
-
-
 def SwiGLU(x):
     """Gated linear unit activation function.
     Args:
@@ -256,7 +164,7 @@ def SwiGLU(x):
     return x1 * (x2 * torch.sigmoid(x2))
 
 
-class LlamaMLP(nn.Module):
+class OpenMoeMLP(nn.Module):
 
     def __init__(self, config):
         super().__init__()
@@ -302,7 +210,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 
 
-class LlamaAttention(nn.Module):
+class OpenMoeAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
     def __init__(self, config: LlamaConfig):
@@ -321,22 +229,6 @@ def __init__(self, config: LlamaConfig):
         self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
         self.sin, self.cos = generate_fixed_pos_embedding(self.head_dim, self.max_position_embeddings, 1e4)
-        self._init_rope()
-
-    def _init_rope(self):
-        if self.config.rope_scaling is None:
-            self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
-        else:
-            scaling_type = self.config.rope_scaling["type"]
-            scaling_factor = self.config.rope_scaling["factor"]
-            if scaling_type == "linear":
-                self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
-                    self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor)
-            elif scaling_type == "dynamic":
-                self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
-                    self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor)
-            else:
-                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
 
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
@@ -446,13 +338,13 @@ def forward(
         return attn_output, attn_weights, past_key_value
 
 
-class LlamaDecoderLayer(nn.Module):
+class OpenMoeDecoderLayer(nn.Module):
 
     def __init__(self, config: LlamaConfig, moe: bool):
         super().__init__()
         self.hidden_size = config.hidden_size
         self.moe = moe
-        self.self_attn = LlamaAttention(config=config)
+        self.self_attn = OpenMoeAttention(config=config)
         self.input_layernorm = T5LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = T5LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
         if self.moe:
@@ -470,9 +362,9 @@ def __init__(self, config: LlamaConfig, moe: bool):
                 activation=config.hidden_act,
                 gated=config.gated)
             self.pre_extra_mlp_layernorm = T5LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
-            self.extra_mlp = LlamaMLP(config)
+            self.extra_mlp = OpenMoeMLP(config)
         else:
-            self.mlp = LlamaMLP(config)
+            self.mlp = OpenMoeMLP(config)
 
     def forward(
         self,
@@ -556,7 +448,7 @@ def forward(
     "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
     LLAMA_START_DOCSTRING,
 )
-class LlamaPreTrainedModel(PreTrainedModel):
+class OpenMoePreTrainedModel(PreTrainedModel):
     config_class = LlamaConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
@@ -575,7 +467,7 @@ def _init_weights(self, module):
                 module.weight.data[module.padding_idx].zero_()
 
     def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, LlamaModel):
+        if isinstance(module, OpenMoeModel):
             module.gradient_checkpointing = value
 
 
@@ -647,7 +539,7 @@ def _set_gradient_checkpointing(self, module, value=False):
     "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
     LLAMA_START_DOCSTRING,
 )
-class LlamaModel(LlamaPreTrainedModel):
+class OpenMoeModel(OpenMoePreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
 
@@ -662,7 +554,7 @@ def __init__(self, config: LlamaConfig):
 
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.layers = nn.ModuleList([
-            LlamaDecoderLayer(config, moe=True if (i + 1) % config.moe_layer_interval == 0 else False)
+            OpenMoeDecoderLayer(config, moe=True if (i + 1) % config.moe_layer_interval == 0 else False)
             for i in range(config.num_hidden_layers)
         ])
         self.norm = T5LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -827,12 +719,12 @@ def custom_forward(*inputs):
         )
 
 
-class OpenMoeForCausalLM(LlamaPreTrainedModel):
+class OpenMoeForCausalLM(OpenMoePreTrainedModel):
     # _tied_weights_keys = ["lm_head.weight"]
 
     def __init__(self, config):
         super().__init__(config)
-        self.model = LlamaModel(config)
+        self.model = OpenMoeModel(config)
         self.pretraining_tp = config.pretraining_tp
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
@@ -1029,10 +921,7 @@ def _calculate_router_loss(self):
         z_loss = self.config.router_z_loss_factor * sum(z_loss) / len(z_loss)
         return aux_loss, z_loss
 
-    def _calculate_loss(self,
-                        logits: torch.Tensor,
-                        targets: torch.Tensor
-                        ) -> torch.Tensor:
+    def _calculate_loss(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
         """Compute cross entropy and entropy for log probs and targets.
 
         Args:
diff --git a/examples/language/openmoe/model/openmoe_policy.py b/examples/language/openmoe/model/openmoe_policy.py
index 53d4675f14c0..df82e6deb721 100644
--- a/examples/language/openmoe/model/openmoe_policy.py
+++ b/examples/language/openmoe/model/openmoe_policy.py
@@ -1,32 +1,21 @@
 import warnings
 from functools import partial
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Callable, Dict, List, Optional, Union
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import Tensor
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, Module, MSELoss
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPast,
-    CausalLMOutputWithPast,
-    SequenceClassifierOutputWithPast,
-)
+from torch.nn import Module
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.utils import logging
 
 from colossalai.moe.manager import MOE_MANAGER
 from colossalai.pipeline.stage_manager import PipelineStageManager
-from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col, Linear1D_Row, VocabParallelEmbedding1D
+from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col
 from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
 
-from .modeling_openmoe import (
-    OpenMoeAttention,
-    OpenMoeDecoderLayer,
-    OpenMoeForCausalLM,
-    OpenMoeMLP,
-    OpenMoeModel,
-    OpenMoePreTrainedModel,
-)
+from .modeling_openmoe import OpenMoeDecoderLayer, OpenMoeForCausalLM, OpenMoeModel
 
 __all__ = ['OpenMoePolicy', 'OpenMoeForCausalLMPolicy']
 
diff --git a/examples/language/openmoe/train.py b/examples/language/openmoe/train.py
index 132f17a9ba0f..3ce97841730a 100644
--- a/examples/language/openmoe/train.py
+++ b/examples/language/openmoe/train.py
@@ -5,6 +5,7 @@
 import transformers
 from huggingface_hub import snapshot_download
 from model.modeling_openmoe import OpenMoeForCausalLM
+from model.openmoe_policy import OpenMoeForCausalLMPolicy
 from torch.utils.data import Dataset
 from tqdm import tqdm
 from transformers import Adafactor, T5Tokenizer
@@ -13,7 +14,7 @@
 import colossalai
 from colossalai import get_default_parser
 from colossalai.booster import Booster
-from colossalai.booster.plugin import LowLevelZeroPlugin
+from colossalai.booster.plugin import HybridParallelPlugin, LowLevelZeroPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.moe import MoeCheckpintIO
@@ -59,6 +60,7 @@ def __getitem__(self, idx):
 
 
 def parse_args():
+    # basic settings
     parser = get_default_parser()
     parser.add_argument("--model_name",
                         type=str,
@@ -74,6 +76,16 @@ def parse_args():
                         default=4,
                         help="Batch size (per dp group) for the training dataloader.")
     parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.")
+    parser.add_argument("--plugin",
+                        type=str,
+                        default="zero2",
+                        help="parallel plugin",
+                        choices=["zero1", "zero2", "hybrid"])
+    # hybrid plugin
+    parser.add_argument("--tp_size", type=int, default=1, help="tp size")
+    parser.add_argument("--pp_size", type=int, default=2, help="pp size")
+    parser.add_argument("--zero_stage", type=int, default=1, help="zero stage in hybrid plugin")
+    parser.add_argument("--microbatch_size", type=int, default=1, help="microbatch size")
     # loss
     parser.add_argument("--router_aux_loss_factor", type=float, default=0.01, help="router_aux_loss_factor.")
     parser.add_argument("--router_z_loss_factor", type=float, default=0.0001, help="router_z_loss_factor.")
@@ -95,7 +107,7 @@ def main():
     coordinator = DistCoordinator()
 
     # Set up moe
-    MOE_MANAGER.setup(seed=42, parallel="EP")
+    MOE_MANAGER.setup(seed=42, parallel=None)
 
     # Manage loggers
     disable_existing_loggers()
@@ -129,12 +141,23 @@ def main():
 
     # Set plugin
     booster_kwargs = {}
-    plugin = LowLevelZeroPlugin(initial_scale=2**5, stage=2)
+    if args.plugin == "zero1":
+        plugin = LowLevelZeroPlugin(initial_scale=2**5, stage=1)
+    elif args.plugin == "zero2":
+        plugin = LowLevelZeroPlugin(initial_scale=2**5, stage=2)
+    elif args.plugin == "hybrid":
+        plugin = HybridParallelPlugin(tp_size=args.tp_size,
+                                      pp_size=args.pp_size,
+                                      zero_stage=args.zero_stage,
+                                      microbatch_size=args.microbatch_size,
+                                      custom_policy=OpenMoeForCausalLMPolicy())
+    else:
+        raise ValueError(f"Invalid plugin {args.plugin}")
     logger.info(f"Set plugin as {plugin}", ranks=[0])
 
     # Prepare tokenizer and dataloader
     tokenizer = T5Tokenizer.from_pretrained("google/umt5-small")
-    dataset = RandomDataset(num_samples=1000 if args.model_name != "test" else 1)
+    dataset = RandomDataset(num_samples=1000 if args.model_name != "test" else 10)
     dataloader = plugin.prepare_dataloader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True)
 
     # Set optimizer
@@ -143,27 +166,43 @@ def main():
     # Set booster
     booster = Booster(plugin=plugin, **booster_kwargs)
     model, optimizer, _, dataloader, _ = booster.boost(model=model, optimizer=optimizer, dataloader=dataloader)
+    use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
+    is_pp_last_stage = use_pipeline and booster.plugin.stage_manager.is_last_stage()
     logger.info(f"Finish init booster", ranks=[0])
 
     # Start finetuning
     logger.info(f"Start finetuning", ranks=[0])
     for epoch in range(args.num_epoch):
         model.train()
-        with tqdm(dataloader, desc=f'Epoch [{epoch + 1}]', disable=not coordinator.is_master()) as pbar:
-            for batch in pbar:
-                # Forward
-                optimizer.zero_grad()
-                batch = move_to_cuda(batch, torch.cuda.current_device())
+        train_dataloader_iter = iter(dataloader)
+        total_len = len(train_dataloader_iter)
+        with tqdm(range(total_len),
+                  desc=f'Epoch [{epoch + 1}/{args.num_epoch}]',
+                  disable=not (coordinator.is_master() or is_pp_last_stage)) as pbar:
+            # Forward pass
+            for _ in pbar:
+                if use_pipeline:
+                    outputs = booster.execute_pipeline(train_dataloader_iter,
+                                                       model,
+                                                       lambda x: x,
+                                                       optimizer,
+                                                       return_loss=True,
+                                                       return_outputs=True)
+                    # Backward and optimize
+                    if is_pp_last_stage:
+                        loss = outputs['loss']
+                        pbar.set_postfix({'loss': loss.item()})
+                else:
+                    data = next(train_dataloader_iter)
+                    data = move_to_cuda(data, torch.cuda.current_device())
+                    outputs = model(**data)
+                    loss = outputs['loss']
+                    # Backward
+                    booster.backward(loss, optimizer)
+                    pbar.set_postfix({'loss': loss.item()})
 
-                outputs = model(use_cache=False, chunk_head=True, **batch)
-                loss = outputs['loss']
-
-                # Backward
-                booster.backward(loss, optimizer)
                 optimizer.step()
-
-                # Print batch loss
-                pbar.set_postfix({'loss': loss.item()})
+                optimizer.zero_grad()
 
     # Finish training and evaluate
     logger.info(f"Finish finetuning", ranks=[0])

From af224af2731cdd1a6db48a532916e8e3ded2bbdb Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Fri, 15 Sep 2023 13:11:22 +0800
Subject: [PATCH 04/23] finish pp

---
 .../openmoe/model/modeling_openmoe.py         |  5 +-
 .../language/openmoe/model/openmoe_policy.py  | 66 +++++++++++--------
 examples/language/openmoe/train.py            | 11 ++--
 3 files changed, 45 insertions(+), 37 deletions(-)

diff --git a/examples/language/openmoe/model/modeling_openmoe.py b/examples/language/openmoe/model/modeling_openmoe.py
index 7d95fedce26e..d8289b791dd5 100644
--- a/examples/language/openmoe/model/modeling_openmoe.py
+++ b/examples/language/openmoe/model/modeling_openmoe.py
@@ -914,8 +914,9 @@ def _reorder_cache(past_key_values, beam_idx):
                 past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),)
         return reordered_past
 
-    def _calculate_router_loss(self):
-        aux_loss, z_loss = MOE_MANAGER.get_loss()
+    def _calculate_router_loss(self, aux_loss: list = None, z_loss: list = None):
+        if aux_loss is None or z_loss is None:
+            aux_loss, z_loss = MOE_MANAGER.get_loss()
         assert len(aux_loss) == len(z_loss) == self.config.num_hidden_layers // self.config.moe_layer_interval
         aux_loss = self.config.router_aux_loss_factor * sum(aux_loss) / len(aux_loss)
         z_loss = self.config.router_z_loss_factor * sum(z_loss) / len(z_loss)
diff --git a/examples/language/openmoe/model/openmoe_policy.py b/examples/language/openmoe/model/openmoe_policy.py
index df82e6deb721..21e25bcb73a0 100644
--- a/examples/language/openmoe/model/openmoe_policy.py
+++ b/examples/language/openmoe/model/openmoe_policy.py
@@ -130,11 +130,10 @@ def __init__(self) -> None:
 
     def module_policy(self):
         policy = super().module_policy()
-        from transformers.models.llama.modeling_llama import LlamaModel
         if self.pipeline_stage_manager:
             # set None as default
-            self.set_pipeline_forward(model_cls=LlamaModel,
-                                      new_forward=OpenMoePipelineForwards.llama_model_forward,
+            self.set_pipeline_forward(model_cls=OpenMoeModel,
+                                      new_forward=OpenMoePipelineForwards.openmoe_model_forward,
                                       policy=policy)
         return policy
 
@@ -201,7 +200,7 @@ class OpenMoePipelineForwards:
     '''
 
     @staticmethod
-    def llama_model_forward(
+    def openmoe_model_forward(
         self: OpenMoeModel,
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
@@ -215,7 +214,12 @@ def llama_model_forward(
         stage_manager: Optional[PipelineStageManager] = None,
         hidden_states: Optional[torch.FloatTensor] = None,
         stage_index: Optional[List[int]] = None,
+        past_router_aux_loss: Optional[torch.FloatTensor] = None,
+        past_router_z_loss: Optional[torch.FloatTensor] = None,
     ):
+        # reset moe loss for different data
+        MOE_MANAGER.reset_loss()
+
         logger = logging.get_logger(__name__)
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -339,17 +343,17 @@ def custom_forward(*inputs):
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
         next_cache = next_decoder_cache if use_cache else None
+
+        # concat past losses with current ones
+        router_aux_loss, router_z_loss = MOE_MANAGER.get_loss()
+        if past_router_aux_loss is not None and past_router_z_loss is not None:
+            router_aux_loss = past_router_aux_loss + router_aux_loss
+            router_z_loss = past_router_z_loss + router_z_loss
+
         if stage_manager.is_last_stage():
-            if not return_dict:
-                return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
-            return BaseModelOutputWithPast(
-                last_hidden_state=hidden_states,
-                past_key_values=next_cache,
-                hidden_states=all_hidden_states,
-                attentions=all_self_attns,
-            )
+            return tuple([hidden_states, next_cache, all_hidden_states, all_self_attns, router_aux_loss, router_z_loss])
         # always return dict for imediate stage
-        return {'hidden_states': hidden_states}
+        return {'hidden_states': hidden_states, 'router_aux_loss': router_aux_loss, 'router_z_loss': router_z_loss}
 
     @staticmethod
     def llama_for_causal_lm_forward(
@@ -368,6 +372,8 @@ def llama_for_causal_lm_forward(
         hidden_states: Optional[torch.FloatTensor] = None,
         stage_index: Optional[List[int]] = None,
         chunk_head: Optional[bool] = None,
+        past_router_aux_loss: Optional[torch.FloatTensor] = None,
+        past_router_z_loss: Optional[torch.FloatTensor] = None,
     ):
         r"""
             Args:
@@ -394,9 +400,6 @@ def llama_for_causal_lm_forward(
             >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
             "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
             ```"""
-        # reset moe loss
-        MOE_MANAGER.reset_loss()
-
         logger = logging.get_logger(__name__)
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (output_hidden_states
@@ -412,7 +415,7 @@ def llama_for_causal_lm_forward(
             output_hidden_states = False
 
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-        outputs = OpenMoePipelineForwards.llama_model_forward(
+        outputs = OpenMoePipelineForwards.openmoe_model_forward(
             self.model,
             input_ids=input_ids,
             attention_mask=attention_mask,
@@ -426,14 +429,13 @@ def llama_for_causal_lm_forward(
             stage_manager=stage_manager,
             hidden_states=hidden_states,
             stage_index=stage_index,
+            past_router_aux_loss=past_router_aux_loss,
+            past_router_z_loss=past_router_z_loss,
         )
-        past_key_values = None
-        all_hidden_states = None
-        all_self_attentions = None
-        all_cross_attentions = None
 
         if stage_manager.is_last_stage():
-            hidden_states = outputs[0]
+            hidden_states, past_key_values, all_hidden_states, attentions, router_aux_loss, router_z_loss = outputs
+
             if self.pretraining_tp > 1:
                 lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.pretraining_tp, dim=0)
                 logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.pretraining_tp)]
@@ -464,7 +466,7 @@ def custom_forward(*inputs):
 
                         return custom_forward
 
-                    aux_loss, z_loss = self._calculate_router_loss()
+                    aux_loss, z_loss = self._calculate_router_loss(router_aux_loss, router_z_loss)
                     loss = aux_loss + z_loss
                     for batch_idx in range(hidden_states.shape[0]):
                         loss = loss + torch.utils.checkpoint.checkpoint(
@@ -480,7 +482,7 @@ def custom_forward(*inputs):
                     shift_logits = logits[..., :-1, :].contiguous()
                     shift_labels = labels[..., 1:].contiguous()
                     # Flatten the tokens
-                    aux_loss, z_loss = self._calculate_router_loss()
+                    aux_loss, z_loss = self._calculate_router_loss(router_aux_loss, router_z_loss)
                     loss = aux_loss + z_loss
                     loss = loss + self._calculate_loss(shift_logits, shift_labels)
 
@@ -491,10 +493,16 @@ def custom_forward(*inputs):
             return CausalLMOutputWithPast(
                 loss=loss,
                 logits=logits,
-                past_key_values=outputs.past_key_values,
-                hidden_states=outputs.hidden_states,
-                attentions=outputs.attentions,
+                past_key_values=past_key_values,
+                hidden_states=all_hidden_states,
+                attentions=attentions,
             )
         else:
-            hidden_states = outputs.get('hidden_states')
-            return {'hidden_states': hidden_states}
+            hidden_states = outputs['hidden_states']
+            router_aux_loss = outputs['router_aux_loss']
+            router_z_loss = outputs['router_z_loss']
+            return {
+                'hidden_states': hidden_states,
+                'past_router_aux_loss': router_aux_loss,
+                'past_router_z_loss': router_z_loss
+            }
diff --git a/examples/language/openmoe/train.py b/examples/language/openmoe/train.py
index 3ce97841730a..6351d26ca0a1 100644
--- a/examples/language/openmoe/train.py
+++ b/examples/language/openmoe/train.py
@@ -78,7 +78,7 @@ def parse_args():
     parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.")
     parser.add_argument("--plugin",
                         type=str,
-                        default="zero2",
+                        default="hybrid",
                         help="parallel plugin",
                         choices=["zero1", "zero2", "hybrid"])
     # hybrid plugin
@@ -157,7 +157,7 @@ def main():
 
     # Prepare tokenizer and dataloader
     tokenizer = T5Tokenizer.from_pretrained("google/umt5-small")
-    dataset = RandomDataset(num_samples=1000 if args.model_name != "test" else 10)
+    dataset = RandomDataset(num_samples=1000 if args.model_name != "test" else 50)
     dataloader = plugin.prepare_dataloader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True)
 
     # Set optimizer
@@ -176,15 +176,14 @@ def main():
         model.train()
         train_dataloader_iter = iter(dataloader)
         total_len = len(train_dataloader_iter)
-        with tqdm(range(total_len),
-                  desc=f'Epoch [{epoch + 1}/{args.num_epoch}]',
-                  disable=not (coordinator.is_master() or is_pp_last_stage)) as pbar:
+        with tqdm(range(total_len), desc=f'Epoch [{epoch + 1}/{args.num_epoch}]',
+                  disable=not coordinator.is_master()) as pbar:
             # Forward pass
             for _ in pbar:
                 if use_pipeline:
                     outputs = booster.execute_pipeline(train_dataloader_iter,
                                                        model,
-                                                       lambda x: x,
+                                                       lambda x, y: x.loss,
                                                        optimizer,
                                                        return_loss=True,
                                                        return_outputs=True)

From e275b0967e6259cec6bdd5f0fb762020893d38d2 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 18 Sep 2023 09:57:32 +0800
Subject: [PATCH 05/23] update script

---
 examples/language/openmoe/train.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/language/openmoe/train.sh b/examples/language/openmoe/train.sh
index 9a55779ca5ef..a2fe425c5805 100644
--- a/examples/language/openmoe/train.sh
+++ b/examples/language/openmoe/train.sh
@@ -1,3 +1,3 @@
-torchrun --standalone --nproc_per_node 2 train.py \
+torchrun --standalone --nproc_per_node 4 train.py \
     --model_name "base" \
     --batch_size 4

From dd6da186a62df3aaaae005bab24a372f3d5c88ad Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 18 Sep 2023 10:54:34 +0800
Subject: [PATCH 06/23] update plugin

---
 .../plugin/moe_hybrid_parallel_plugin.py      | 527 ++++++++++++++++++
 1 file changed, 527 insertions(+)
 create mode 100644 colossalai/booster/plugin/moe_hybrid_parallel_plugin.py

diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
new file mode 100644
index 000000000000..d65bd437962e
--- /dev/null
+++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -0,0 +1,527 @@
+import random
+from contextlib import nullcontext
+from functools import partial
+from typing import Any, Callable, Iterator, List, Optional, OrderedDict, Tuple, Union
+
+import numpy as np
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+from torch.nn import Module, SyncBatchNorm
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
+from torch.utils._pytree import tree_map
+from torch.utils.data import DataLoader
+from torch.utils.data.distributed import DistributedSampler
+
+from colossalai.amp.naive_amp.mixed_precision_optimizer import MixedPrecisionOptimizer
+from colossalai.checkpoint_io import CheckpointIO, HybridParallelCheckpointIO
+from colossalai.cluster import ProcessGroupMesh
+from colossalai.interface import ModelWrapper, OptimizerWrapper
+from colossalai.pipeline.schedule import OneForwardOneBackwardSchedule
+from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.shardformer import ShardConfig, ShardFormer
+from colossalai.shardformer.policies.base_policy import Policy
+from colossalai.zero.low_level import LowLevelZeroOptimizer
+
+from .pp_plugin_base import PipelinePluginBase
+
+DP_AXIS, PP_AXIS, TP_AXIS = 0, 1, 2
+
+
+def _convert_floating_point(x, dtype: torch.dtype = torch.float16):
+    if isinstance(x, torch.Tensor) and torch.is_floating_point(x):
+        return x.to(dtype)
+    return x
+
+
+class HybridParallelModule(ModelWrapper):
+
+    def __init__(self, module: Module, precision: str, shard_config: ShardConfig, dp_group: ProcessGroup, use_ddp: bool,
+                 ddp_config: dict, custom_policy: Policy) -> None:
+
+        self.stage_manager = shard_config.pipeline_stage_manager
+        self.dp_group = dp_group
+
+        shardformer = ShardFormer(shard_config)
+        if custom_policy is not None:
+            assert isinstance(custom_policy, object)
+        module, self.shared_params = shardformer.optimize(module, policy=custom_policy)
+
+        # setting process groups for shared parameters
+        self.shared_param_process_groups = []
+        for shared_param in self.shared_params:
+            if len(shared_param) > 0:
+                self.shared_param_process_groups.append(
+                    self.stage_manager.init_process_group_by_stages(list(shared_param.keys())))
+
+        # setting mixed_precision
+        self.mixed_precision = None
+        if precision == 'fp16':
+            self.mixed_precision = torch.float16
+        elif precision == 'bf16':
+            self.mixed_precision = torch.bfloat16
+        if self.mixed_precision is not None:
+            module = module.to(self.mixed_precision)
+        module = module.cuda()
+
+        # setting input type cast when using mixed precision
+        self.convert_fn = None
+        if self.mixed_precision is not None:
+            self.convert_fn = partial(_convert_floating_point, dtype=self.mixed_precision)
+
+        # setting ddp configs
+        if use_ddp:
+            # convert model to sync bn
+            module = SyncBatchNorm.convert_sync_batchnorm(module, dp_group)
+            # wrap the model with PyTorch DDP
+            module = DDP(module, process_group=dp_group, **ddp_config)
+
+        super().__init__(module)
+
+    def sync_shared_params(self):
+        for shared_param, group in zip(self.shared_params, self.shared_param_process_groups):
+            if self.stage_manager.stage in shared_param:
+                param = shared_param[self.stage_manager.stage]
+                dist.all_reduce(param.grad, group=group)
+            dist.barrier()
+
+    def no_sync(self) -> Iterator[None]:
+        # no sync grads across data parallel
+        return nullcontext()
+
+    def sync_grads(self):
+        # sync grad across data parallel
+        if self.dp_group.size() == 1:
+            return
+        for p in self.module.parameters():
+            if p.grad is not None:
+                dist.all_reduce(p.grad, group=self.dp_group)
+                p.grad.div_(self.dp_group.size())
+
+    def forward(self, *args, **kwargs):
+        if self.convert_fn is not None:
+            args = tree_map(self.convert_fn, args)
+            kwargs = tree_map(self.convert_fn, kwargs)
+        return super().forward(*args, **kwargs)
+
+    def unwrap(self):
+        module = super().unwrap()
+        if isinstance(module, DDP):
+            module = module.module
+        return module
+
+
+def get_param_info(optim: Optimizer):
+    # Get a backup of necessary information of parameters for future use, which includes:
+    # 1. A complete param_group, with params in the form of param_id
+    # 2. A mapping from param address (obtained using id(param)) to integer param_id
+    # 3. A mapping from integer param_id to param address.
+    # 4. A mapping from param_address (obtained using id(param)) to the original shape of parameter before sharding.
+    # When Zero is used, the params here are fp16/bf16 model params rather than fp32 master params in optimizer.
+
+    if optim is None:
+        return {}
+    param_info = {'param_groups': [], 'param2id': {}, 'id2param': {}, 'param2shape': {}}
+    start_index = 0
+    for group in optim.param_groups:
+
+        packed_group = {k: v for k, v in group.items() if k != 'params'}
+        packed_group['params'] = []
+
+        for param_id, param in enumerate(group['params'], start_index):
+            original_shape = param.shape if isinstance(param, torch.Tensor) else None
+            packed_group['params'].append(param_id)
+            param_info['param2id'][id(param)] = param_id
+            param_info['id2param'][param_id] = id(param)
+            param_info['param2shape'][id(param)] = original_shape
+
+        param_info['param_groups'].append(packed_group)
+        start_index += len(group['params'])
+
+    return param_info
+
+
+def init_pipeline_optimizer(optim: Optimizer, model: Module):
+    model_params = set(model.parameters())
+    new_param_groups = []
+    for group in optim.param_groups:
+        params = [p for p in group['params'] if p in model_params]
+        new_param_groups.append({**group, 'params': params})
+    optim.__setstate__({'param_groups': new_param_groups})
+
+
+class HybridParallelNaiveOptimizer(OptimizerWrapper):
+
+    def __init__(self, optim: Optimizer, model: Module, use_pipeline: bool, param_info: OrderedDict):
+        self.param_info = param_info
+        if use_pipeline:
+            init_pipeline_optimizer(optim, model)
+        super().__init__(optim)
+
+
+class HybridParallelAMPOptimizer(MixedPrecisionOptimizer):
+
+    def __init__(self,
+                 optim: Optimizer,
+                 model: Module,
+                 use_pipeline: bool,
+                 param_info: OrderedDict,
+                 precision: str = 'fp16',
+                 initial_scale: float = 2**16,
+                 min_scale: float = 1,
+                 growth_factor: float = 2,
+                 backoff_factor: float = 0.5,
+                 growth_interval: int = 1000,
+                 hysteresis: int = 2,
+                 max_scale: float = 2**32,
+                 max_norm: float = 0):
+        self.param_info = param_info
+        if use_pipeline:
+            init_pipeline_optimizer(optim, model)
+        super().__init__(optim, precision, initial_scale, min_scale, growth_factor, backoff_factor, growth_interval,
+                         hysteresis, max_scale, max_norm)
+
+
+class HybridParallelZeroOptimizer(LowLevelZeroOptimizer):
+
+    def __init__(
+            self,
+            optimizer: Optimizer,
+            model: Module,
+            use_pipeline: bool,
+            param_info: OrderedDict,
+            initial_scale: int = 2**16,    # grad scaler config
+            min_scale: int = 1,
+            growth_factor: float = 2.,
+            backoff_factor: float = .5,
+            growth_interval: int = 2000,
+            hysteresis: int = 2,
+            max_scale: int = 2**24,
+            clip_grad_norm: float = 0.0,    # grad clipping
+            verbose: bool = False,
+            reduce_bucket_size: int = 1024 * 1024,    # communication
+            communication_dtype: Optional[torch.dtype] = None,
+            overlap_communication: bool = True,
+            partition_grad: bool = False,    # stage 2 flag
+            cpu_offload: bool = False,    # cpu offload
+            dp_process_group: Optional[ProcessGroup] = None,    # the dp pg for comm
+            tp_process_group: Optional[ProcessGroup] = None,    # if using tp
+            forced_dtype: Optional[torch.dtype] = None):
+        self.param_info = param_info
+        if use_pipeline:
+            init_pipeline_optimizer(optimizer, model)
+        super().__init__(optimizer, initial_scale, min_scale, growth_factor, backoff_factor, growth_interval,
+                         hysteresis, max_scale, clip_grad_norm, verbose, reduce_bucket_size, communication_dtype,
+                         overlap_communication, partition_grad, cpu_offload, dp_process_group, tp_process_group,
+                         forced_dtype)
+
+
+class HybridParallelPlugin(PipelinePluginBase):
+    """
+    Plugin for Hybrid Parallel Training.
+    Tensor parallel, pipeline parallel and data parallel(DDP/ZeRO) can be picked and combined in this plugin.
+    The size of tp and pp should be passed in by user, then the size of dp is automatically calculated from dp_size = world_size / (tp_size * pp_size).
+
+    Example:
+        >>> from colossalai.booster import Booster
+        >>> from colossalai.booster.plugin import HybridParallelPlugin
+
+        >>> model, train_dataset, optimizer, criterion = ...
+        >>> plugin =  HybridParallelPlugin(tp_size=2, pp_size=2)
+
+        >>> train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=8)
+        >>> booster = Booster(plugin=plugin)
+        >>> model, optimizer, criterion, train_dataloader, _ = booster.boost(model, optimizer, criterion, train_dataloader)
+
+    Args:
+        tp_size (int): The size of tensor parallelism. Tensor parallelism will not be used when tp_size is set to 1.
+        pp_size (int): The number of pipeline stages in pipeline parallelism. Pipeline parallelism will not be used when pp_size is set to 1.
+        precision (str, optional): Specifies the precision of parameters during training.
+                                    Auto-mixed precision will be used when this argument is set to 'fp16' or 'bf16', otherwise model is trained with 'fp32'.
+                                    Defaults to 'fp16'.
+        zero_stage (int, optional): The stage of ZeRO for data parallelism. Can only be chosen from [0, 1, 2].
+                                        When set to 0, ZeRO will not be used. Defaults to 0.
+        enable_all_optimization (bool, optional): Whether to switch on all the optimizations supported by Shardformer.
+                                                    Currently all the optimization methods include fused normalization, flash attention and JIT.
+                                                    Defaults to False.
+        enable_fused_normalization (bool, optional): Whether to switch on fused normalization in Shardformer. Defaults to False.
+        enable_flash_attention (bool, optional): Whether to switch on flash attention in Shardformer. Defaults to False.
+        enable_jit_fused (bool, optional): Whether to switch on JIT in Shardformer. Defaults to False.
+        enable_sequence_parallelism (bool): Whether to turn on sequence parallelism in Shardformer. Defaults to False.
+        enable_sequence_overlap (bool): Whether to turn on sequence overlap in Shardformer. Defaults to False.
+        num_microbatches (int, optional): Number of microbatches when using pipeline parallelism. Defaults to None.
+        microbatch_size (int, optional): Microbatch size when using pipeline parallelism.
+            Either ``num_microbatches`` or ``microbatch_size`` should be provided if using pipeline.
+            If ``num_microbatches`` is provided, this will be ignored. Defaults to None.
+        initial_scale (float, optional): The initial loss scale of AMP. Defaults to 2**16.
+        min_scale (float, optional): The minimum loss scale of AMP. Defaults to 1.
+        growth_factor (float, optional): The multiplication factor for increasing loss scale when using AMP. Defaults to 2.
+        backoff_factor (float, optional): The multiplication factor for decreasing loss scale when using AMP. Defaults to 0.5.
+        growth_interval (int, optional): The number of steps to increase loss scale when no overflow occurs when using AMP. Defaults to 1000.
+        hysteresis (int, optional):  The number of overflows before decreasing loss scale when using AMP. Defaults to 2.
+        max_scale (float, optional): The maximum loss scale of AMP. Defaults to 2**32.
+        max_norm (float, optional): Maximum norm for gradient clipping. Defaults to 0.
+        broadcast_buffers (bool, optional): Whether to broadcast buffers in the beginning of training when using DDP. Defaults to True.
+        ddp_bucket_cap_mb (int, optional): The bucket size in MB when using DDP. Defaults to 25.
+        find_unused_parameters (bool, optional): Whether to find unused parameters when using DDP. Defaults to False.
+        check_reduction (bool, optional): Whether to check reduction when using DDP. Defaults to False.
+        gradient_as_bucket_view (bool, optional): Whether to use gradient as bucket view when using DDP. Defaults to False.
+        static_graph (bool, optional): Whether to use static graph when using DDP. Defaults to False.
+        zero_bucket_size_in_m (int, optional): Gradient reduce bucket size in million elements when using ZeRO. Defaults to 12.
+        cpu_offload (bool, optional): Whether to open cpu_offload when using ZeRO. Defaults to False.
+        communication_dtype (torch.dtype, optional): Communication dtype when using ZeRO. If not specified, the dtype of param will be used. Defaults to None.
+        overlap_communication (bool, optional): Whether to overlap communication and computation when using ZeRO. Defaults to True.
+    """
+
+    def __init__(self,
+                 tp_size: int,
+                 pp_size: int,
+                 precision: str = 'fp16',
+                 zero_stage: int = 0,
+                 enable_all_optimization: bool = False,
+                 enable_fused_normalization: bool = False,
+                 enable_flash_attention: bool = False,
+                 enable_jit_fused: bool = False,
+                 enable_sequence_parallelism: bool = False,
+                 enable_sequence_overlap: bool = False,
+                 num_microbatches: Optional[int] = None,
+                 microbatch_size: Optional[int] = None,
+                 initial_scale: float = 2**16,
+                 min_scale: float = 1,
+                 growth_factor: float = 2,
+                 backoff_factor: float = 0.5,
+                 growth_interval: int = 1000,
+                 hysteresis: int = 2,
+                 max_scale: float = 2**32,
+                 max_norm: float = 0,
+                 broadcast_buffers: bool = True,
+                 ddp_bucket_cap_mb: int = 25,
+                 find_unused_parameters: bool = False,
+                 check_reduction: bool = False,
+                 gradient_as_bucket_view: bool = False,
+                 static_graph: bool = False,
+                 zero_bucket_size_in_m: int = 12,
+                 cpu_offload: bool = False,
+                 communication_dtype: Optional[torch.dtype] = None,
+                 overlap_communication: bool = True,
+                 custom_policy: Policy = None) -> None:
+
+        super().__init__()
+        assert dist.get_world_size() % (
+            tp_size * pp_size
+        ) == 0, f'world size {dist.get_world_size()} is not divisible by tp_size {tp_size} * pp_size {pp_size}'
+
+        if enable_sequence_parallelism:
+            assert tp_size > 1, 'Sequence parallelism must be enabled when using tensor parallelism'
+
+        self.tp_size = tp_size
+        self.pp_size = pp_size
+        self.dp_size = dist.get_world_size() // (tp_size * pp_size)
+        self.precision = precision
+        self.zero_stage = zero_stage
+        self.cpu_offload = cpu_offload
+        self.enable_all_optimization = enable_all_optimization
+        self.enable_fused_normalization = enable_fused_normalization
+        self.enable_flash_attention = enable_flash_attention
+        self.enable_jit_fused = enable_jit_fused
+        self.enable_sequence_parallelism = enable_sequence_parallelism
+        self.pg_mesh = ProcessGroupMesh(self.dp_size, self.pp_size, self.tp_size)
+        self.stage_manager = None
+        self.schedule = None
+        self.custom_policy = custom_policy
+        assert zero_stage in (0, 1, 2)
+        if self.pp_size > 1:
+            assert num_microbatches is not None or microbatch_size is not None, 'num_microbatches or microbatch_size must be specified when using pipeline parallelism'
+            assert self.zero_stage <= 1, 'zero stage must be 0 or 1 when using pipeline parallelism'
+            self.stage_manager = PipelineStageManager(self.pg_mesh, PP_AXIS)
+            self.schedule = OneForwardOneBackwardSchedule(self.stage_manager,
+                                                          num_microbatches=num_microbatches,
+                                                          microbatch_size=microbatch_size)
+        self.tp_group = self.pg_mesh.get_group_along_axis(TP_AXIS)
+        self.dp_group = self.pg_mesh.get_group_along_axis(DP_AXIS)
+        self.pp_group = self.pg_mesh.get_group_along_axis(PP_AXIS)
+        self.shard_config = ShardConfig(tensor_parallel_process_group=self.tp_group,
+                                        pipeline_stage_manager=self.stage_manager,
+                                        enable_tensor_parallelism=self.tp_size > 1,
+                                        enable_all_optimization=self.enable_all_optimization,
+                                        enable_fused_normalization=self.enable_fused_normalization,
+                                        enable_flash_attention=self.enable_flash_attention,
+                                        enable_jit_fused=self.enable_jit_fused,
+                                        enable_sequence_parallelism=enable_sequence_parallelism,
+                                        enable_sequence_overlap=enable_sequence_overlap)
+        self.amp_config = dict(
+            initial_scale=initial_scale,
+            growth_factor=growth_factor,
+            backoff_factor=backoff_factor,
+            growth_interval=growth_interval,
+            hysteresis=hysteresis,
+            min_scale=min_scale,
+            max_scale=max_scale,
+        )
+
+        self.ddp_config = dict(broadcast_buffers=broadcast_buffers,
+                               bucket_cap_mb=ddp_bucket_cap_mb,
+                               find_unused_parameters=find_unused_parameters,
+                               check_reduction=check_reduction,
+                               gradient_as_bucket_view=gradient_as_bucket_view,
+                               static_graph=static_graph)
+
+        self.zero_config = dict(reduce_bucket_size=zero_bucket_size_in_m * 1024 * 1024,
+                                communication_dtype=communication_dtype,
+                                overlap_communication=overlap_communication,
+                                cpu_offload=cpu_offload,
+                                partition_grad=(self.zero_stage == 2))
+
+        self.max_norm = max_norm
+
+    @property
+    def enable_pipeline_parallelism(self) -> bool:
+        return self.pp_size > 1
+
+    def supported_devices(self) -> List[str]:
+        return ['cuda']
+
+    def supported_precisions(self) -> List[str]:
+        return ['fp16', 'bf16', 'fp32']
+
+    def control_device(self) -> bool:
+        return True
+
+    def control_precision(self) -> bool:
+        return True
+
+    def support_no_sync(self) -> bool:
+        return False
+
+    def control_checkpoint_io(self) -> bool:
+        return True
+
+    def configure(
+        self,
+        model: Module,
+        optimizer: Optional[Optimizer] = None,
+        criterion: Optional[Callable] = None,
+        dataloader: Optional[DataLoader] = None,
+        lr_scheduler: Optional[LRScheduler] = None,
+    ) -> Tuple[Module, OptimizerWrapper, Callable, DataLoader, LRScheduler]:
+        param_info = get_param_info(optimizer)
+        if not isinstance(model, ModelWrapper):
+            use_ddp = self.dp_size > 1 and self.pp_size == 1 and self.zero_stage == 0
+            model = HybridParallelModule(model, self.precision, self.shard_config, self.dp_group, use_ddp,
+                                         self.ddp_config, self.custom_policy)
+        if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
+            if self.zero_stage == 0:
+                if self.precision in ['fp16', 'bf16']:
+                    optimizer = HybridParallelAMPOptimizer(optimizer,
+                                                           model,
+                                                           use_pipeline=self.enable_pipeline_parallelism,
+                                                           param_info=param_info,
+                                                           precision=self.precision,
+                                                           max_norm=self.max_norm,
+                                                           **self.amp_config)
+                    self.checkpoint_io.link_master_and_working_param(optimizer.working_to_master_map,
+                                                                     optimizer.master_to_working_map)
+                else:
+                    optimizer = HybridParallelNaiveOptimizer(optimizer,
+                                                             model,
+                                                             use_pipeline=self.enable_pipeline_parallelism,
+                                                             param_info=param_info)
+            else:
+                assert self.dp_size > 1, "Please use Zero when data parallel size is greater than 1."
+                assert self.precision != 'fp32', "Please set precision to 'fp16' or 'bf16' when using ZeRO."
+                optimizer = HybridParallelZeroOptimizer(optimizer,
+                                                        model,
+                                                        use_pipeline=self.enable_pipeline_parallelism,
+                                                        param_info=param_info,
+                                                        dp_process_group=self.dp_group,
+                                                        tp_process_group=self.tp_group,
+                                                        verbose=True,
+                                                        clip_grad_norm=self.max_norm,
+                                                        **self.zero_config,
+                                                        **self.amp_config)
+                self.checkpoint_io.link_master_and_working_param(optimizer._param_store.working_to_master_param,
+                                                                 optimizer._param_store.master_to_working_param)
+
+        return model, optimizer, criterion, dataloader, lr_scheduler
+
+    def execute_pipeline(self,
+                         data_iter: Iterator,
+                         model: HybridParallelModule,
+                         criterion: Callable[[Any, Any], torch.Tensor],
+                         optimizer: Optional[Union[HybridParallelNaiveOptimizer, HybridParallelAMPOptimizer,
+                                                   HybridParallelZeroOptimizer]] = None,
+                         return_loss: bool = True,
+                         return_outputs: bool = False) -> dict:
+        assert self.enable_pipeline_parallelism, 'pipeline parallelism is not enabled'
+        # return loss or outputs if needed
+        ctx = optimizer.no_sync() if isinstance(optimizer, HybridParallelZeroOptimizer) else model.no_sync()
+        with ctx:
+            outputs = self.schedule.forward_backward_step(model, data_iter, criterion, optimizer, return_loss,
+                                                          return_outputs)
+        model.sync_shared_params()
+        if isinstance(optimizer, HybridParallelZeroOptimizer):
+            optimizer.sync_grad()
+        else:
+            model.sync_grads()
+        return outputs
+
+    def prepare_dataloader(self,
+                           dataset,
+                           batch_size,
+                           shuffle=False,
+                           seed=1024,
+                           drop_last=False,
+                           pin_memory=False,
+                           num_workers=0,
+                           **kwargs):
+        r"""
+        Prepare a dataloader for distributed training. The dataloader will be wrapped by
+        `torch.utils.data.DataLoader` and `torch.utils.data.DistributedSampler`.
+
+
+        Args:
+            dataset (`torch.utils.data.Dataset`): The dataset to be loaded.
+            shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
+            seed (int, optional): Random worker seed for sampling, defaults to 1024.
+            batch_size (int): Number of samples per batch, passed to ``DataLoader``. A ``DistributedSampler`` is always attached.
+            drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
+                is not divisible by the batch size. If False and the size of dataset is not divisible by
+                the batch size, then the last batch will be smaller, defaults to False.
+            pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
+            num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0.
+            kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
+                    `DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.
+
+        Returns:
+            :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
+        """
+        _kwargs = kwargs.copy()
+        sampler = DistributedSampler(dataset,
+                                     num_replicas=self.pg_mesh.size(DP_AXIS),
+                                     rank=self.pg_mesh.coordinate(DP_AXIS),
+                                     shuffle=shuffle)
+
+        # Deterministic dataloader
+        def seed_worker(worker_id):
+            worker_seed = seed
+            np.random.seed(worker_seed)
+            torch.manual_seed(worker_seed)
+            random.seed(worker_seed)
+
+        return DataLoader(dataset,
+                          batch_size=batch_size,
+                          sampler=sampler,
+                          worker_init_fn=seed_worker,
+                          drop_last=drop_last,
+                          pin_memory=pin_memory,
+                          num_workers=num_workers,
+                          **_kwargs)
+
+    def get_checkpoint_io(self) -> CheckpointIO:
+        self.checkpoint_io = HybridParallelCheckpointIO(self.dp_group, self.pp_group, self.tp_group, self.zero_stage)
+        return self.checkpoint_io
+
+    def no_sync(self, model: Module) -> Iterator[None]:
+        raise NotImplementedError

From 0c2b3ef2b1b548934c3aebf61e6f01dc77258eee Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 18 Sep 2023 11:21:41 +0800
Subject: [PATCH 07/23] finish pp

---
 .../plugin/moe_hybrid_parallel_plugin.py      | 218 ++----------------
 colossalai/moe/manager.py                     |  84 +++++--
 colossalai/tensor/moe_tensor/api.py           |  10 +-
 colossalai/tensor/moe_tensor/moe_info.py      |  15 +-
 examples/language/openmoe/train.py            |  27 ++-
 5 files changed, 110 insertions(+), 244 deletions(-)

diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
index d65bd437962e..04ec0ce57cef 100644
--- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -1,226 +1,38 @@
 import random
-from contextlib import nullcontext
-from functools import partial
-from typing import Any, Callable, Iterator, List, Optional, OrderedDict, Tuple, Union
+from typing import Any, Callable, Iterator, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
 import torch.distributed as dist
-from torch.distributed import ProcessGroup
-from torch.nn import Module, SyncBatchNorm
-from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.nn import Module
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
-from torch.utils._pytree import tree_map
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
 
-from colossalai.amp.naive_amp.mixed_precision_optimizer import MixedPrecisionOptimizer
+from colossalai.booster.plugin.hybrid_parallel_plugin import (
+    HybridParallelAMPOptimizer,
+    HybridParallelModule,
+    HybridParallelNaiveOptimizer,
+    HybridParallelZeroOptimizer,
+    get_param_info,
+)
 from colossalai.checkpoint_io import CheckpointIO, HybridParallelCheckpointIO
 from colossalai.cluster import ProcessGroupMesh
 from colossalai.interface import ModelWrapper, OptimizerWrapper
 from colossalai.pipeline.schedule import OneForwardOneBackwardSchedule
 from colossalai.pipeline.stage_manager import PipelineStageManager
-from colossalai.shardformer import ShardConfig, ShardFormer
+from colossalai.shardformer import ShardConfig
 from colossalai.shardformer.policies.base_policy import Policy
-from colossalai.zero.low_level import LowLevelZeroOptimizer
 
 from .pp_plugin_base import PipelinePluginBase
 
-DP_AXIS, PP_AXIS, TP_AXIS = 0, 1, 2
-
-
-def _convert_floating_point(x, dtype: torch.dtype = torch.float16):
-    if isinstance(x, torch.Tensor) and torch.is_floating_point(x):
-        return x.to(dtype)
-    return x
-
-
-class HybridParallelModule(ModelWrapper):
-
-    def __init__(self, module: Module, precision: str, shard_config: ShardConfig, dp_group: ProcessGroup, use_ddp: bool,
-                 ddp_config: dict, custom_policy: Policy) -> None:
-
-        self.stage_manager = shard_config.pipeline_stage_manager
-        self.dp_group = dp_group
-
-        shardformer = ShardFormer(shard_config)
-        if custom_policy is not None:
-            assert isinstance(custom_policy, object)
-        module, self.shared_params = shardformer.optimize(module, policy=custom_policy)
-
-        # setting process groups for shared parameters
-        self.shared_param_process_groups = []
-        for shared_param in self.shared_params:
-            if len(shared_param) > 0:
-                self.shared_param_process_groups.append(
-                    self.stage_manager.init_process_group_by_stages(list(shared_param.keys())))
-
-        # setting mixed_precision
-        self.mixed_precision = None
-        if precision == 'fp16':
-            self.mixed_precision = torch.float16
-        elif precision == 'bf16':
-            self.mixed_precision = torch.bfloat16
-        if self.mixed_precision is not None:
-            module = module.to(self.mixed_precision)
-        module = module.cuda()
-
-        # setting input type cast when using mixed precision
-        self.convert_fn = None
-        if self.mixed_precision is not None:
-            self.convert_fn = partial(_convert_floating_point, dtype=self.mixed_precision)
-
-        # setting ddp configs
-        if use_ddp:
-            # convert model to sync bn
-            module = SyncBatchNorm.convert_sync_batchnorm(module, dp_group)
-            # wrap the model with PyTorch DDP
-            module = DDP(module, process_group=dp_group, **ddp_config)
-
-        super().__init__(module)
-
-    def sync_shared_params(self):
-        for shared_param, group in zip(self.shared_params, self.shared_param_process_groups):
-            if self.stage_manager.stage in shared_param:
-                param = shared_param[self.stage_manager.stage]
-                dist.all_reduce(param.grad, group=group)
-            dist.barrier()
-
-    def no_sync(self) -> Iterator[None]:
-        # no sync grads across data parallel
-        return nullcontext()
-
-    def sync_grads(self):
-        # sync grad across data parallel
-        if self.dp_group.size() == 1:
-            return
-        for p in self.module.parameters():
-            if p.grad is not None:
-                dist.all_reduce(p.grad, group=self.dp_group)
-                p.grad.div_(self.dp_group.size())
-
-    def forward(self, *args, **kwargs):
-        if self.convert_fn is not None:
-            args = tree_map(self.convert_fn, args)
-            kwargs = tree_map(self.convert_fn, kwargs)
-        return super().forward(*args, **kwargs)
-
-    def unwrap(self):
-        module = super().unwrap()
-        if isinstance(module, DDP):
-            module = module.module
-        return module
-
-
-def get_param_info(optim: Optimizer):
-    # Get a backup of necessary information of parameters for future use, which includes:
-    # 1. A complete param_group, with params in the form of param_id
-    # 2. A mapping from param address (obtained using id(param)) to integer param_id
-    # 3. A mapping from integer param_id to param address.
-    # 4. A mapping from param_address (obtained using id(param)) to the original shape of parameter before sharding.
-    # When Zero is used, the params here are fp16/bf16 model params rather than fp32 master params in optimizer.
-
-    if optim is None:
-        return {}
-    param_info = {'param_groups': [], 'param2id': {}, 'id2param': {}, 'param2shape': {}}
-    start_index = 0
-    for group in optim.param_groups:
-
-        packed_group = {k: v for k, v in group.items() if k != 'params'}
-        packed_group['params'] = []
-
-        for param_id, param in enumerate(group['params'], start_index):
-            original_shape = param.shape if isinstance(param, torch.Tensor) else None
-            packed_group['params'].append(param_id)
-            param_info['param2id'][id(param)] = param_id
-            param_info['id2param'][param_id] = id(param)
-            param_info['param2shape'][id(param)] = original_shape
-
-        param_info['param_groups'].append(packed_group)
-        start_index += len(group['params'])
-
-    return param_info
-
-
-def init_pipeline_optimizer(optim: Optimizer, model: Module):
-    model_params = set(model.parameters())
-    new_param_groups = []
-    for group in optim.param_groups:
-        params = [p for p in group['params'] if p in model_params]
-        new_param_groups.append({**group, 'params': params})
-    optim.__setstate__({'param_groups': new_param_groups})
-
-
-class HybridParallelNaiveOptimizer(OptimizerWrapper):
-
-    def __init__(self, optim: Optimizer, model: Module, use_pipeline: bool, param_info: OrderedDict):
-        self.param_info = param_info
-        if use_pipeline:
-            init_pipeline_optimizer(optim, model)
-        super().__init__(optim)
-
-
-class HybridParallelAMPOptimizer(MixedPrecisionOptimizer):
+PP_AXIS, DP_AXIS, TP_AXIS = 0, 1, 2
 
-    def __init__(self,
-                 optim: Optimizer,
-                 model: Module,
-                 use_pipeline: bool,
-                 param_info: OrderedDict,
-                 precision: str = 'fp16',
-                 initial_scale: float = 2**16,
-                 min_scale: float = 1,
-                 growth_factor: float = 2,
-                 backoff_factor: float = 0.5,
-                 growth_interval: int = 1000,
-                 hysteresis: int = 2,
-                 max_scale: float = 2**32,
-                 max_norm: float = 0):
-        self.param_info = param_info
-        if use_pipeline:
-            init_pipeline_optimizer(optim, model)
-        super().__init__(optim, precision, initial_scale, min_scale, growth_factor, backoff_factor, growth_interval,
-                         hysteresis, max_scale, max_norm)
-
-
-class HybridParallelZeroOptimizer(LowLevelZeroOptimizer):
-
-    def __init__(
-            self,
-            optimizer: Optimizer,
-            model: Module,
-            use_pipeline: bool,
-            param_info: OrderedDict,
-            initial_scale: int = 2**16,    # grad scaler config
-            min_scale: int = 1,
-            growth_factor: float = 2.,
-            backoff_factor: float = .5,
-            growth_interval: int = 2000,
-            hysteresis: int = 2,
-            max_scale: int = 2**24,
-            clip_grad_norm: float = 0.0,    # grad clipping
-            verbose: bool = False,
-            reduce_bucket_size: int = 1024 * 1024,    # communication
-            communication_dtype: Optional[torch.dtype] = None,
-            overlap_communication: bool = True,
-            partition_grad: bool = False,    # stage 2 flag
-            cpu_offload: bool = False,    # cpu offload
-            dp_process_group: Optional[ProcessGroup] = None,    # the dp pg for comm
-            tp_process_group: Optional[ProcessGroup] = None,    # if using tp
-            forced_dtype: Optional[torch.dtype] = None):
-        self.param_info = param_info
-        if use_pipeline:
-            init_pipeline_optimizer(optimizer, model)
-        super().__init__(optimizer, initial_scale, min_scale, growth_factor, backoff_factor, growth_interval,
-                         hysteresis, max_scale, clip_grad_norm, verbose, reduce_bucket_size, communication_dtype,
-                         overlap_communication, partition_grad, cpu_offload, dp_process_group, tp_process_group,
-                         forced_dtype)
-
-
-class HybridParallelPlugin(PipelinePluginBase):
+
+class MoeHybridParallelPlugin(PipelinePluginBase):
     """
-    Plugin for Hybrid Parallel Training.
+    Plugin for Moe Hybrid Parallel Training.
     Tensor parallel, pipeline parallel and data parallel(DDP/ZeRO) can be picked and combined in this plugin.
     The size of tp and pp should be passed in by user, then the size of dp is automatically calculated from dp_size = world_size / (tp_size * pp_size).
 
@@ -327,7 +139,7 @@ def __init__(self,
         self.enable_flash_attention = enable_flash_attention
         self.enable_jit_fused = enable_jit_fused
         self.enable_sequence_parallelism = enable_sequence_parallelism
-        self.pg_mesh = ProcessGroupMesh(self.dp_size, self.pp_size, self.tp_size)
+        self.pg_mesh = ProcessGroupMesh(self.pp_size, self.dp_size, self.tp_size)
         self.stage_manager = None
         self.schedule = None
         self.custom_policy = custom_policy
diff --git a/colossalai/moe/manager.py b/colossalai/moe/manager.py
index 3dc27c6cb0f0..30f191a1de91 100644
--- a/colossalai/moe/manager.py
+++ b/colossalai/moe/manager.py
@@ -24,6 +24,7 @@ def __init__(self):
         self.router_z_loss = []
         self.parallel = None
         self.seed = None
+        self.mode = None
         self.use_kernel_optim = True
 
         self.has_setup = False
@@ -37,16 +38,50 @@ def parallel_info_dict(self):
     def is_initialized(self):
         return self.has_setup
 
-    def setup(self, seed: int, use_kernel_optim: bool = True, max_ep_size: int = 8, parallel: bool = None):
+    def setup(self,
+              seed: int,
+              use_kernel_optim: bool = True,
+              parallel: bool = None,
+              mode: str = "dynamic",
+              max_ep_size: int = 8,
+              fixed_dp_size: int = 0,
+              fixed_ep_size: int = 0,
+              fixed_pp_size: int = 0) -> None:
+        """
+        Setup MoE distributed context.
+
+        Args:
+            seed (int): Base random seed. The rank of the current process is added to it.
+            use_kernel_optim (bool, optional): Use cuda kernel. Defaults to True.
+            parallel (str, optional): Parallel mode, should be "EP", "TP" or None. Defaults to None.
+            mode (str, optional): Should be "fixed" or "dynamic". Defaults to "dynamic".
+                In fixed mode, the ep size, dp size and pp size are fixed.
+                In dynamic mode, the ep size and dp size are derived from the number of experts.
+            max_ep_size (int, optional): Max ep size in dynamic mode. Defaults to 8.
+            fixed_dp_size (int, optional): Fixed dp size in fixed mode. Defaults to 0.
+            fixed_ep_size (int, optional): Fixed ep size in fixed mode. Defaults to 0.
+            fixed_pp_size (int, optional): Fixed pp size in fixed mode. Defaults to 0.
+        """
         assert not self.is_initialized, "MoE distributed context shouldn't be set up again"
         assert torch.cuda.is_available(), "MoE requires to enable CUDA first"
 
         self.world_size = dist.get_world_size()
         self.seed = seed + dist.get_rank()
-        self.max_ep_size = min(max_ep_size, dist.get_world_size())
-        self.min_dp_size = self.world_size // self.max_ep_size
         self.parallel = parallel
 
+        # init by mode
+        assert mode in ["fixed", "dynamic"], "mode should be fixed or dynamic"
+        if mode == "dynamic":
+            self.max_ep_size = min(max_ep_size, dist.get_world_size())
+            self.min_dp_size = self.world_size // self.max_ep_size
+        else:
+            assert fixed_dp_size > 0 and fixed_ep_size > 0 and fixed_pp_size > 0, "dp_size, ep_size and pp_size should be greater than 0"
+            assert isinstance(fixed_dp_size, int) and isinstance(fixed_ep_size, int) and isinstance(
+                fixed_pp_size, int), "dp_size, ep_size and pp_size should be int"
+            self.ep_size = fixed_ep_size
+            self.dp_size = fixed_dp_size
+            self.pp_size = fixed_pp_size
+
         # Enabling kernel optimization may raise error in some cases
         # Users can close kernel optimization manually
         self.use_kernel_optim = use_kernel_optim
@@ -67,30 +102,39 @@ def get_info(self, num_experts: int, use_tp: bool = False) -> Tuple[int, MoePara
             number of local experts, the MoeParallelInfo of the current ep_size
         """
 
-        gt_flag = num_experts % self.max_ep_size == 0    # check whether num_experts is greater
-        lt_flag = self.max_ep_size % num_experts == 0    # check whether num_experts is less
-
-        assert gt_flag or lt_flag, "Automatic experts placement dose not not support expert number" \
-                                   " is not a multiple of ep size or vice versa."
-
-        # If the number of experts is greater than maximum expert parallel size. a.k.a ep_size,
-        # there are multiple experts in each GPU and each GPU has different experts
-        # So it's data parallel size is 1
-        # Otherwise, there is only one expert in each GPU
-        # The data parallel size should be calculated
-        dp_size = 1 if gt_flag else self.max_ep_size // num_experts
-        ep_size = self.max_ep_size // dp_size
+        if self.mode == "dynamic":
+            gt_flag = num_experts % self.max_ep_size == 0    # check whether num_experts is greater
+            lt_flag = self.max_ep_size % num_experts == 0    # check whether num_experts is less
+
+            assert gt_flag or lt_flag, "Automatic experts placement dose not not support expert number" \
+                                    " is not a multiple of ep size or vice versa."
+
+            # If the number of experts is greater than maximum expert parallel size. a.k.a ep_size,
+            # there are multiple experts in each GPU and each GPU has different experts
+            # So its data parallel size is 1
+            # Otherwise, there is only one expert in each GPU
+            # The data parallel size should be calculated
+            dp_size = 1 if gt_flag else self.max_ep_size // num_experts
+            ep_size = self.max_ep_size // dp_size
+            # Don't forget to multiply minimum data parallel size
+            dp_size *= self.min_dp_size
+            pp_size = None
+        else:
+            dp_size = self.dp_size
+            ep_size = self.ep_size
+            pp_size = self.pp_size
 
         # Calculate the number of experts for each GPU
         if use_tp:
             num_local_experts = num_experts
         else:
-            num_local_experts = 1 if lt_flag else num_experts // self.max_ep_size
+            if self.mode == "dynamic":
+                num_local_experts = 1 if lt_flag else num_experts // self.max_ep_size
+            else:
+                num_local_experts = num_experts // ep_size
 
-        # Don't forget to multiply minimum data parallel size
-        dp_size *= self.min_dp_size
         if not (ep_size in self.parallel_info_dict):
-            self.parallel_info_dict[ep_size] = get_moe_info(ep_size, dp_size)
+            self.parallel_info_dict[ep_size] = get_moe_info(ep_size, dp_size, pp_size)
 
         return num_local_experts, self.parallel_info_dict[ep_size]
 
diff --git a/colossalai/tensor/moe_tensor/api.py b/colossalai/tensor/moe_tensor/api.py
index 442b3c0f4958..fc4ed14e0ef7 100644
--- a/colossalai/tensor/moe_tensor/api.py
+++ b/colossalai/tensor/moe_tensor/api.py
@@ -28,20 +28,22 @@ def set_moe_tensor_info(tensor: torch.Tensor, moe_info: MoeParallelInfo) -> None
         moe_info (dict): The moe info to be set.
 
     """
-    tensor.__setattr__('moe_info', moe_info)
+    tensor.__setattr__("moe_info", moe_info)
 
 
-def get_moe_info(ep_size: int, dp_size: int) -> MoeParallelInfo:
+def get_moe_info(ep_size: int, dp_size: int, pp_size: int) -> MoeParallelInfo:
     """
     Get moe info for the given tensor.
 
     Args:
-        tensor (torch.Tensor): The tensor to be checked.
+        ep_size (int): The expert parallel size.
+        dp_size (int): The data parallel size.
+        pp_size (int): The pipeline parallel size.
 
     Returns:
         dict: The moe info of the given tensor.
     """
-    return MoeParallelInfo(ep_size, dp_size)
+    return MoeParallelInfo(ep_size, dp_size, pp_size)
 
 
 def get_ep_group(tensor: torch.Tensor) -> ProcessGroup:
diff --git a/colossalai/tensor/moe_tensor/moe_info.py b/colossalai/tensor/moe_tensor/moe_info.py
index ca7f163b9c24..2d3c2efbfb31 100644
--- a/colossalai/tensor/moe_tensor/moe_info.py
+++ b/colossalai/tensor/moe_tensor/moe_info.py
@@ -2,15 +2,14 @@
 
 
 class MoeParallelInfo:
-    """Moe parallelism information, storing parallel sizes and groups.
-    """
+    """Moe parallelism information, storing parallel sizes and groups."""
+
+    def __init__(self, ep_size: int, dp_size: int, pp_size: int = 1):
+        self.pp_axis, self.dp_axis, self.ep_axis = 0, 1, 2
+        self.pp_size, self.dp_size, self.ep_size = pp_size, dp_size, ep_size
+
+        self.pg = ProcessGroupMesh(self.pp_size, self.dp_size, self.ep_size)
 
-    def __init__(self, ep_size: int, dp_size: int):
-        self.dp_axis = 0
-        self.dp_size = dp_size
-        self.ep_axis = 1
-        self.ep_size = ep_size
-        self.pg = ProcessGroupMesh(self.dp_size, self.ep_size)
         self.ep_group = self.pg.get_group_along_axis(self.ep_axis)
         self.ep_group_ranks = self.pg.get_ranks_in_group(self.ep_group)
         self.dp_group = self.pg.get_group_along_axis(self.dp_axis)
diff --git a/examples/language/openmoe/train.py b/examples/language/openmoe/train.py
index 6351d26ca0a1..1bc19d3d726b 100644
--- a/examples/language/openmoe/train.py
+++ b/examples/language/openmoe/train.py
@@ -14,7 +14,8 @@
 import colossalai
 from colossalai import get_default_parser
 from colossalai.booster import Booster
-from colossalai.booster.plugin import HybridParallelPlugin, LowLevelZeroPlugin
+from colossalai.booster.plugin import LowLevelZeroPlugin
+from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.moe import MoeCheckpintIO
@@ -82,8 +83,9 @@ def parse_args():
                         help="parallel plugin",
                         choices=["zero1", "zero2", "hybrid"])
     # hybrid plugin
-    parser.add_argument("--tp_size", type=int, default=1, help="tp size")
     parser.add_argument("--pp_size", type=int, default=2, help="pp size")
+    parser.add_argument("--dp_size", type=int, default=1, help="dp size")
+    parser.add_argument("--ep_size", type=int, default=2, help="ep size")
     parser.add_argument("--zero_stage", type=int, default=1, help="zero stage in hybrid plugin")
     parser.add_argument("--microbatch_size", type=int, default=1, help="microbatch size")
     # loss
@@ -107,7 +109,14 @@ def main():
     coordinator = DistCoordinator()
 
     # Set up moe
-    MOE_MANAGER.setup(seed=42, parallel=None)
+    assert args.dp_size * args.ep_size * args.pp_size == coordinator.world_size, "dp_size * ep_size * pp_size must equal to world_size"
+    # MOE_MANAGER.setup(seed=42, parallel=None)
+    MOE_MANAGER.setup(seed=42,
+                      parallel="EP",
+                      mode="fixed",
+                      fixed_dp_size=args.dp_size,
+                      fixed_ep_size=args.ep_size,
+                      fixed_pp_size=args.pp_size)
 
     # Manage loggers
     disable_existing_loggers()
@@ -146,11 +155,11 @@ def main():
     elif args.plugin == "zero2":
         plugin = LowLevelZeroPlugin(initial_scale=2**5, stage=2)
     elif args.plugin == "hybrid":
-        plugin = HybridParallelPlugin(tp_size=args.tp_size,
-                                      pp_size=args.pp_size,
-                                      zero_stage=args.zero_stage,
-                                      microbatch_size=args.microbatch_size,
-                                      custom_policy=OpenMoeForCausalLMPolicy())
+        plugin = MoeHybridParallelPlugin(tp_size=1,
+                                         pp_size=args.pp_size,
+                                         zero_stage=args.zero_stage,
+                                         microbatch_size=args.microbatch_size,
+                                         custom_policy=OpenMoeForCausalLMPolicy())
     else:
         raise ValueError(f"Invalid plugin {args.plugin}")
     logger.info(f"Set plugin as {plugin}", ranks=[0])
@@ -166,7 +175,7 @@ def main():
     # Set booster
     booster = Booster(plugin=plugin, **booster_kwargs)
     model, optimizer, _, dataloader, _ = booster.boost(model=model, optimizer=optimizer, dataloader=dataloader)
-    use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
+    use_pipeline = isinstance(booster.plugin, MoeHybridParallelPlugin) and booster.plugin.pp_size > 1
     is_pp_last_stage = use_pipeline and booster.plugin.stage_manager.is_last_stage()
     logger.info(f"Finish init booster", ranks=[0])
 

From 58d99f08e9882031f94cd0088a68d36a048537b6 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 18 Sep 2023 11:31:27 +0800
Subject: [PATCH 08/23] update setup for different plugin

---
 examples/language/openmoe/train.py | 126 ++++++++++++++++++-----------
 1 file changed, 78 insertions(+), 48 deletions(-)

diff --git a/examples/language/openmoe/train.py b/examples/language/openmoe/train.py
index 1bc19d3d726b..ab72659eff27 100644
--- a/examples/language/openmoe/train.py
+++ b/examples/language/openmoe/train.py
@@ -54,34 +54,42 @@ def __len__(self):
 
     def __getitem__(self, idx):
         return {
-            'input_ids': self.input_ids[idx],
-            'attention_mask': self.attention_mask[idx],
-            'labels': self.input_ids[idx]
+            "input_ids": self.input_ids[idx],
+            "attention_mask": self.attention_mask[idx],
+            "labels": self.input_ids[idx],
         }
 
 
 def parse_args():
     # basic settings
     parser = get_default_parser()
-    parser.add_argument("--model_name",
-                        type=str,
-                        default="base",
-                        help="Path to pretrained model or model identifier from huggingface.co/models.")
-    parser.add_argument("--output_path",
-                        type=str,
-                        default="./output_model.bin",
-                        help="The path of your saved model after finetuning.")
+    parser.add_argument(
+        "--model_name",
+        type=str,
+        default="base",
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--output_path",
+        type=str,
+        default="./output_model.bin",
+        help="The path of your saved model after finetuning.",
+    )
     parser.add_argument("--num_epoch", type=int, default=10, help="Number of epochs.")
-    parser.add_argument("--batch_size",
-                        type=int,
-                        default=4,
-                        help="Batch size (per dp group) for the training dataloader.")
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=4,
+        help="Batch size (per dp group) for the training dataloader.",
+    )
     parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.")
-    parser.add_argument("--plugin",
-                        type=str,
-                        default="hybrid",
-                        help="parallel plugin",
-                        choices=["zero1", "zero2", "hybrid"])
+    parser.add_argument(
+        "--plugin",
+        type=str,
+        default="hybrid",
+        help="parallel plugin",
+        choices=["zero1", "zero2", "hybrid"],
+    )
     # hybrid plugin
     parser.add_argument("--pp_size", type=int, default=2, help="pp size")
     parser.add_argument("--dp_size", type=int, default=1, help="dp size")
@@ -89,8 +97,18 @@ def parse_args():
     parser.add_argument("--zero_stage", type=int, default=1, help="zero stage in hybrid plugin")
     parser.add_argument("--microbatch_size", type=int, default=1, help="microbatch size")
     # loss
-    parser.add_argument("--router_aux_loss_factor", type=float, default=0.01, help="router_aux_loss_factor.")
-    parser.add_argument("--router_z_loss_factor", type=float, default=0.0001, help="router_z_loss_factor.")
+    parser.add_argument(
+        "--router_aux_loss_factor",
+        type=float,
+        default=0.01,
+        help="router_aux_loss_factor.",
+    )
+    parser.add_argument(
+        "--router_z_loss_factor",
+        type=float,
+        default=0.0001,
+        help="router_z_loss_factor.",
+    )
     parser.add_argument("--label_smoothing", type=float, default=0.0, help="label_smoothing.")
     parser.add_argument("--z_loss_factor", type=float, default=0.0001, help="z_loss_factor.")
     # optim
@@ -109,14 +127,19 @@ def main():
     coordinator = DistCoordinator()
 
     # Set up moe
-    assert args.dp_size * args.ep_size * args.pp_size == coordinator.world_size, "dp_size * ep_size * pp_size must equal to world_size"
-    # MOE_MANAGER.setup(seed=42, parallel=None)
-    MOE_MANAGER.setup(seed=42,
-                      parallel="EP",
-                      mode="fixed",
-                      fixed_dp_size=args.dp_size,
-                      fixed_ep_size=args.ep_size,
-                      fixed_pp_size=args.pp_size)
+    assert (args.dp_size * args.ep_size *
+            args.pp_size == coordinator.world_size), "dp_size * ep_size * pp_size must equal to world_size"
+    if args.plugin in ["zero1", "zero2"]:
+        MOE_MANAGER.setup(seed=42, parallel="EP")
+    elif args.plugin == "hybrid":
+        MOE_MANAGER.setup(
+            seed=42,
+            parallel="EP",
+            mode="fixed",
+            fixed_dp_size=args.dp_size,
+            fixed_ep_size=args.ep_size,
+            fixed_pp_size=args.pp_size,
+        )
 
     # Manage loggers
     disable_existing_loggers()
@@ -155,11 +178,13 @@ def main():
     elif args.plugin == "zero2":
         plugin = LowLevelZeroPlugin(initial_scale=2**5, stage=2)
     elif args.plugin == "hybrid":
-        plugin = MoeHybridParallelPlugin(tp_size=1,
-                                         pp_size=args.pp_size,
-                                         zero_stage=args.zero_stage,
-                                         microbatch_size=args.microbatch_size,
-                                         custom_policy=OpenMoeForCausalLMPolicy())
+        plugin = MoeHybridParallelPlugin(
+            tp_size=1,
+            pp_size=args.pp_size,
+            zero_stage=args.zero_stage,
+            microbatch_size=args.microbatch_size,
+            custom_policy=OpenMoeForCausalLMPolicy(),
+        )
     else:
         raise ValueError(f"Invalid plugin {args.plugin}")
     logger.info(f"Set plugin as {plugin}", ranks=[0])
@@ -175,7 +200,7 @@ def main():
     # Set booster
     booster = Booster(plugin=plugin, **booster_kwargs)
     model, optimizer, _, dataloader, _ = booster.boost(model=model, optimizer=optimizer, dataloader=dataloader)
-    use_pipeline = isinstance(booster.plugin, MoeHybridParallelPlugin) and booster.plugin.pp_size > 1
+    use_pipeline = (isinstance(booster.plugin, MoeHybridParallelPlugin) and booster.plugin.pp_size > 1)
     is_pp_last_stage = use_pipeline and booster.plugin.stage_manager.is_last_stage()
     logger.info(f"Finish init booster", ranks=[0])
 
@@ -185,29 +210,34 @@ def main():
         model.train()
         train_dataloader_iter = iter(dataloader)
         total_len = len(train_dataloader_iter)
-        with tqdm(range(total_len), desc=f'Epoch [{epoch + 1}/{args.num_epoch}]',
-                  disable=not coordinator.is_master()) as pbar:
+        with tqdm(
+                range(total_len),
+                desc=f"Epoch [{epoch + 1}/{args.num_epoch}]",
+                disable=not coordinator.is_master(),
+        ) as pbar:
             # Forward pass
             for _ in pbar:
                 if use_pipeline:
-                    outputs = booster.execute_pipeline(train_dataloader_iter,
-                                                       model,
-                                                       lambda x, y: x.loss,
-                                                       optimizer,
-                                                       return_loss=True,
-                                                       return_outputs=True)
+                    outputs = booster.execute_pipeline(
+                        train_dataloader_iter,
+                        model,
+                        lambda x, y: x.loss,
+                        optimizer,
+                        return_loss=True,
+                        return_outputs=True,
+                    )
                     # Backward and optimize
                     if is_pp_last_stage:
-                        loss = outputs['loss']
-                        pbar.set_postfix({'loss': loss.item()})
+                        loss = outputs["loss"]
+                        pbar.set_postfix({"loss": loss.item()})
                 else:
                     data = next(train_dataloader_iter)
                     data = move_to_cuda(data, torch.cuda.current_device())
                     outputs = model(**data)
-                    loss = outputs['loss']
+                    loss = outputs["loss"]
                     # Backward
                     booster.backward(loss, optimizer)
-                    pbar.set_postfix({'loss': loss.item()})
+                    pbar.set_postfix({"loss": loss.item()})
 
                 optimizer.step()
                 optimizer.zero_grad()

From e67b3aec08b2ba87824f3f500ccdbadba44adf64 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 18 Sep 2023 12:33:33 +0800
Subject: [PATCH 09/23] update ci

---
 colossalai/moe/manager.py            | 7 ++++---
 examples/language/openmoe/test_ci.sh | 2 +-
 examples/language/openmoe/train.py   | 4 ++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/colossalai/moe/manager.py b/colossalai/moe/manager.py
index 30f191a1de91..0f2964cb1076 100644
--- a/colossalai/moe/manager.py
+++ b/colossalai/moe/manager.py
@@ -70,8 +70,9 @@ def setup(self,
         self.parallel = parallel
 
         # init by mode
-        assert mode in ["fixed", "dynamic"], "mode should be fixed or dynamic"
-        if mode == "dynamic":
+        self.mode = mode
+        assert self.mode in ["fixed", "dynamic"], "mode should be fixed or dynamic"
+        if self.mode == "dynamic":
             self.max_ep_size = min(max_ep_size, dist.get_world_size())
             self.min_dp_size = self.world_size // self.max_ep_size
         else:
@@ -118,7 +119,7 @@ def get_info(self, num_experts: int, use_tp: bool = False) -> Tuple[int, MoePara
             ep_size = self.max_ep_size // dp_size
             # Don't forget to multiply minimum data parallel size
             dp_size *= self.min_dp_size
-            pp_size = None
+            pp_size = 1
         else:
             dp_size = self.dp_size
             ep_size = self.ep_size
diff --git a/examples/language/openmoe/test_ci.sh b/examples/language/openmoe/test_ci.sh
index 75eee902c747..8361b66c50d1 100644
--- a/examples/language/openmoe/test_ci.sh
+++ b/examples/language/openmoe/test_ci.sh
@@ -2,4 +2,4 @@ set -xe
 pip install -r requirements.txt
 
 python infer.py --model "test"
-torchrun --standalone --nproc_per_node 2 train.py --model_name "test" --batch_size 1 --num_epoch 20
+torchrun --standalone --nproc_per_node 2 train.py --model_name "test" --batch_size 1 --num_epoch 1 --plugin zero2
diff --git a/examples/language/openmoe/train.py b/examples/language/openmoe/train.py
index ab72659eff27..ad20920eb24b 100644
--- a/examples/language/openmoe/train.py
+++ b/examples/language/openmoe/train.py
@@ -127,11 +127,11 @@ def main():
     coordinator = DistCoordinator()
 
     # Set up moe
-    assert (args.dp_size * args.ep_size *
-            args.pp_size == coordinator.world_size), "dp_size * ep_size * pp_size must equal to world_size"
     if args.plugin in ["zero1", "zero2"]:
         MOE_MANAGER.setup(seed=42, parallel="EP")
     elif args.plugin == "hybrid":
+        assert (args.dp_size * args.ep_size *
+                args.pp_size == coordinator.world_size), "dp_size * ep_size * pp_size must equal to world_size"
         MOE_MANAGER.setup(
             seed=42,
             parallel="EP",

From 89e8f99b1acb3a2c48421aaf13956016740113fc Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 18 Sep 2023 13:58:50 +0800
Subject: [PATCH 10/23] update ci

---
 colossalai/moe/manager.py            | 2 +-
 examples/language/openmoe/test_ci.sh | 1 +
 examples/language/openmoe/train.py   | 7 ++++++-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/colossalai/moe/manager.py b/colossalai/moe/manager.py
index 0f2964cb1076..d05ee4c25450 100644
--- a/colossalai/moe/manager.py
+++ b/colossalai/moe/manager.py
@@ -25,7 +25,7 @@ def __init__(self):
         self.parallel = None
         self.seed = None
         self.mode = None
-        self.use_kernel_optim = True
+        self.use_kernel_optim = False
 
         self.has_setup = False
         self._parallel_info_dict = dict()
diff --git a/examples/language/openmoe/test_ci.sh b/examples/language/openmoe/test_ci.sh
index 8361b66c50d1..c555c3e5b116 100644
--- a/examples/language/openmoe/test_ci.sh
+++ b/examples/language/openmoe/test_ci.sh
@@ -3,3 +3,4 @@ pip install -r requirements.txt
 
 python infer.py --model "test"
 torchrun --standalone --nproc_per_node 2 train.py --model_name "test" --batch_size 1 --num_epoch 1 --plugin zero2
+torchrun --standalone --nproc_per_node 4 train.py --model_name "test" --batch_size 1 --num_epoch 1 --plugin hybrid --pp_size 2 --dp_size 1 --ep_size 2
diff --git a/examples/language/openmoe/train.py b/examples/language/openmoe/train.py
index ad20920eb24b..efb61d1bb69c 100644
--- a/examples/language/openmoe/train.py
+++ b/examples/language/openmoe/train.py
@@ -128,7 +128,11 @@ def main():
 
     # Set up moe
     if args.plugin in ["zero1", "zero2"]:
-        MOE_MANAGER.setup(seed=42, parallel="EP")
+        MOE_MANAGER.setup(
+            seed=42,
+            parallel="EP",
+            use_kernel_optim=True if args.model_name != "test" else False,
+        )
     elif args.plugin == "hybrid":
         assert (args.dp_size * args.ep_size *
                 args.pp_size == coordinator.world_size), "dp_size * ep_size * pp_size must equal to world_size"
@@ -139,6 +143,7 @@ def main():
             fixed_dp_size=args.dp_size,
             fixed_ep_size=args.ep_size,
             fixed_pp_size=args.pp_size,
+            use_kernel_optim=True if args.model_name != "test" else False,
         )
 
     # Manage loggers

From fae3c50363e27aace033535bc781e3b357cca712 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 18 Sep 2023 14:00:29 +0800
Subject: [PATCH 11/23] update ci

---
 examples/language/openmoe/test_ci.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/language/openmoe/test_ci.sh b/examples/language/openmoe/test_ci.sh
index c555c3e5b116..8361b66c50d1 100644
--- a/examples/language/openmoe/test_ci.sh
+++ b/examples/language/openmoe/test_ci.sh
@@ -3,4 +3,3 @@ pip install -r requirements.txt
 
 python infer.py --model "test"
 torchrun --standalone --nproc_per_node 2 train.py --model_name "test" --batch_size 1 --num_epoch 1 --plugin zero2
-torchrun --standalone --nproc_per_node 4 train.py --model_name "test" --batch_size 1 --num_epoch 1 --plugin hybrid --pp_size 2 --dp_size 1 --ep_size 2

From 4abb220f5829b1db0c9948c5de794f324de967bd Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 18 Sep 2023 14:26:43 +0800
Subject: [PATCH 12/23] support ep inside or dp inside

---
 colossalai/moe/manager.py                |  8 ++++++--
 colossalai/tensor/moe_tensor/api.py      |  5 +++--
 colossalai/tensor/moe_tensor/moe_info.py | 20 ++++++++++++++++----
 3 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/colossalai/moe/manager.py b/colossalai/moe/manager.py
index d05ee4c25450..e61fb0bf9582 100644
--- a/colossalai/moe/manager.py
+++ b/colossalai/moe/manager.py
@@ -26,6 +26,7 @@ def __init__(self):
         self.seed = None
         self.mode = None
         self.use_kernel_optim = False
+        self.use_ep_inside = None
 
         self.has_setup = False
         self._parallel_info_dict = dict()
@@ -46,7 +47,8 @@ def setup(self,
               max_ep_size: int = 8,
               fixed_dp_size: int = 0,
               fixed_ep_size: int = 0,
-              fixed_pp_size: int = 0) -> None:
+              fixed_pp_size: int = 0,
+              use_ep_inside: bool = True) -> None:
         """
         Setup MoE distributed context.
 
@@ -61,6 +63,7 @@ def setup(self,
             fixed_dp_size (int, optional): Fixed dp size in fixed mode. Defaults to 0.
             fixed_ep_size (int, optional): Fixed ep size in fixed mode. Defaults to 0.
             fixed_pp_size (int, optional): Fixed pp size in fixed mode. Defaults to 0.
+            use_ep_inside (bool, optional): Use ep inside dp if True, dp inside ep if False. Defaults to True.
         """
         assert not self.is_initialized, "MoE distributed context shouldn't be set up again"
         assert torch.cuda.is_available(), "MoE requires to enable CUDA first"
@@ -68,6 +71,7 @@ def setup(self,
         self.world_size = dist.get_world_size()
         self.seed = seed + dist.get_rank()
         self.parallel = parallel
+        self.use_ep_inside = use_ep_inside
 
         # init by mode
         self.mode = mode
@@ -135,7 +139,7 @@ def get_info(self, num_experts: int, use_tp: bool = False) -> Tuple[int, MoePara
                 num_local_experts = num_experts // ep_size
 
         if not (ep_size in self.parallel_info_dict):
-            self.parallel_info_dict[ep_size] = get_moe_info(ep_size, dp_size, pp_size)
+            self.parallel_info_dict[ep_size] = get_moe_info(ep_size, dp_size, pp_size, ep_inside=self.use_ep_inside)
 
         return num_local_experts, self.parallel_info_dict[ep_size]
 
diff --git a/colossalai/tensor/moe_tensor/api.py b/colossalai/tensor/moe_tensor/api.py
index fc4ed14e0ef7..9120a40b8533 100644
--- a/colossalai/tensor/moe_tensor/api.py
+++ b/colossalai/tensor/moe_tensor/api.py
@@ -31,7 +31,7 @@ def set_moe_tensor_info(tensor: torch.Tensor, moe_info: MoeParallelInfo) -> None
     tensor.__setattr__("moe_info", moe_info)
 
 
-def get_moe_info(ep_size: int, dp_size: int, pp_size: int) -> MoeParallelInfo:
+def get_moe_info(ep_size: int, dp_size: int, pp_size: int, ep_inside: bool) -> MoeParallelInfo:
     """
     Get moe info for the given tensor.
 
@@ -39,11 +39,12 @@ def get_moe_info(ep_size: int, dp_size: int, pp_size: int) -> MoeParallelInfo:
         ep_size (int): The expert parallel size.
         dp_size (int): The data parallel size.
         pp_size (int): The pipeline parallel size.
+        ep_inside (bool): Use ep inside dp if True, dp inside ep if False.
 
     Returns:
         dict: The moe info of the given tensor.
     """
-    return MoeParallelInfo(ep_size, dp_size, pp_size)
+    return MoeParallelInfo(ep_inside, ep_size, dp_size, pp_size)
 
 
 def get_ep_group(tensor: torch.Tensor) -> ProcessGroup:
diff --git a/colossalai/tensor/moe_tensor/moe_info.py b/colossalai/tensor/moe_tensor/moe_info.py
index 2d3c2efbfb31..5097ac1044e7 100644
--- a/colossalai/tensor/moe_tensor/moe_info.py
+++ b/colossalai/tensor/moe_tensor/moe_info.py
@@ -4,11 +4,23 @@
 class MoeParallelInfo:
     """Moe parallelism information, storing parallel sizes and groups."""
 
-    def __init__(self, ep_size: int, dp_size: int, pp_size: int = 1):
-        self.pp_axis, self.dp_axis, self.ep_axis = 0, 1, 2
-        self.pp_size, self.dp_size, self.ep_size = pp_size, dp_size, ep_size
+    def __init__(self, ep_inside: bool, ep_size: int, dp_size: int, pp_size: int = 1):
+        """
+        init MoeParallelInfo with ep_size, dp_size and pp_size
 
-        self.pg = ProcessGroupMesh(self.pp_size, self.dp_size, self.ep_size)
+        Args:
+            ep_size (int): expert parallel size
+            dp_size (int): data parallel (zero) size
+            pp_size (int, optional): pipeline parallel size. Defaults to 1.
+            ep_inside (bool): Use ep inside dp if True, dp inside ep if False.
+        """
+        self.pp_size, self.dp_size, self.ep_size = pp_size, dp_size, ep_size
+        if ep_inside:
+            self.pp_axis, self.dp_axis, self.ep_axis = 0, 1, 2
+            self.pg = ProcessGroupMesh(self.pp_size, self.dp_size, self.ep_size)
+        else:
+            self.pp_axis, self.ep_axis, self.dp_axis = 0, 1, 2
+            self.pg = ProcessGroupMesh(self.pp_size, self.ep_size, self.dp_size)
 
         self.ep_group = self.pg.get_group_along_axis(self.ep_axis)
         self.ep_group_ranks = self.pg.get_ranks_in_group(self.ep_group)

From ac98ee64ffa6850d320f9389b14a3b5b87f1069d Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 18 Sep 2023 15:50:32 +0800
Subject: [PATCH 13/23] update arg for kernel

---
 .../openmoe/model/modeling_openmoe.py         |   3 +-
 .../language/openmoe/model/openmoe_policy.py  | 209 +++++++++++-------
 examples/language/openmoe/train.py            |  12 +-
 3 files changed, 135 insertions(+), 89 deletions(-)

diff --git a/examples/language/openmoe/model/modeling_openmoe.py b/examples/language/openmoe/model/modeling_openmoe.py
index d8289b791dd5..90d3e0022ce4 100644
--- a/examples/language/openmoe/model/modeling_openmoe.py
+++ b/examples/language/openmoe/model/modeling_openmoe.py
@@ -175,6 +175,7 @@ def __init__(self, config):
         self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
         self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
         self.act_fn = SwiGLU
+        self.use_kernel = True if MOE_MANAGER.use_kernel_optim else False
 
     def forward(self, x):
         if self.pretraining_tp > 1:
@@ -190,7 +191,7 @@ def forward(self, x):
             down_proj = [F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.pretraining_tp)]
             down_proj = sum(down_proj)
         else:
-            if HAS_TRITON:
+            if HAS_TRITON and self.use_kernel:
                 down_proj = self.down_proj(LlamaActCombine.apply(self.gate_proj(x), self.up_proj(x)))
             else:
                 down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
diff --git a/examples/language/openmoe/model/openmoe_policy.py b/examples/language/openmoe/model/openmoe_policy.py
index 21e25bcb73a0..cc82683cd319 100644
--- a/examples/language/openmoe/model/openmoe_policy.py
+++ b/examples/language/openmoe/model/openmoe_policy.py
@@ -7,7 +7,7 @@
 import torch.nn.functional as F
 from torch import Tensor
 from torch.nn import Module
-from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.utils import logging
 
 from colossalai.moe.manager import MOE_MANAGER
@@ -17,7 +17,7 @@
 
 from .modeling_openmoe import OpenMoeDecoderLayer, OpenMoeForCausalLM, OpenMoeModel
 
-__all__ = ['OpenMoePolicy', 'OpenMoeForCausalLMPolicy']
+__all__ = ["OpenMoePolicy", "OpenMoeForCausalLMPolicy"]
 
 
 class OpenMoePolicy(Policy):
@@ -50,29 +50,34 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
 
         # optimization configuration
         if self.shard_config.enable_fused_normalization:
-            self.append_or_create_submodule_replacement(description=[
-                SubModuleReplacementDescription(
-                    suffix="input_layernorm",
-                    target_module=FusedRMSNorm,
-                ),
-                SubModuleReplacementDescription(
-                    suffix="post_attention_layernorm",
+            self.append_or_create_submodule_replacement(
+                description=[
+                    SubModuleReplacementDescription(
+                        suffix="input_layernorm",
+                        target_module=FusedRMSNorm,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="post_attention_layernorm",
+                        target_module=FusedRMSNorm,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="pre_extra_mlp_layernorm",
+                        target_module=FusedRMSNorm,
+                        ignore_if_not_exist=True,
+                    ),
+                ],
+                policy=policy,
+                target_key=OpenMoeDecoderLayer,
+            )
+
+            self.append_or_create_submodule_replacement(
+                description=SubModuleReplacementDescription(
+                    suffix="norm",
                     target_module=FusedRMSNorm,
                 ),
-                SubModuleReplacementDescription(
-                    suffix="pre_extra_mlp_layernorm",
-                    target_module=FusedRMSNorm,
-                )
-            ],
-                                                        policy=policy,
-                                                        target_key=OpenMoeDecoderLayer)
-
-            self.append_or_create_submodule_replacement(description=SubModuleReplacementDescription(
-                suffix="norm",
-                target_module=FusedRMSNorm,
-            ),
-                                                        policy=policy,
-                                                        target_key=OpenMoeModel)
+                policy=policy,
+                target_key=OpenMoeModel,
+            )
 
         if self.shard_config.enable_flash_attention:
             raise NotImplementedError("Flash attention has already been replaced in openmoe.")
@@ -84,17 +89,17 @@ def postprocess(self):
 
     def set_pipeline_forward(self, model_cls: nn.Module, new_forward: Callable, policy: Dict) -> None:
         """If under pipeline parallel setting, replacing the original forward method of huggingface
-           to customized forward method, and add this changing to policy."""
+        to a customized forward method, and add this change to the policy."""
         if self.pipeline_stage_manager:
             stage_manager = self.pipeline_stage_manager
-            if self.model.__class__.__name__ == "LlamaModel":
+            if self.model.__class__.__name__ == "OpenMoeModel":
                 module = self.model
             else:
                 module = self.model.model
 
             layers_per_stage = Policy.distribute_layers(len(module.layers), stage_manager.num_stages)
             stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)
-            method_replacement = {'forward': partial(new_forward, stage_manager=stage_manager, stage_index=stage_index)}
+            method_replacement = {"forward": partial(new_forward, stage_manager=stage_manager, stage_index=stage_index)}
             self.append_or_create_method_replacement(description=method_replacement,
                                                      policy=policy,
                                                      target_key=model_cls)
@@ -105,7 +110,7 @@ def get_held_layers(self) -> List[Module]:
         """Get pipeline layers for current stage."""
         assert self.pipeline_stage_manager is not None
 
-        if self.model.__class__.__name__ == 'LlamaModel':
+        if self.model.__class__.__name__ == "OpenMoeModel":
             module = self.model
         else:
             module = self.model.model
@@ -132,9 +137,11 @@ def module_policy(self):
         policy = super().module_policy()
         if self.pipeline_stage_manager:
             # set None as default
-            self.set_pipeline_forward(model_cls=OpenMoeModel,
-                                      new_forward=OpenMoePipelineForwards.openmoe_model_forward,
-                                      policy=policy)
+            self.set_pipeline_forward(
+                model_cls=OpenMoeModel,
+                new_forward=OpenMoePipelineForwards.openmoe_model_forward,
+                policy=policy,
+            )
         return policy
 
     def get_held_layers(self) -> List[Module]:
@@ -150,7 +157,6 @@ def get_shared_params(self) -> List[Dict[int, Tensor]]:
 class OpenMoeForCausalLMPolicy(OpenMoePolicy):
 
     def module_policy(self):
-
         policy = super().module_policy()
 
         if self.shard_config.enable_tensor_parallelism:
@@ -159,16 +165,21 @@ def module_policy(self):
                 OpenMoeForCausalLM:
                     ModulePolicyDescription(sub_module_replacement=[
                         SubModuleReplacementDescription(
-                            suffix="lm_head", target_module=Linear1D_Col, kwargs=dict(gather_output=True))
+                            suffix="lm_head",
+                            target_module=Linear1D_Col,
+                            kwargs=dict(gather_output=True),
+                        )
                     ])
             }
             policy.update(new_item)
 
         if self.pipeline_stage_manager:
             # set None as default
-            self.set_pipeline_forward(model_cls=OpenMoeForCausalLM,
-                                      new_forward=OpenMoePipelineForwards.llama_for_causal_lm_forward,
-                                      policy=policy)
+            self.set_pipeline_forward(
+                model_cls=OpenMoeForCausalLM,
+                new_forward=OpenMoePipelineForwards.llama_for_causal_lm_forward,
+                policy=policy,
+            )
 
         return policy
 
@@ -183,21 +194,21 @@ def get_held_layers(self) -> List[Module]:
     def get_shared_params(self) -> List[Dict[int, Tensor]]:
         llama_model = self.model.model
         if self.pipeline_stage_manager and self.pipeline_stage_manager.num_stages > 1:
-            if id(llama_model.embed_tokens.weight) == id(
-                    self.model.lm_head.weight) and self.pipeline_stage_manager.num_stages > 1:
+            if (id(llama_model.embed_tokens.weight) == id(self.model.lm_head.weight)
+                    and self.pipeline_stage_manager.num_stages > 1):
                 # tie weights
                 return [{
                     0: llama_model.embed_tokens.weight,
-                    self.pipeline_stage_manager.num_stages - 1: self.model.lm_head.weight
+                    self.pipeline_stage_manager.num_stages - 1: self.model.lm_head.weight,
                 }]
         return []
 
 
 class OpenMoePipelineForwards:
-    '''
+    """
     This class serves as a micro library for forward function substitution of Llama models
     under pipeline setting.
-    '''
+    """
 
     @staticmethod
     def openmoe_model_forward(
@@ -222,12 +233,12 @@ def openmoe_model_forward(
 
         logger = logging.get_logger(__name__)
 
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_attentions = (output_attentions if output_attentions is not None else self.config.output_attentions)
         output_hidden_states = (output_hidden_states
                                 if output_hidden_states is not None else self.config.output_hidden_states)
         use_cache = use_cache if use_cache is not None else self.config.use_cache
 
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = (return_dict if return_dict is not None else self.config.use_return_dict)
 
         # retrieve input_ids and inputs_embeds
         if stage_manager.is_first_stage():
@@ -253,13 +264,13 @@ def openmoe_model_forward(
 
         # TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
         if output_attentions:
-            logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
+            logger.warning_once("output_attentions=True is not supported for pipeline models at the moment.")
             output_attentions = False
         if output_hidden_states:
-            logger.warning_once('output_hidden_states=True is not supported for pipeline models at the moment.')
+            logger.warning_once("output_hidden_states=True is not supported for pipeline models at the moment.")
             output_hidden_states = False
         if use_cache:
-            logger.warning_once('use_cache=True is not supported for pipeline models at the moment.')
+            logger.warning_once("use_cache=True is not supported for pipeline models at the moment.")
             use_cache = False
 
         if past_key_values is not None:
@@ -267,10 +278,12 @@ def openmoe_model_forward(
             seq_length_with_past = seq_length_with_past + past_key_values_length
 
         if position_ids is None:
-            position_ids = torch.arange(past_key_values_length,
-                                        seq_length + past_key_values_length,
-                                        dtype=torch.long,
-                                        device=device)
+            position_ids = torch.arange(
+                past_key_values_length,
+                seq_length + past_key_values_length,
+                dtype=torch.long,
+                device=device,
+            )
             position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
         else:
             position_ids = position_ids.view(-1, seq_length).long()
@@ -278,11 +291,17 @@ def openmoe_model_forward(
         # embed positions, for the first stage, hidden_states is the input embeddings,
         # for the other stages, hidden_states is the output of the previous stage
         if attention_mask is None:
-            attention_mask = torch.ones((batch_size, seq_length_with_past),
-                                        dtype=torch.bool,
-                                        device=hidden_states.device)
-        attention_mask = self._prepare_decoder_attention_mask(attention_mask, (batch_size, seq_length), hidden_states,
-                                                              past_key_values_length)
+            attention_mask = torch.ones(
+                (batch_size, seq_length_with_past),
+                dtype=torch.bool,
+                device=hidden_states.device,
+            )
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask,
+            (batch_size, seq_length),
+            hidden_states,
+            past_key_values_length,
+        )
 
         if self.gradient_checkpointing and self.training:
             if use_cache:
@@ -300,7 +319,7 @@ def openmoe_model_forward(
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
 
-            past_key_value = past_key_values[idx] if past_key_values is not None else None
+            past_key_value = (past_key_values[idx] if past_key_values is not None else None)
 
             if self.gradient_checkpointing and self.training:
 
@@ -351,9 +370,20 @@ def custom_forward(*inputs):
             router_z_loss = past_router_z_loss + router_z_loss
 
         if stage_manager.is_last_stage():
-            return tuple([hidden_states, next_cache, all_hidden_states, all_self_attns, router_aux_loss, router_z_loss])
+            return tuple([
+                hidden_states,
+                next_cache,
+                all_hidden_states,
+                all_self_attns,
+                router_aux_loss,
+                router_z_loss,
+            ])
         # always return dict for imediate stage
-        return {'hidden_states': hidden_states, 'router_aux_loss': router_aux_loss, 'router_z_loss': router_z_loss}
+        return {
+            "hidden_states": hidden_states,
+            "router_aux_loss": router_aux_loss,
+            "router_z_loss": router_z_loss,
+        }
 
     @staticmethod
     def llama_for_causal_lm_forward(
@@ -376,42 +406,42 @@ def llama_for_causal_lm_forward(
         past_router_z_loss: Optional[torch.FloatTensor] = None,
     ):
         r"""
-            Args:
-                labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-                    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-                    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
 
-            Returns:
+        Returns:
 
-            Example:
+        Example:
 
-            ```python
-            >>> from transformers import AutoTokenizer, LlamaForCausalLM
+        ```python
+        >>> from transformers import AutoTokenizer, LlamaForCausalLM
 
-            >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
-            >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+        >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
 
-            >>> prompt = "Hey, are you consciours? Can you talk to me?"
-            >>> inputs = tokenizer(prompt, return_tensors="pt")
+        >>> prompt = "Hey, are you consciours? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
 
-            >>> # Generate
-            >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-            >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-            "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
-            ```"""
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
+        ```"""
         logger = logging.get_logger(__name__)
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_attentions = (output_attentions if output_attentions is not None else self.config.output_attentions)
         output_hidden_states = (output_hidden_states
                                 if output_hidden_states is not None else self.config.output_hidden_states)
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = (return_dict if return_dict is not None else self.config.use_return_dict)
 
         # TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
         if output_attentions:
-            logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
+            logger.warning_once("output_attentions=True is not supported for pipeline models at the moment.")
             output_attentions = False
         if output_hidden_states:
-            logger.warning_once('output_hidden_states=True is not supported for pipeline models at the moment.')
+            logger.warning_once("output_hidden_states=True is not supported for pipeline models at the moment.")
             output_hidden_states = False
 
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
@@ -434,7 +464,14 @@ def llama_for_causal_lm_forward(
         )
 
         if stage_manager.is_last_stage():
-            hidden_states, past_key_values, all_hidden_states, attentions, router_aux_loss, router_z_loss = outputs
+            (
+                hidden_states,
+                past_key_values,
+                all_hidden_states,
+                attentions,
+                router_aux_loss,
+                router_z_loss,
+            ) = outputs
 
             if self.pretraining_tp > 1:
                 lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.pretraining_tp, dim=0)
@@ -498,11 +535,11 @@ def custom_forward(*inputs):
                 attentions=attentions,
             )
         else:
-            hidden_states = outputs['hidden_states']
-            router_aux_loss = outputs['router_aux_loss']
-            router_z_loss = outputs['router_z_loss']
+            hidden_states = outputs["hidden_states"]
+            router_aux_loss = outputs["router_aux_loss"]
+            router_z_loss = outputs["router_z_loss"]
             return {
-                'hidden_states': hidden_states,
-                'past_router_aux_loss': router_aux_loss,
-                'past_router_z_loss': router_z_loss
+                "hidden_states": hidden_states,
+                "past_router_aux_loss": router_aux_loss,
+                "past_router_z_loss": router_z_loss,
             }
diff --git a/examples/language/openmoe/train.py b/examples/language/openmoe/train.py
index efb61d1bb69c..2099bbde91f5 100644
--- a/examples/language/openmoe/train.py
+++ b/examples/language/openmoe/train.py
@@ -96,6 +96,12 @@ def parse_args():
     parser.add_argument("--ep_size", type=int, default=2, help="ep size")
     parser.add_argument("--zero_stage", type=int, default=1, help="zero stage in hybrid plugin")
     parser.add_argument("--microbatch_size", type=int, default=1, help="microbatch size")
+    # kernel
+    parser.add_argument(
+        "--use_kernel",
+        action="store_true",
+        help="Use kernel optim. Need to install flash attention, apex, triton to enable all kernel optimizations.",
+    )
     # loss
     parser.add_argument(
         "--router_aux_loss_factor",
@@ -131,7 +137,7 @@ def main():
         MOE_MANAGER.setup(
             seed=42,
             parallel="EP",
-            use_kernel_optim=True if args.model_name != "test" else False,
+            use_kernel_optim=False if args.model_name == "test" else args.use_kernel,
         )
     elif args.plugin == "hybrid":
         assert (args.dp_size * args.ep_size *
@@ -143,7 +149,7 @@ def main():
             fixed_dp_size=args.dp_size,
             fixed_ep_size=args.ep_size,
             fixed_pp_size=args.pp_size,
-            use_kernel_optim=True if args.model_name != "test" else False,
+            use_kernel_optim=False if args.model_name == "test" else args.use_kernel,
         )
 
     # Manage loggers
@@ -189,6 +195,8 @@ def main():
             zero_stage=args.zero_stage,
             microbatch_size=args.microbatch_size,
             custom_policy=OpenMoeForCausalLMPolicy(),
+            enable_fused_normalization=args.use_kernel,
+            enable_jit_fused=args.use_kernel,
         )
     else:
         raise ValueError(f"Invalid plugin {args.plugin}")

From a97a201e37c73c8bd8c3c7841971a7f023cfeb8d Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 18 Sep 2023 17:40:29 +0800
Subject: [PATCH 14/23] disable ci

---
 examples/language/openmoe/test_ci.sh | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/examples/language/openmoe/test_ci.sh b/examples/language/openmoe/test_ci.sh
index 8361b66c50d1..e69de29bb2d1 100644
--- a/examples/language/openmoe/test_ci.sh
+++ b/examples/language/openmoe/test_ci.sh
@@ -1,5 +0,0 @@
-set -xe
-pip install -r requirements.txt
-
-python infer.py --model "test"
-torchrun --standalone --nproc_per_node 2 train.py --model_name "test" --batch_size 1 --num_epoch 1 --plugin zero2

From 2817bc237f7360cec8efb43a5debf00d59e20d6b Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Mon, 18 Sep 2023 17:40:47 +0800
Subject: [PATCH 15/23] update train script

---
 examples/language/openmoe/train.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/examples/language/openmoe/train.sh b/examples/language/openmoe/train.sh
index a2fe425c5805..6712aa10a88b 100644
--- a/examples/language/openmoe/train.sh
+++ b/examples/language/openmoe/train.sh
@@ -1,3 +1,9 @@
 torchrun --standalone --nproc_per_node 4 train.py \
     --model_name "base" \
+    --plugin "hybrid" \
+    --pp_size 2 \
+    --dp_size 1 \
+    --ep_size 2 \
+    --use_kernel \
+    --zero_stage 1 \
     --batch_size 4

From 14583ead2279b1fdf2be15b697b20e114315d3ce Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 21 Sep 2023 11:25:47 +0800
Subject: [PATCH 16/23] fsdp

---
 examples/language/openmoe/train1.py | 252 ++++++++++++++++++++++++++++
 1 file changed, 252 insertions(+)
 create mode 100644 examples/language/openmoe/train1.py

diff --git a/examples/language/openmoe/train1.py b/examples/language/openmoe/train1.py
new file mode 100644
index 000000000000..f1b1327b5a54
--- /dev/null
+++ b/examples/language/openmoe/train1.py
@@ -0,0 +1,252 @@
+import argparse
+import os
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from model.modeling_openmoe import OpenMoeForCausalLM
+from torchvision import datasets, transforms
+
+os.environ["TRANSFORMERS_OFFLINE"] = "1"
+# import datasets
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+import transformers
+from huggingface_hub import snapshot_download
+from model.modeling_openmoe import LlamaConfig, OpenMoeForCausalLM
+from model.openmoe_policy import OpenMoeForCausalLMPolicy
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp.fully_sharded_data_parallel import BackwardPrefetch, CPUOffload
+from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision
+from torch.distributed.fsdp.wrap import enable_wrap, wrap
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.optim.lr_scheduler import StepLR
+from torch.utils.data import Dataset
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm
+from transformers import Adafactor, T5Tokenizer
+from transformers.models.llama import LlamaConfig
+
+import colossalai
+from colossalai import get_default_parser
+from colossalai.booster import Booster
+from colossalai.booster.plugin import LowLevelZeroPlugin, TorchFSDPPlugin
+from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.moe import MoeCheckpintIO
+from colossalai.moe.manager import MOE_MANAGER
+from colossalai.moe.utils import skip_init
+from colossalai.utils import get_current_device
+
+
+def setup(rank, world_size):
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "14523"
+
+    # initialize the process group
+    dist.init_process_group("nccl", rank=rank, world_size=world_size)
+
+
+def cleanup():
+    dist.destroy_process_group()
+
+
+class Net(nn.Module):
+
+    def __init__(self):
+        super(Net, self).__init__()
+        self.conv1 = nn.Conv2d(1, 32, 3, 1)
+        self.conv2 = nn.Conv2d(32, 64, 3, 1)
+        self.dropout1 = nn.Dropout(0.25)
+        self.dropout2 = nn.Dropout(0.5)
+        self.fc1 = nn.Linear(9216, 128)
+        self.fc2 = nn.Linear(128, 10)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = F.relu(x)
+        x = self.conv2(x)
+        x = F.relu(x)
+        x = F.max_pool2d(x, 2)
+        x = self.dropout1(x)
+        x = torch.flatten(x, 1)
+        x = self.fc1(x)
+        x = F.relu(x)
+        x = self.dropout2(x)
+        x = self.fc2(x)
+        output = F.log_softmax(x, dim=1)
+        return output
+
+
+def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None):
+    MOE_MANAGER.setup(seed=42, parallel=None, use_kernel_optim=False)
+    model.train()
+    ddp_loss = torch.zeros(2).to(rank)
+    if sampler:
+        sampler.set_epoch(epoch)
+    for batch_idx, (data, target) in enumerate(train_loader):
+        input_ids = torch.randint(0, 40, (4, 16), device=get_current_device()).to(rank)
+        labels = input_ids.to(rank)
+        optimizer.zero_grad()
+        output = model(input_ids=input_ids, labels=labels)
+        loss = output["loss"]
+        loss.backward()
+        optimizer.step()
+        ddp_loss[0] += loss.item()
+        ddp_loss[1] += len(data)
+        print("!1111")
+
+    dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM)
+    if rank == 0:
+        print("Train Epoch: {} \tLoss: {:.6f}".format(epoch, ddp_loss[0] / ddp_loss[1]))
+
+
+def test(model, rank, world_size, test_loader):
+    model.eval()
+    correct = 0
+    ddp_loss = torch.zeros(3).to(rank)
+    with torch.no_grad():
+        for data, target in test_loader:
+            data, target = data.to(rank), target.to(rank)
+            output = model(data)
+            ddp_loss[0] += F.nll_loss(output, target, reduction="sum").item()    # sum up batch loss
+            pred = output.argmax(dim=1, keepdim=True)    # get the index of the max log-probability
+            ddp_loss[1] += pred.eq(target.view_as(pred)).sum().item()
+            ddp_loss[2] += len(data)
+
+    dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM)
+
+    if rank == 0:
+        test_loss = ddp_loss[0] / ddp_loss[2]
+        print("Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n".format(
+            test_loss,
+            int(ddp_loss[1]),
+            int(ddp_loss[2]),
+            100.0 * ddp_loss[1] / ddp_loss[2],
+        ))
+
+
+def fsdp_main(rank, world_size, args):
+    # 每个进程都要setup一下
+    setup(rank, world_size)
+
+    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
+
+    dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform)
+    dataset2 = datasets.MNIST("../data", train=False, transform=transform)
+
+    sampler1 = DistributedSampler(dataset1, rank=rank, num_replicas=world_size, shuffle=True)
+    sampler2 = DistributedSampler(dataset2, rank=rank, num_replicas=world_size)
+
+    train_kwargs = {"batch_size": args.batch_size, "sampler": sampler1}
+    test_kwargs = {"batch_size": args.test_batch_size, "sampler": sampler2}
+    cuda_kwargs = {"num_workers": 2, "pin_memory": True, "shuffle": False}
+    train_kwargs.update(cuda_kwargs)
+    test_kwargs.update(cuda_kwargs)
+
+    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
+    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
+    torch.cuda.set_device(rank)
+
+    init_start_event = torch.cuda.Event(enable_timing=True)
+    init_end_event = torch.cuda.Event(enable_timing=True)
+
+    # model = Net().to(rank)
+    config = LlamaConfig.from_pretrained("hpcaitech/openmoe-base")
+    setattr(config, "router_aux_loss_factor", 0.1)
+    setattr(config, "router_z_loss_factor", 0.1)
+    setattr(config, "label_smoothing", 0.1)
+    setattr(config, "z_loss_factor", 0.1)
+    model = OpenMoeForCausalLM(config).to(rank)
+    # 使用FSDP将model warp起来
+    model = FSDP(
+        model,
+        mixed_precision=MixedPrecision(
+            param_dtype=torch.float16,
+            reduce_dtype=torch.float16,
+            buffer_dtype=torch.float16,
+        ),
+    )
+
+    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
+
+    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
+    init_start_event.record()
+    for epoch in range(1, args.epochs + 1):
+        train(
+            args,
+            model,
+            rank,
+            world_size,
+            train_loader,
+            optimizer,
+            epoch,
+            sampler=sampler1,
+        )
+        scheduler.step()
+
+    init_end_event.record()
+
+    if rank == 0:
+        print(f"CUDA event elapsed time: {init_start_event.elapsed_time(init_end_event) / 1000}sec")
+        print(f"{model}")
+
+    cleanup()
+
+
+if __name__ == "__main__":
+    # Training settings
+    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=64,
+        metavar="N",
+        help="input batch size for training (default: 64)",
+    )
+    parser.add_argument(
+        "--test-batch-size",
+        type=int,
+        default=1000,
+        metavar="N",
+        help="input batch size for testing (default: 1000)",
+    )
+    parser.add_argument(
+        "--epochs",
+        type=int,
+        default=10,
+        metavar="N",
+        help="number of epochs to train (default: 14)",
+    )
+    parser.add_argument(
+        "--lr",
+        type=float,
+        default=1.0,
+        metavar="LR",
+        help="learning rate (default: 1.0)",
+    )
+    parser.add_argument(
+        "--gamma",
+        type=float,
+        default=0.7,
+        metavar="M",
+        help="Learning rate step gamma (default: 0.7)",
+    )
+    parser.add_argument("--no-cuda", action="store_true", default=False, help="disables CUDA training")
+    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
+    parser.add_argument(
+        "--save-model",
+        action="store_true",
+        default=False,
+        help="For Saving the current Model",
+    )
+    args = parser.parse_args()
+
+    torch.manual_seed(args.seed)
+
+    WORLD_SIZE = torch.cuda.device_count()
+    mp.spawn(fsdp_main, args=(WORLD_SIZE, args), nprocs=WORLD_SIZE, join=True)

From 4e384d711c922996ed3d8e0abfb227c32f33c7c8 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 21 Sep 2023 13:53:55 +0800
Subject: [PATCH 17/23] update train

---
 .../openmoe/model/modeling_openmoe.py         | 11 ++-
 examples/language/openmoe/train.py            | 82 +++++++++----------
 2 files changed, 44 insertions(+), 49 deletions(-)

diff --git a/examples/language/openmoe/model/modeling_openmoe.py b/examples/language/openmoe/model/modeling_openmoe.py
index 90d3e0022ce4..5e810f22b87d 100644
--- a/examples/language/openmoe/model/modeling_openmoe.py
+++ b/examples/language/openmoe/model/modeling_openmoe.py
@@ -27,8 +27,7 @@
 from torch import nn
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
-from transformers.models.llama import LlamaConfig
-from transformers.models.t5.modeling_t5 import T5LayerNorm
+from transformers.models.llama.modeling_llama import LlamaConfig, LlamaRMSNorm
 from transformers.utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
@@ -346,8 +345,8 @@ def __init__(self, config: LlamaConfig, moe: bool):
         self.hidden_size = config.hidden_size
         self.moe = moe
         self.self_attn = OpenMoeAttention(config=config)
-        self.input_layernorm = T5LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = T5LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         if self.moe:
             self.mlp = SparseMLP(
                 num_experts=config.num_experts,
@@ -362,7 +361,7 @@ def __init__(self, config: LlamaConfig, moe: bool):
                 intermediate_size=config.intermediate_size,
                 activation=config.hidden_act,
                 gated=config.gated)
-            self.pre_extra_mlp_layernorm = T5LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
+            self.pre_extra_mlp_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
             self.extra_mlp = OpenMoeMLP(config)
         else:
             self.mlp = OpenMoeMLP(config)
@@ -558,7 +557,7 @@ def __init__(self, config: LlamaConfig):
             OpenMoeDecoderLayer(config, moe=True if (i + 1) % config.moe_layer_interval == 0 else False)
             for i in range(config.num_hidden_layers)
         ])
-        self.norm = T5LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
diff --git a/examples/language/openmoe/train.py b/examples/language/openmoe/train.py
index 2099bbde91f5..5e881bb240f4 100644
--- a/examples/language/openmoe/train.py
+++ b/examples/language/openmoe/train.py
@@ -67,6 +67,7 @@ def parse_args():
         "--model_name",
         type=str,
         default="base",
+        choices=["base", "8b"],
         help="Path to pretrained model or model identifier from huggingface.co/models.",
     )
     parser.add_argument(
@@ -88,7 +89,7 @@ def parse_args():
         type=str,
         default="hybrid",
         help="parallel plugin",
-        choices=["zero1", "zero2", "hybrid"],
+        choices=["zero1", "zero2", "hybrid", "fsdp"],
     )
     # hybrid plugin
     parser.add_argument("--pp_size", type=int, default=2, help="pp size")
@@ -132,26 +133,6 @@ def main():
     colossalai.launch_from_torch(config={}, seed=args.seed)
     coordinator = DistCoordinator()
 
-    # Set up moe
-    if args.plugin in ["zero1", "zero2"]:
-        MOE_MANAGER.setup(
-            seed=42,
-            parallel="EP",
-            use_kernel_optim=False if args.model_name == "test" else args.use_kernel,
-        )
-    elif args.plugin == "hybrid":
-        assert (args.dp_size * args.ep_size *
-                args.pp_size == coordinator.world_size), "dp_size * ep_size * pp_size must equal to world_size"
-        MOE_MANAGER.setup(
-            seed=42,
-            parallel="EP",
-            mode="fixed",
-            fixed_dp_size=args.dp_size,
-            fixed_ep_size=args.ep_size,
-            fixed_pp_size=args.pp_size,
-            use_kernel_optim=False if args.model_name == "test" else args.use_kernel,
-        )
-
     # Manage loggers
     disable_existing_loggers()
     logger = get_dist_logger()
@@ -162,32 +143,22 @@ def main():
         datasets.utils.logging.set_verbosity_error()
         transformers.utils.logging.set_verbosity_error()
 
-    # Build OpenMoe model
-    repo_name = "hpcaitech/openmoe-" + args.model_name
-    if args.model_name == "test":
-        config = LlamaConfig.from_pretrained("hpcaitech/openmoe-base")
-        config.vocab_size = 32000
-    else:
-        config = LlamaConfig.from_pretrained(repo_name)
-    setattr(config, "router_aux_loss_factor", args.router_aux_loss_factor)
-    setattr(config, "router_z_loss_factor", args.router_z_loss_factor)
-    setattr(config, "label_smoothing", args.label_smoothing)
-    setattr(config, "z_loss_factor", args.z_loss_factor)
-    with skip_init():
-        model = OpenMoeForCausalLM(config)
-    if args.model_name != "test":
-        load_ckpt(repo_name, model)
-    logger.info(f"Finish init model with config:\n{config}", ranks=[0])
-
-    # Enable gradient checkpointing
-    model.gradient_checkpointing_enable()
-
     # Set plugin
     booster_kwargs = {}
     if args.plugin == "zero1":
         plugin = LowLevelZeroPlugin(initial_scale=2**5, stage=1)
+        MOE_MANAGER.setup(
+            seed=42,
+            parallel="EP",
+            use_kernel_optim=args.use_kernel,
+        )
     elif args.plugin == "zero2":
         plugin = LowLevelZeroPlugin(initial_scale=2**5, stage=2)
+        MOE_MANAGER.setup(
+            seed=42,
+            parallel="EP",
+            use_kernel_optim=args.use_kernel,
+        )
     elif args.plugin == "hybrid":
         plugin = MoeHybridParallelPlugin(
             tp_size=1,
@@ -198,13 +169,37 @@ def main():
             enable_fused_normalization=args.use_kernel,
             enable_jit_fused=args.use_kernel,
         )
+        MOE_MANAGER.setup(
+            seed=42,
+            parallel="EP",
+            mode="fixed",
+            fixed_dp_size=args.dp_size,
+            fixed_ep_size=args.ep_size,
+            fixed_pp_size=args.pp_size,
+            use_kernel_optim=args.use_kernel,
+        )
     else:
         raise ValueError(f"Invalid plugin {args.plugin}")
     logger.info(f"Set plugin as {plugin}", ranks=[0])
 
+    # Build OpenMoe model
+    repo_name = "hpcaitech/openmoe-" + args.model_name
+    config = LlamaConfig.from_pretrained(repo_name)
+    setattr(config, "router_aux_loss_factor", args.router_aux_loss_factor)
+    setattr(config, "router_z_loss_factor", args.router_z_loss_factor)
+    setattr(config, "label_smoothing", args.label_smoothing)
+    setattr(config, "z_loss_factor", args.z_loss_factor)
+    with skip_init():
+        model = OpenMoeForCausalLM(config)
+    load_ckpt(repo_name, model)
+    logger.info(f"Finish init model with config:\n{config}", ranks=[0])
+
+    # Enable gradient checkpointing
+    model.gradient_checkpointing_enable()
+
     # Prepare tokenizer and dataloader
     tokenizer = T5Tokenizer.from_pretrained("google/umt5-small")
-    dataset = RandomDataset(num_samples=1000 if args.model_name != "test" else 50)
+    dataset = RandomDataset(num_samples=1000)
     dataloader = plugin.prepare_dataloader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True)
 
     # Set optimizer
@@ -228,9 +223,9 @@ def main():
                 desc=f"Epoch [{epoch + 1}/{args.num_epoch}]",
                 disable=not coordinator.is_master(),
         ) as pbar:
-            # Forward pass
             for _ in pbar:
                 if use_pipeline:
+                    # Forward pass
                     outputs = booster.execute_pipeline(
                         train_dataloader_iter,
                         model,
@@ -244,6 +239,7 @@ def main():
                         loss = outputs["loss"]
                         pbar.set_postfix({"loss": loss.item()})
                 else:
+                    # Forward pass
                     data = next(train_dataloader_iter)
                     data = move_to_cuda(data, torch.cuda.current_device())
                     outputs = model(**data)

From dbc4f56889d05e09cceee162028f3800f4783a45 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 21 Sep 2023 14:00:38 +0800
Subject: [PATCH 18/23] update train

---
 .../openmoe/benchmark/benchmark_fsdp.py       | 165 ++++++++++++
 examples/language/openmoe/train1.py           | 252 ------------------
 2 files changed, 165 insertions(+), 252 deletions(-)
 create mode 100644 examples/language/openmoe/benchmark/benchmark_fsdp.py
 delete mode 100644 examples/language/openmoe/train1.py

diff --git a/examples/language/openmoe/benchmark/benchmark_fsdp.py b/examples/language/openmoe/benchmark/benchmark_fsdp.py
new file mode 100644
index 000000000000..0e9db4797d22
--- /dev/null
+++ b/examples/language/openmoe/benchmark/benchmark_fsdp.py
@@ -0,0 +1,165 @@
+import argparse
+import os
+
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+import torch.optim as optim
+from model.modeling_openmoe import LlamaConfig, OpenMoeForCausalLM
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision
+from torch.utils.data import Dataset
+from torch.utils.data.distributed import DistributedSampler
+from transformers.models.llama import LlamaConfig
+
+from colossalai.moe.manager import MOE_MANAGER
+
+
+def setup(rank, world_size):
+    """Set MASTER_ADDR/MASTER_PORT and initialize the NCCL process group."""
+    # MASTER_ADDR/MASTER_PORT are hard-coded to localhost:14523.
+    os.environ.update({"MASTER_ADDR": "localhost", "MASTER_PORT": "14523"})
+    # Join the default process group with this process's rank.
+    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
+
+
+class RandomDataset(Dataset):
+    """Synthetic dataset of random token ids with an all-ones attention mask."""
+
+    def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000):
+        self.num_samples, self.max_length = num_samples, max_length
+        # Token ids are random integers in [0, vocab_size), one row per sample.
+        self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length))
+        self.attention_mask = torch.ones_like(self.input_ids)
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, idx):
+        """Return the idx-th sample; labels are read from input_ids as well."""
+        sample = {"input_ids": self.input_ids[idx], "attention_mask": self.attention_mask[idx]}
+        sample["labels"] = self.input_ids[idx]
+        return sample
+
+
+def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None):
+    """Run one epoch of forward/backward/optimizer steps over train_loader."""
+    model.train()
+    if sampler is not None:
+        # Without this, a shuffling DistributedSampler repeats one order every epoch.
+        sampler.set_epoch(epoch)
+    for data in train_loader:
+        input_ids = data["input_ids"].cuda()
+        attention_mask = data["attention_mask"].cuda()
+        labels = data["labels"].cuda()
+
+        optimizer.zero_grad()
+        output = model(
+            input_ids=input_ids,
+            labels=labels,
+            attention_mask=attention_mask,
+            chunk_head=False,
+        )
+        loss = output["loss"]
+        loss.backward()
+        optimizer.step()
+
+
+def fsdp_main(rank, world_size, args):
+    """Per-process entry point: join the process group, build data and model, then train."""
+    setup(rank, world_size)  # every spawned process has to run setup() itself
+    MOE_MANAGER.setup(seed=42, parallel=None, use_kernel_optim=False)
+
+    dataset1 = RandomDataset()
+    sampler1 = DistributedSampler(dataset1, rank=rank, num_replicas=world_size, shuffle=True)
+
+    train_kwargs = {"batch_size": args.batch_size, "sampler": sampler1}
+    cuda_kwargs = {"num_workers": 2, "shuffle": False}
+    train_kwargs.update(cuda_kwargs)
+
+    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
+    torch.cuda.set_device(rank)
+
+    config = LlamaConfig.from_pretrained("hpcaitech/openmoe-base")
+    setattr(config, "router_aux_loss_factor", 0.1)
+    setattr(config, "router_z_loss_factor", 0.1)
+    setattr(config, "label_smoothing", 0.1)
+    setattr(config, "z_loss_factor", 0.1)
+    model = OpenMoeForCausalLM(config).to(rank)
+    # Wrap the model with FSDP, using fp16 for the param, reduce and buffer dtypes.
+    model = FSDP(
+        model,
+        mixed_precision=MixedPrecision(
+            param_dtype=torch.float16,
+            reduce_dtype=torch.float16,
+            buffer_dtype=torch.float16,
+        ),
+    )
+
+    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
+
+    for epoch in range(1, args.epochs + 1):
+        train(
+            args,
+            model,
+            rank,
+            world_size,
+            train_loader,
+            optimizer,
+            epoch,
+            sampler=sampler1,
+        )
+
+
+if __name__ == "__main__":
+    # Benchmark settings; --test-batch-size, --gamma, --no-cuda and --save-model are parsed but unused.
+    parser = argparse.ArgumentParser(description="OpenMoE FSDP benchmark")
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=1,
+        metavar="N",
+        help="input batch size for training (default: 1)",
+    )
+    parser.add_argument(
+        "--test-batch-size",
+        type=int,
+        default=1000,
+        metavar="N",
+        help="input batch size for testing (default: 1000)",
+    )
+    parser.add_argument(
+        "--epochs",
+        type=int,
+        default=10,
+        metavar="N",
+        help="number of epochs to train (default: 10)",
+    )
+    parser.add_argument(
+        "--lr",
+        type=float,
+        default=1.0,
+        metavar="LR",
+        help="learning rate (default: 1.0)",
+    )
+    parser.add_argument(
+        "--gamma",
+        type=float,
+        default=0.7,
+        metavar="M",
+        help="Learning rate step gamma (default: 0.7)",
+    )
+    parser.add_argument("--no-cuda", action="store_true", default=False, help="disables CUDA training")
+    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
+    parser.add_argument(
+        "--save-model",
+        action="store_true",
+        default=False,
+        help="For Saving the current Model",
+    )
+    args = parser.parse_args()
+
+    torch.manual_seed(args.seed)
+
+    WORLD_SIZE = torch.cuda.device_count()  # one worker process is spawned per visible GPU
+    mp.spawn(fsdp_main, args=(WORLD_SIZE, args), nprocs=WORLD_SIZE, join=True)
diff --git a/examples/language/openmoe/train1.py b/examples/language/openmoe/train1.py
deleted file mode 100644
index f1b1327b5a54..000000000000
--- a/examples/language/openmoe/train1.py
+++ /dev/null
@@ -1,252 +0,0 @@
-import argparse
-import os
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
-from model.modeling_openmoe import OpenMoeForCausalLM
-from torchvision import datasets, transforms
-
-os.environ["TRANSFORMERS_OFFLINE"] = "1"
-# import datasets
-import torch
-import torch.distributed as dist
-import torch.multiprocessing as mp
-import transformers
-from huggingface_hub import snapshot_download
-from model.modeling_openmoe import LlamaConfig, OpenMoeForCausalLM
-from model.openmoe_policy import OpenMoeForCausalLMPolicy
-from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from torch.distributed.fsdp.fully_sharded_data_parallel import BackwardPrefetch, CPUOffload
-from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
-from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision
-from torch.distributed.fsdp.wrap import enable_wrap, wrap
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.optim.lr_scheduler import StepLR
-from torch.utils.data import Dataset
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm
-from transformers import Adafactor, T5Tokenizer
-from transformers.models.llama import LlamaConfig
-
-import colossalai
-from colossalai import get_default_parser
-from colossalai.booster import Booster
-from colossalai.booster.plugin import LowLevelZeroPlugin, TorchFSDPPlugin
-from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
-from colossalai.cluster import DistCoordinator
-from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.moe import MoeCheckpintIO
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import skip_init
-from colossalai.utils import get_current_device
-
-
-def setup(rank, world_size):
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = "14523"
-
-    # initialize the process group
-    dist.init_process_group("nccl", rank=rank, world_size=world_size)
-
-
-def cleanup():
-    dist.destroy_process_group()
-
-
-class Net(nn.Module):
-
-    def __init__(self):
-        super(Net, self).__init__()
-        self.conv1 = nn.Conv2d(1, 32, 3, 1)
-        self.conv2 = nn.Conv2d(32, 64, 3, 1)
-        self.dropout1 = nn.Dropout(0.25)
-        self.dropout2 = nn.Dropout(0.5)
-        self.fc1 = nn.Linear(9216, 128)
-        self.fc2 = nn.Linear(128, 10)
-
-    def forward(self, x):
-        x = self.conv1(x)
-        x = F.relu(x)
-        x = self.conv2(x)
-        x = F.relu(x)
-        x = F.max_pool2d(x, 2)
-        x = self.dropout1(x)
-        x = torch.flatten(x, 1)
-        x = self.fc1(x)
-        x = F.relu(x)
-        x = self.dropout2(x)
-        x = self.fc2(x)
-        output = F.log_softmax(x, dim=1)
-        return output
-
-
-def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None):
-    MOE_MANAGER.setup(seed=42, parallel=None, use_kernel_optim=False)
-    model.train()
-    ddp_loss = torch.zeros(2).to(rank)
-    if sampler:
-        sampler.set_epoch(epoch)
-    for batch_idx, (data, target) in enumerate(train_loader):
-        input_ids = torch.randint(0, 40, (4, 16), device=get_current_device()).to(rank)
-        labels = input_ids.to(rank)
-        optimizer.zero_grad()
-        output = model(input_ids=input_ids, labels=labels)
-        loss = output["loss"]
-        loss.backward()
-        optimizer.step()
-        ddp_loss[0] += loss.item()
-        ddp_loss[1] += len(data)
-        print("!1111")
-
-    dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM)
-    if rank == 0:
-        print("Train Epoch: {} \tLoss: {:.6f}".format(epoch, ddp_loss[0] / ddp_loss[1]))
-
-
-def test(model, rank, world_size, test_loader):
-    model.eval()
-    correct = 0
-    ddp_loss = torch.zeros(3).to(rank)
-    with torch.no_grad():
-        for data, target in test_loader:
-            data, target = data.to(rank), target.to(rank)
-            output = model(data)
-            ddp_loss[0] += F.nll_loss(output, target, reduction="sum").item()    # sum up batch loss
-            pred = output.argmax(dim=1, keepdim=True)    # get the index of the max log-probability
-            ddp_loss[1] += pred.eq(target.view_as(pred)).sum().item()
-            ddp_loss[2] += len(data)
-
-    dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM)
-
-    if rank == 0:
-        test_loss = ddp_loss[0] / ddp_loss[2]
-        print("Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n".format(
-            test_loss,
-            int(ddp_loss[1]),
-            int(ddp_loss[2]),
-            100.0 * ddp_loss[1] / ddp_loss[2],
-        ))
-
-
-def fsdp_main(rank, world_size, args):
-    # 每个进程都要setup一下
-    setup(rank, world_size)
-
-    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
-
-    dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform)
-    dataset2 = datasets.MNIST("../data", train=False, transform=transform)
-
-    sampler1 = DistributedSampler(dataset1, rank=rank, num_replicas=world_size, shuffle=True)
-    sampler2 = DistributedSampler(dataset2, rank=rank, num_replicas=world_size)
-
-    train_kwargs = {"batch_size": args.batch_size, "sampler": sampler1}
-    test_kwargs = {"batch_size": args.test_batch_size, "sampler": sampler2}
-    cuda_kwargs = {"num_workers": 2, "pin_memory": True, "shuffle": False}
-    train_kwargs.update(cuda_kwargs)
-    test_kwargs.update(cuda_kwargs)
-
-    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
-    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
-    torch.cuda.set_device(rank)
-
-    init_start_event = torch.cuda.Event(enable_timing=True)
-    init_end_event = torch.cuda.Event(enable_timing=True)
-
-    # model = Net().to(rank)
-    config = LlamaConfig.from_pretrained("hpcaitech/openmoe-base")
-    setattr(config, "router_aux_loss_factor", 0.1)
-    setattr(config, "router_z_loss_factor", 0.1)
-    setattr(config, "label_smoothing", 0.1)
-    setattr(config, "z_loss_factor", 0.1)
-    model = OpenMoeForCausalLM(config).to(rank)
-    # 使用FSDP将model warp起来
-    model = FSDP(
-        model,
-        mixed_precision=MixedPrecision(
-            param_dtype=torch.float16,
-            reduce_dtype=torch.float16,
-            buffer_dtype=torch.float16,
-        ),
-    )
-
-    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
-
-    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
-    init_start_event.record()
-    for epoch in range(1, args.epochs + 1):
-        train(
-            args,
-            model,
-            rank,
-            world_size,
-            train_loader,
-            optimizer,
-            epoch,
-            sampler=sampler1,
-        )
-        scheduler.step()
-
-    init_end_event.record()
-
-    if rank == 0:
-        print(f"CUDA event elapsed time: {init_start_event.elapsed_time(init_end_event) / 1000}sec")
-        print(f"{model}")
-
-    cleanup()
-
-
-if __name__ == "__main__":
-    # Training settings
-    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        default=64,
-        metavar="N",
-        help="input batch size for training (default: 64)",
-    )
-    parser.add_argument(
-        "--test-batch-size",
-        type=int,
-        default=1000,
-        metavar="N",
-        help="input batch size for testing (default: 1000)",
-    )
-    parser.add_argument(
-        "--epochs",
-        type=int,
-        default=10,
-        metavar="N",
-        help="number of epochs to train (default: 14)",
-    )
-    parser.add_argument(
-        "--lr",
-        type=float,
-        default=1.0,
-        metavar="LR",
-        help="learning rate (default: 1.0)",
-    )
-    parser.add_argument(
-        "--gamma",
-        type=float,
-        default=0.7,
-        metavar="M",
-        help="Learning rate step gamma (default: 0.7)",
-    )
-    parser.add_argument("--no-cuda", action="store_true", default=False, help="disables CUDA training")
-    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
-    parser.add_argument(
-        "--save-model",
-        action="store_true",
-        default=False,
-        help="For Saving the current Model",
-    )
-    args = parser.parse_args()
-
-    torch.manual_seed(args.seed)
-
-    WORLD_SIZE = torch.cuda.device_count()
-    mp.spawn(fsdp_main, args=(WORLD_SIZE, args), nprocs=WORLD_SIZE, join=True)

From 1dd685a6be85f998e6fd17b9d144ce505136cc50 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 21 Sep 2023 14:44:26 +0800
Subject: [PATCH 19/23] fsdp benchmark

---
 .../openmoe/benchmark/benchmark_fsdp.py       | 157 +++++++-----------
 1 file changed, 59 insertions(+), 98 deletions(-)

diff --git a/examples/language/openmoe/benchmark/benchmark_fsdp.py b/examples/language/openmoe/benchmark/benchmark_fsdp.py
index 0e9db4797d22..acbc9435731c 100644
--- a/examples/language/openmoe/benchmark/benchmark_fsdp.py
+++ b/examples/language/openmoe/benchmark/benchmark_fsdp.py
@@ -4,23 +4,18 @@
 import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
-import torch.optim as optim
+import tqdm
 from model.modeling_openmoe import LlamaConfig, OpenMoeForCausalLM
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision
 from torch.utils.data import Dataset
 from torch.utils.data.distributed import DistributedSampler
+from transformers import Adafactor
 from transformers.models.llama import LlamaConfig
 
 from colossalai.moe.manager import MOE_MANAGER
 
-
-def setup(rank, world_size):
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = "14523"
-
-    # initialize the process group
-    dist.init_process_group("nccl", rank=rank, world_size=world_size)
+from .utils import PerformanceEvaluator, get_model_numel
 
 
 class RandomDataset(Dataset):
@@ -42,45 +37,21 @@ def __getitem__(self, idx):
         }
 
 
-def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None):
-    model.train()
-
-    for idx, data in enumerate(train_loader):
-
-        input_ids, attention_mask, labels = (
-            data["input_ids"].cuda(),
-            data["attention_mask"].cuda(),
-            data["labels"].cuda(),
-        )
-
-        optimizer.zero_grad()
-        output = model(
-            input_ids=input_ids,
-            labels=labels,
-            attention_mask=attention_mask,
-            chunk_head=False,
-        )
-        loss = output["loss"]
-        loss.backward()
-        optimizer.step()
-
-
 def fsdp_main(rank, world_size, args):
-    # 每个进程都要setup一下
-    setup(rank, world_size)
-    MOE_MANAGER.setup(seed=42, parallel=None, use_kernel_optim=False)
-
-    dataset1 = RandomDataset()
-    sampler1 = DistributedSampler(dataset1, rank=rank, num_replicas=world_size, shuffle=True)
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "14523"
+    # initialize the process group
+    dist.init_process_group("nccl", rank=rank, world_size=world_size)
 
-    train_kwargs = {"batch_size": args.batch_size, "sampler": sampler1}
-    cuda_kwargs = {"num_workers": 2, "shuffle": False}
-    train_kwargs.update(cuda_kwargs)
+    MOE_MANAGER.setup(seed=42, parallel=None, use_kernel_optim=False)
 
-    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
+    dataset = RandomDataset(max_length=args.seq_length)
+    sampler = DistributedSampler(dataset, rank=rank, num_replicas=world_size, shuffle=False)
+    train_kwargs = {"batch_size": args.batch_size, "sampler": sampler}
+    train_loader = torch.utils.data.DataLoader(dataset, **train_kwargs)
     torch.cuda.set_device(rank)
 
-    config = LlamaConfig.from_pretrained("hpcaitech/openmoe-base")
+    config = LlamaConfig.from_pretrained("hpcaitech/openmoe-%s" % args.model_name)
     setattr(config, "router_aux_loss_factor", 0.1)
     setattr(config, "router_z_loss_factor", 0.1)
     setattr(config, "label_smoothing", 0.1)
@@ -95,71 +66,61 @@ def fsdp_main(rank, world_size, args):
             buffer_dtype=torch.float16,
         ),
     )
+    optimizer = Adafactor(model.parameters())
+    model.train()
+
+    model_numel = get_model_numel(model)
+    performance_evaluator = PerformanceEvaluator(
+        model_numel,
+        enable_grad_checkpoint=True,
+        ignore_steps=args.warm_up,
+        dp_world_size=dist.get_world_size(),
+    )
+
+    for step, data in tqdm.tqdm(enumerate(train_loader), total=args.warm_up + args.active):
+        if step == args.warm_up + args.active:
+            break
+
+        performance_evaluator.on_step_start(step)
+        input_ids, attention_mask, labels = (
+            data["input_ids"].cuda(),
+            data["attention_mask"].cuda(),
+            data["labels"].cuda(),
+        )
 
-    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
-
-    for epoch in range(1, args.epochs + 1):
-        train(
-            args,
-            model,
-            rank,
-            world_size,
-            train_loader,
-            optimizer,
-            epoch,
-            sampler=sampler1,
+        optimizer.zero_grad()
+        output = model(
+            input_ids=input_ids,
+            labels=labels,
+            attention_mask=attention_mask,
+            chunk_head=False,
         )
+        loss = output["loss"]
+        loss.backward()
+        optimizer.step()
+        performance_evaluator.on_step_end(input_ids)
+
+    performance_evaluator.on_fit_end()
+    if dist.get_rank() == 0:
+        print(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
 
 
 if __name__ == "__main__":
-    # Training settings
-    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        default=1,
-        metavar="N",
-        help="input batch size for training (default: 64)",
-    )
-    parser.add_argument(
-        "--test-batch-size",
-        type=int,
-        default=1000,
-        metavar="N",
-        help="input batch size for testing (default: 1000)",
-    )
-    parser.add_argument(
-        "--epochs",
-        type=int,
-        default=10,
-        metavar="N",
-        help="number of epochs to train (default: 14)",
-    )
-    parser.add_argument(
-        "--lr",
-        type=float,
-        default=1.0,
-        metavar="LR",
-        help="learning rate (default: 1.0)",
-    )
-    parser.add_argument(
-        "--gamma",
-        type=float,
-        default=0.7,
-        metavar="M",
-        help="Learning rate step gamma (default: 0.7)",
-    )
-    parser.add_argument("--no-cuda", action="store_true", default=False, help="disables CUDA training")
-    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
+    parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--save-model",
-        action="store_true",
-        default=False,
-        help="For Saving the current Model",
+        "--model_name",
+        type=str,
+        default="base",
+        choices=["base", "8b"],
+        help="base or 8b",
     )
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--seq_length", type=int, default=2048)
+    parser.add_argument("--warm_up", type=int, default=20)
+    parser.add_argument("--active", type=int, default=20)
     args = parser.parse_args()
 
-    torch.manual_seed(args.seed)
+    torch.manual_seed(42)
 
     WORLD_SIZE = torch.cuda.device_count()
     mp.spawn(fsdp_main, args=(WORLD_SIZE, args), nprocs=WORLD_SIZE, join=True)

From 4146eedd4bdadef963a317859aa38ff254500fbc Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 21 Sep 2023 14:48:33 +0800
Subject: [PATCH 20/23] rename

---
 .../{benchmark_train.py => benchmark_cai.py}  | 49 +++++++------------
 1 file changed, 19 insertions(+), 30 deletions(-)
 rename examples/language/openmoe/benchmark/{benchmark_train.py => benchmark_cai.py} (81%)

diff --git a/examples/language/openmoe/benchmark/benchmark_train.py b/examples/language/openmoe/benchmark/benchmark_cai.py
similarity index 81%
rename from examples/language/openmoe/benchmark/benchmark_train.py
rename to examples/language/openmoe/benchmark/benchmark_cai.py
index 373516c56f84..429217dc56ce 100644
--- a/examples/language/openmoe/benchmark/benchmark_train.py
+++ b/examples/language/openmoe/benchmark/benchmark_cai.py
@@ -1,7 +1,14 @@
-import colossalai
 import datasets
 import torch
 import transformers
+from model.modeling_openmoe import OpenMoeForCausalLM
+from torch.utils.data import Dataset
+from tqdm import tqdm
+from transformers import Adafactor
+from transformers.models.llama import LlamaConfig
+from utils import SimpleTimer, print_model_numel
+
+import colossalai
 from colossalai import get_default_parser
 from colossalai.booster import Booster
 from colossalai.booster.plugin import LowLevelZeroPlugin
@@ -10,27 +17,15 @@
 from colossalai.moe.manager import MOE_MANAGER
 from colossalai.moe.utils import skip_init
 from colossalai.utils import get_current_device
-from model.modeling_openmoe import OpenMoeForCausalLM
-from torch.utils.data import Dataset
-from tqdm import tqdm
-from transformers import Adafactor
-from transformers.models.llama import LlamaConfig
-from utils import SimpleTimer, print_model_numel
 
 
 class RandomDataset(Dataset):
 
-    def __init__(self,
-                 num_samples: int = 1000,
-                 max_length: int = 2048,
-                 vocab_size: int = 32000):
+    def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000):
         self.num_samples = num_samples
         self.max_length = max_length
-        self.input_ids = torch.randint(0, vocab_size,
-                                       (num_samples, max_length),
-                                       device=get_current_device())
-        self.attention_mask = torch.ones_like(self.input_ids,
-                                              device=get_current_device())
+        self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
+        self.attention_mask = torch.ones_like(self.input_ids, device=get_current_device())
 
     def __len__(self):
         return self.num_samples
@@ -49,7 +44,10 @@ def parse_args():
     # parser.add_argument("--model_name", type=str, default="base", choices=["base", "8b"],
     #                     help="Path to pretrained model or model identifier from huggingface.co/models.")
     parser.add_argument("--num_epoch", type=int, default=1, help="Number of epochs.")
-    parser.add_argument("--batch_size", type=int, default=4, help="Batch size (per dp group) for the training dataloader.")
+    parser.add_argument("--batch_size",
+                        type=int,
+                        default=4,
+                        help="Batch size (per dp group) for the training dataloader.")
     parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.")
     parser.add_argument("--num_samples", type=int, default=1000, help="Number of samples in the dataset.")
 
@@ -61,9 +59,7 @@ def main():
     args = parse_args()
 
     MDOEL_CONFIG = {
-        "architectures": [
-            "OpenMoeForCausalLM"
-        ],
+        "architectures": ["OpenMoeForCausalLM"],
         "capacity_factor_eval": 2.0,
         "capacity_factor_train": 1.25,
         "drop_tks": True,
@@ -140,10 +136,7 @@ def main():
 
     # Prepare tokenizer and dataloader
     dataset = RandomDataset(num_samples=args.num_samples)
-    dataloader = plugin.prepare_dataloader(dataset,
-                                           batch_size=args.batch_size,
-                                           shuffle=True,
-                                           drop_last=True)
+    dataloader = plugin.prepare_dataloader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True)
 
     # Set optimizer
     optimizer = Adafactor(model.parameters(),
@@ -152,9 +145,7 @@ def main():
 
     # Set booster
     booster = Booster(plugin=plugin)
-    model, optimizer, _, dataloader, _ = booster.boost(model=model,
-                                                       optimizer=optimizer,
-                                                       dataloader=dataloader)
+    model, optimizer, _, dataloader, _ = booster.boost(model=model, optimizer=optimizer, dataloader=dataloader)
 
     # Start benchmark
     model.train()
@@ -162,9 +153,7 @@ def main():
 
     timer = SimpleTimer()
     for epoch in range(args.num_epoch):
-        for batch in tqdm(dataloader,
-                          desc=f'Epoch [{epoch + 1}]',
-                          disable=not coordinator.is_master()):
+        for batch in tqdm(dataloader, desc=f'Epoch [{epoch + 1}]', disable=not coordinator.is_master()):
             timer.start("train_step")
 
             # Forward

From 4093e1d0c24121ebca4f87f663d7dfe3645abab7 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 21 Sep 2023 15:23:34 +0800
Subject: [PATCH 21/23] update fsdp bench

---
 .../openmoe/benchmark/benchmark_fsdp.py       | 18 ++++++-------
 .../openmoe/benchmark/benchmark_fsdp.sh       | 25 +++++++++++++++++++
 2 files changed, 33 insertions(+), 10 deletions(-)
 create mode 100755 examples/language/openmoe/benchmark/benchmark_fsdp.sh

diff --git a/examples/language/openmoe/benchmark/benchmark_fsdp.py b/examples/language/openmoe/benchmark/benchmark_fsdp.py
index acbc9435731c..cb231687ef39 100644
--- a/examples/language/openmoe/benchmark/benchmark_fsdp.py
+++ b/examples/language/openmoe/benchmark/benchmark_fsdp.py
@@ -12,11 +12,10 @@
 from torch.utils.data.distributed import DistributedSampler
 from transformers import Adafactor
 from transformers.models.llama import LlamaConfig
+from utils import PerformanceEvaluator, get_model_numel
 
 from colossalai.moe.manager import MOE_MANAGER
 
-from .utils import PerformanceEvaluator, get_model_numel
-
 
 class RandomDataset(Dataset):
 
@@ -39,13 +38,15 @@ def __getitem__(self, idx):
 
 def fsdp_main(rank, world_size, args):
     os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = "14523"
+    os.environ["MASTER_PORT"] = "14501"
     # initialize the process group
     dist.init_process_group("nccl", rank=rank, world_size=world_size)
 
     MOE_MANAGER.setup(seed=42, parallel=None, use_kernel_optim=False)
 
-    dataset = RandomDataset(max_length=args.seq_length)
+    dp_size = dist.get_world_size()
+    dataset = RandomDataset(max_length=args.seq_length,
+                            num_samples=args.batch_size * (args.warmup + args.active) * dp_size)
     sampler = DistributedSampler(dataset, rank=rank, num_replicas=world_size, shuffle=False)
     train_kwargs = {"batch_size": args.batch_size, "sampler": sampler}
     train_loader = torch.utils.data.DataLoader(dataset, **train_kwargs)
@@ -73,14 +74,11 @@ def fsdp_main(rank, world_size, args):
     performance_evaluator = PerformanceEvaluator(
         model_numel,
         enable_grad_checkpoint=True,
-        ignore_steps=args.warm_up,
+        ignore_steps=args.warmup,
         dp_world_size=dist.get_world_size(),
     )
 
-    for step, data in tqdm.tqdm(enumerate(train_loader), total=args.warm_up + args.active):
-        if step == args.warm_up + args.active:
-            break
-
+    for step, data in tqdm.tqdm(enumerate(train_loader), total=len(train_loader)):
         performance_evaluator.on_step_start(step)
         input_ids, attention_mask, labels = (
             data["input_ids"].cuda(),
@@ -116,7 +114,7 @@ def fsdp_main(rank, world_size, args):
     )
     parser.add_argument("--batch_size", type=int, default=1)
     parser.add_argument("--seq_length", type=int, default=2048)
-    parser.add_argument("--warm_up", type=int, default=20)
+    parser.add_argument("--warmup", type=int, default=20)
     parser.add_argument("--active", type=int, default=20)
     args = parser.parse_args()
 
diff --git a/examples/language/openmoe/benchmark/benchmark_fsdp.sh b/examples/language/openmoe/benchmark/benchmark_fsdp.sh
new file mode 100755
index 000000000000..a4cb32019431
--- /dev/null
+++ b/examples/language/openmoe/benchmark/benchmark_fsdp.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+set -xue  # trace commands, fail on unset variables, abort on the first failing command
+
+NUM_GPU=4  # NOTE(review): not referenced anywhere below in this script
+MODEL="base"
+BATCH_SIZE=1
+SEQ_LENGTH=2048
+WARMUP=10
+ACTIVE=10
+
+# HACK: make model importable by putting the parent of this script's directory on PYTHONPATH
+example_dir=$(dirname "$(realpath "$(dirname "$0")")")
+if [ -z "${PYTHONPATH+x}" ]; then
+    export PYTHONPATH="$example_dir"
+else
+    export PYTHONPATH="$example_dir:$PYTHONPATH"
+fi
+
+python "$example_dir/benchmark/benchmark_fsdp.py" \
+    --model_name "$MODEL" \
+    --batch_size "$BATCH_SIZE" \
+    --seq_length "$SEQ_LENGTH" \
+    --warmup "$WARMUP" \
+    --active "$ACTIVE"

From 8b7724d6cc81ccf18fcfc137348872e2900746ca Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 21 Sep 2023 15:23:59 +0800
Subject: [PATCH 22/23] fix plugin

---
 colossalai/booster/plugin/moe_hybrid_parallel_plugin.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
index fab6c2f0cb7b..1f3bb294a7ca 100644
--- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -103,7 +103,10 @@ def __init__(self,
                  overlap_communication: bool = True,
                  custom_policy: Policy = None) -> None:
 
-        super().__init__()
+        super().__init__(tp_size=tp_size,
+                         pp_size=pp_size,
+                         num_microbatches=num_microbatches,
+                         microbatch_size=microbatch_size)
         assert dist.get_world_size() % (
             tp_size * pp_size
         ) == 0, f'world size {dist.get_world_size()} is not divisible by tp_size {tp_size} * pp_size {pp_size}'

From 3a608b21f75c7f51a28378f3ea180a6522745489 Mon Sep 17 00:00:00 2001
From: oahzxl <xuanlei.zhao@gmail.com>
Date: Thu, 21 Sep 2023 15:49:34 +0800
Subject: [PATCH 23/23] update benchmark

---
 .../openmoe/benchmark/benchmark_cai.py        | 261 +++++++++++-------
 .../openmoe/benchmark/benchmark_cai.sh        |  56 ++++
 .../openmoe/benchmark/benchmark_train.sh      |  34 ---
 examples/language/openmoe/benchmark/utils.py  | 151 +++++++---
 4 files changed, 318 insertions(+), 184 deletions(-)
 create mode 100755 examples/language/openmoe/benchmark/benchmark_cai.sh
 delete mode 100755 examples/language/openmoe/benchmark/benchmark_train.sh

diff --git a/examples/language/openmoe/benchmark/benchmark_cai.py b/examples/language/openmoe/benchmark/benchmark_cai.py
index 429217dc56ce..7f36f8a88925 100644
--- a/examples/language/openmoe/benchmark/benchmark_cai.py
+++ b/examples/language/openmoe/benchmark/benchmark_cai.py
@@ -1,56 +1,94 @@
 import datasets
 import torch
+import torch.distributed as dist
 import transformers
 from model.modeling_openmoe import OpenMoeForCausalLM
+from model.openmoe_policy import OpenMoeForCausalLMPolicy
 from torch.utils.data import Dataset
 from tqdm import tqdm
 from transformers import Adafactor
 from transformers.models.llama import LlamaConfig
-from utils import SimpleTimer, print_model_numel
+from utils import PerformanceEvaluator, get_model_numel
 
 import colossalai
 from colossalai import get_default_parser
 from colossalai.booster import Booster
 from colossalai.booster.plugin import LowLevelZeroPlugin
+from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import skip_init
 from colossalai.utils import get_current_device
 
 
+def move_to_cuda(batch, device):
+    return {k: v.to(device) for k, v in batch.items()}
+
+
 class RandomDataset(Dataset):
 
     def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000):
         self.num_samples = num_samples
         self.max_length = max_length
         self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
-        self.attention_mask = torch.ones_like(self.input_ids, device=get_current_device())
+        self.attention_mask = torch.ones_like(self.input_ids)
 
     def __len__(self):
         return self.num_samples
 
     def __getitem__(self, idx):
         return {
-            'input_ids': self.input_ids[idx],
-            'attention_mask': self.attention_mask[idx],
-            'labels': self.input_ids[idx]
+            "input_ids": self.input_ids[idx],
+            "attention_mask": self.attention_mask[idx],
+            "labels": self.input_ids[idx],
         }
 
 
 def parse_args():
+    # basic settings: model size, per-step batch shape, seed and parallel plugin
     parser = get_default_parser()
-    # TODO: add model_name
-    # parser.add_argument("--model_name", type=str, default="base", choices=["base", "8b"],
-    #                     help="Path to pretrained model or model identifier from huggingface.co/models.")
-    parser.add_argument("--num_epoch", type=int, default=1, help="Number of epochs.")
-    parser.add_argument("--batch_size",
-                        type=int,
-                        default=4,
-                        help="Batch size (per dp group) for the training dataloader.")
+    parser.add_argument(
+        "--model_name",
+        type=str,
+        default="base",
+        choices=["base", "8b"],
+        help="OpenMoE model size; the config is loaded from hpcaitech/openmoe-<model_name>.",
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=4,
+        help="Batch size (per dp group) for the training dataloader.",
+    )
+    parser.add_argument(
+        "--seq_length",
+        type=int,
+        default=2048,
+        help="sequence length for the training dataloader.",
+    )
     parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.")
-    parser.add_argument("--num_samples", type=int, default=1000, help="Number of samples in the dataset.")
-
+    parser.add_argument(
+        "--plugin",
+        type=str,
+        default="hybrid",
+        help="parallel plugin",
+        choices=["zero1", "zero2", "hybrid"],
+    )
+    # hybrid plugin: these sizes are only read when --plugin hybrid is selected
+    parser.add_argument("--pp_size", type=int, default=2, help="pp size")
+    parser.add_argument("--dp_size", type=int, default=1, help="dp size")
+    parser.add_argument("--ep_size", type=int, default=2, help="ep size")
+    parser.add_argument("--zero_stage", type=int, default=1, help="zero stage in hybrid plugin")
+    parser.add_argument("--microbatch_size", type=int, default=1, help="microbatch size")
+    # kernel
+    parser.add_argument(
+        "--use_kernel",
+        action="store_true",
+        help="Use kernel optim. Need to install flash attention, apex, triton to enable all kernel optimizations.",
+    )
+    # bench: warmup + active (+1 example batch) determines how many batches the random dataset holds
+    parser.add_argument("--warmup", type=int, default=20, help="Leading steps passed to the evaluator as ignore_steps.")
+    parser.add_argument("--active", type=int, default=20, help="Additional steps to run after the warmup steps.")
     args = parser.parse_args()
     return args
 
@@ -58,53 +96,10 @@ def parse_args():
 def main():
     args = parse_args()
 
-    MDOEL_CONFIG = {
-        "architectures": ["OpenMoeForCausalLM"],
-        "capacity_factor_eval": 2.0,
-        "capacity_factor_train": 1.25,
-        "drop_tks": True,
-        "dropout_rate": 0.0,
-        "expert_parallel": None,
-        "gated": True,
-        "head_dim": 64,
-        "hidden_act": "swiglu",
-        "hidden_size": 768,
-        "intermediate_size": 2048,
-        "label_smoothing": 0.0,
-        "layer_norm_epsilon": 1e-06,
-        "min_capacity": 4,
-        "moe_layer_interval": 4,
-        "noisy_policy": None,
-        "num_attention_heads": 12,
-        "num_experts": 16,
-        "num_hidden_layers": 12,
-        "num_key_value_heads": 12,
-        "pretraining_tp": 1,
-        "rope_scaling": None,
-        "router_aux_loss_factor": 0.01,
-        "router_z_loss_factor": 0.0001,
-        "topk": 2,
-        "torch_dtype": "float32",
-        "vocab_size": 256384,
-        "z_loss_factor": 0.0001
-    }
-    OPTIM_CONFIG = {
-        "decay_rate": -0.8,
-        "weight_decay": 0.01,
-    }
-
-    # update config from args
-    for k in MDOEL_CONFIG:
-        if hasattr(args, k):
-            MDOEL_CONFIG[k] = getattr(args, k)
-
     # Launch ColossalAI
     colossalai.launch_from_torch(config={}, seed=args.seed)
     coordinator = DistCoordinator()
 
-    # Set up moe
-    MOE_MANAGER.setup(seed=42, parallel="EP")
-
     # Manage loggers
     disable_existing_loggers()
     logger = get_dist_logger()
@@ -115,70 +110,122 @@ def main():
         datasets.utils.logging.set_verbosity_error()
         transformers.utils.logging.set_verbosity_error()
 
-    # Build OpenMoe model
-    config = LlamaConfig()
-    for k, v in MDOEL_CONFIG.items():
-        setattr(config, k, v)
-
-    with skip_init():
-        model = OpenMoeForCausalLM(config)
+    # Set plugin
+    booster_kwargs = {}
+    if args.plugin == "zero1":
+        dp_size = dist.get_world_size()
+        plugin = LowLevelZeroPlugin(initial_scale=2**5, stage=1)
+        MOE_MANAGER.setup(
+            seed=42,
+            parallel="EP",
+            use_kernel_optim=args.use_kernel,
+        )
+    elif args.plugin == "zero2":
+        dp_size = dist.get_world_size()
+        plugin = LowLevelZeroPlugin(initial_scale=2**5, stage=2)
+        MOE_MANAGER.setup(
+            seed=42,
+            parallel="EP",
+            use_kernel_optim=args.use_kernel,
+        )
+    elif args.plugin == "hybrid":
+        dp_size = dist.get_world_size() // args.pp_size
+        plugin = MoeHybridParallelPlugin(
+            tp_size=1,
+            pp_size=args.pp_size,
+            zero_stage=args.zero_stage,
+            microbatch_size=args.microbatch_size,
+            custom_policy=OpenMoeForCausalLMPolicy(),
+            enable_fused_normalization=args.use_kernel,
+            enable_jit_fused=args.use_kernel,
+        )
+        MOE_MANAGER.setup(
+            seed=42,
+            parallel="EP",
+            mode="fixed",
+            fixed_dp_size=args.dp_size,
+            fixed_ep_size=args.ep_size,
+            fixed_pp_size=args.pp_size,
+            use_kernel_optim=args.use_kernel,
+        )
+    else:
+        raise ValueError(f"Invalid plugin {args.plugin}")
+    logger.info(f"Set plugin as {plugin}", ranks=[0])
 
+    # Build OpenMoe model
+    repo_name = "hpcaitech/openmoe-" + args.model_name
+    config = LlamaConfig.from_pretrained(repo_name)
+    setattr(config, "router_aux_loss_factor", 0.1)
+    setattr(config, "router_z_loss_factor", 0.1)
+    setattr(config, "label_smoothing", 0.1)
+    setattr(config, "z_loss_factor", 0.1)
+    model = OpenMoeForCausalLM(config)
     logger.info(f"Finish init model with config:\n{config}", ranks=[0])
-    model_param = sum(p.numel() for p in model.parameters() if p.requires_grad)
-    logger.info(f"Model param count: {model_param/1e6:.2f}M", ranks=[0])
 
     # Enable gradient checkpointing
     model.gradient_checkpointing_enable()
 
-    # Set plugin
-    plugin = LowLevelZeroPlugin(initial_scale=2**5, stage=2)
-    logger.info(f"Set plugin as {plugin}", ranks=[0])
-
     # Prepare tokenizer and dataloader
-    dataset = RandomDataset(num_samples=args.num_samples)
-    dataloader = plugin.prepare_dataloader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True)
+    dataset = RandomDataset(
+        num_samples=args.batch_size * (args.warmup + args.active + 1) * dp_size,
+        max_length=args.seq_length,
+    )
+    dataloader = plugin.prepare_dataloader(dataset, batch_size=args.batch_size)
 
     # Set optimizer
-    optimizer = Adafactor(model.parameters(),
-                          decay_rate=OPTIM_CONFIG["decay_rate"],
-                          weight_decay=OPTIM_CONFIG["weight_decay"])
+    optimizer = Adafactor(model.parameters(), weight_decay=0.01)
+
+    model_numel = get_model_numel(model)
+    performance_evaluator = PerformanceEvaluator(
+        model_numel,
+        enable_grad_checkpoint=True,
+        ignore_steps=args.warmup,
+        dp_world_size=dp_size,
+    )
 
     # Set booster
-    booster = Booster(plugin=plugin)
+    booster = Booster(plugin=plugin, **booster_kwargs)
     model, optimizer, _, dataloader, _ = booster.boost(model=model, optimizer=optimizer, dataloader=dataloader)
+    use_pipeline = (isinstance(booster.plugin, MoeHybridParallelPlugin) and booster.plugin.pp_size > 1)
+    is_pp_last_stage = use_pipeline and booster.plugin.stage_manager.is_last_stage()
+    logger.info(f"Finish init booster", ranks=[0])
 
-    # Start benchmark
+    # Start finetuning
+    logger.info(f"Start finetuning", ranks=[0])
     model.train()
-    logger.info(f"Start benchmark", ranks=[0])
-
-    timer = SimpleTimer()
-    for epoch in range(args.num_epoch):
-        for batch in tqdm(dataloader, desc=f'Epoch [{epoch + 1}]', disable=not coordinator.is_master()):
-            timer.start("train_step")
-
-            # Forward
-            timer.start("forward")
-            outputs = model(use_cache=False, chunk_head=True, **batch)
-            loss = outputs['loss']
-            torch.cuda.synchronize()
-            timer.stop("forward")
-
-            # Backward
-            timer.start("backward")
-            booster.backward(loss, optimizer)
-            torch.cuda.synchronize()
-            timer.stop("backward")
-
-            # Optimizer step
-            timer.start("optimizer_step")
+    train_dataloader_iter = iter(dataloader)
+    total_len = len(train_dataloader_iter) - 1  # one batch is held back as example_data below
+    example_data = next(train_dataloader_iter)  # only its input_ids are handed to on_step_end
+    with tqdm(range(total_len), disable=not coordinator.is_master()) as pbar:
+        for step in pbar:
+            performance_evaluator.on_step_start(step)
+            if use_pipeline:
+                # Forward/backward through the pipeline schedule, which pulls batches from the iterator itself
+                outputs = booster.execute_pipeline(
+                    train_dataloader_iter,
+                    model,
+                    lambda x, y: x.loss,
+                    optimizer,
+                    return_loss=True,
+                    return_outputs=True,
+                )
+                # Report the loss; it is only read on the last pipeline stage
+                if is_pp_last_stage:
+                    loss = outputs["loss"]
+                    pbar.set_postfix({"loss": loss.item()})
+            else:
+                # Forward pass: fetch the next batch here, nothing else assigns `data` in this loop
+                data = move_to_cuda(next(train_dataloader_iter), torch.cuda.current_device())
+                outputs = model(**data)
+                loss = outputs["loss"]
+                # Backward
+                booster.backward(loss, optimizer)
+                pbar.set_postfix({"loss": loss.item()})
+
             optimizer.step()
             optimizer.zero_grad()
-            torch.cuda.synchronize()
-            timer.stop("optimizer_step")
-
-            timer.stop("train_step")
-
-    logger.info(f"Benchmark result:\n{repr(timer)}", ranks=[0])
+            performance_evaluator.on_step_end(example_data["input_ids"])
+    performance_evaluator.on_fit_end()
 
 
 if __name__ == "__main__":
diff --git a/examples/language/openmoe/benchmark/benchmark_cai.sh b/examples/language/openmoe/benchmark/benchmark_cai.sh
new file mode 100755
index 000000000000..24d0c1b23ab2
--- /dev/null
+++ b/examples/language/openmoe/benchmark/benchmark_cai.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+set -xue
+
+NUM_GPU=4
+MODEL="base"
+BATCH_SIZE=1
+SEQ_LENGTH=2048
+WARMUP=10
+ACTIVE=10
+
+# HACK: make model importable by putting the parent of this script's directory on PYTHONPATH
+example_dir=$(dirname "$(realpath "$(dirname "$0")")")
+if [ -z "${PYTHONPATH+x}" ]; then
+    export PYTHONPATH="$example_dir"
+else
+    export PYTHONPATH="$example_dir:$PYTHONPATH"
+fi
+
+# hybrid plugin (pp_size=2, dp_size=1, ep_size=2, zero_stage=1)
+torchrun --standalone --nproc_per_node "$NUM_GPU" \
+    "$example_dir/benchmark/benchmark_cai.py" \
+    --model_name "$MODEL" \
+    --batch_size "$BATCH_SIZE" \
+    --seq_length "$SEQ_LENGTH" \
+    --warmup "$WARMUP" \
+    --active "$ACTIVE" \
+    --use_kernel \
+    --plugin hybrid \
+    --pp_size 2 \
+    --dp_size 1 \
+    --ep_size 2 \
+    --zero_stage 1 \
+    --microbatch_size 1
+
+# zero1 plugin, same model/batch/warmup settings as above
+torchrun --standalone --nproc_per_node "$NUM_GPU" \
+    "$example_dir/benchmark/benchmark_cai.py" \
+    --model_name "$MODEL" \
+    --batch_size "$BATCH_SIZE" \
+    --seq_length "$SEQ_LENGTH" \
+    --warmup "$WARMUP" \
+    --active "$ACTIVE" \
+    --plugin zero1 \
+    --use_kernel
+
+# zero2 plugin, same model/batch/warmup settings as above
+torchrun --standalone --nproc_per_node "$NUM_GPU" \
+    "$example_dir/benchmark/benchmark_cai.py" \
+    --model_name "$MODEL" \
+    --batch_size "$BATCH_SIZE" \
+    --seq_length "$SEQ_LENGTH" \
+    --warmup "$WARMUP" \
+    --active "$ACTIVE" \
+    --plugin zero2 \
+    --use_kernel
diff --git a/examples/language/openmoe/benchmark/benchmark_train.sh b/examples/language/openmoe/benchmark/benchmark_train.sh
deleted file mode 100755
index 0496a31a7479..000000000000
--- a/examples/language/openmoe/benchmark/benchmark_train.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-
-set -xue
-
-BENCHMARK_DIR=benchmark
-NUM_GPU=2
-
-set_n_least_used_CUDA_VISIBLE_DEVICES() {
-    local n=${1:-"9999"}
-    echo "GPU Memory Usage:"
-    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
-        tail -n +2 |
-        nl -v 0 |
-        tee /dev/tty |
-        sort -g -k 2 |
-        awk '{print $1}' |
-        head -n $n)
-    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
-    echo "Now CUDA_VISIBLE_DEVICES is set to:"
-    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-}
-
-set_n_least_used_CUDA_VISIBLE_DEVICES $NUM_GPU
-
-# HACK: make model importable
-example_dir=$(dirname $(realpath $(dirname $0)))
-if [ -z ${PYTHONPATH+x} ]; then
-    export PYTHONPATH=$example_dir
-else
-    export PYTHONPATH=$example_dir:$PYTHONPATH
-fi
-
-torchrun --standalone --nproc_per_node $NUM_GPU \
-    $example_dir/$BENCHMARK_DIR/benchmark_train.py
diff --git a/examples/language/openmoe/benchmark/utils.py b/examples/language/openmoe/benchmark/utils.py
index d2edee64451c..7a0955bb028a 100644
--- a/examples/language/openmoe/benchmark/utils.py
+++ b/examples/language/openmoe/benchmark/utils.py
@@ -1,61 +1,126 @@
-import dataclasses
-import time
-from typing import Dict
+from time import time
+from typing import Optional
 
+import torch
 import torch.distributed as dist
 import torch.nn as nn
+from torch import Tensor
+
 from colossalai.logging import DistributedLogger
 
 
-def print_model_numel(logger: DistributedLogger,
-                      model: nn.Module) -> None:
+def print_model_numel(logger: DistributedLogger, model: nn.Module) -> None:
     B = 1024**3
     M = 1024**2
     K = 1024
     outputs = "Model param count: "
     model_param = sum(p.numel() for p in model.parameters() if p.requires_grad)
     if model_param >= B:
-        outputs += f'{model_param / B:.2f} B\n'
+        outputs += f"{model_param / B:.2f} B\n"
     elif model_param >= M:
-        outputs += f'{model_param / M:.2f} M\n'
+        outputs += f"{model_param / M:.2f} M\n"
     elif model_param >= K:
-        outputs += f'{model_param / K:.2f} K\n'
+        outputs += f"{model_param / K:.2f} K\n"
     else:
-        outputs += f'{model_param}\n'
+        outputs += f"{model_param}\n"
     logger.info(outputs, ranks=[0])
 
 
-@dataclasses.dataclass
-class TimingItem():
-    last_time: float = 0.0
-    total_time: float = 0.0
-    count: float = 0
-
-    def __str__(self) -> str:
-        return f"average time: {self.total_time/self.count * 1000:.2f} ms"
-
-
-class SimpleTimer():
-    def __init__(self, warmup: int = 20) -> None:
-        self.timing_items: Dict[str, TimingItem] = {}
-        self.warmup = warmup
-
-    def start(self, name: str):
-        if name not in self.timing_items:
-            self.timing_items[name] = TimingItem()
-        self.timing_items[name].last_time = time.time()
-
-    def stop(self, name: str):
-        assert name in self.timing_items
-        timing_item = self.timing_items[name]
-        timing_item.total_time += time.time() - timing_item.last_time
-        timing_item.count += 1
-        if timing_item.count > self.warmup:
-            timing_item.count = 0
-            timing_item.total_time = 0.0
-
-    def __repr__(self) -> str:
-        result = "[Timer]:\n"
-        for name, timing_item in self.timing_items.items():
-            result += f"    {name}: {timing_item}\n"
-        return result
+def get_model_numel(model: nn.Module) -> int:
+    """Return the total element count of the model's parameters that have requires_grad set."""
+    return sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+
+def divide(x: float, y: float) -> float:
+    """Divide x by y, returning inf for a zero divisor and nan for a divisor equal to inf."""
+    if y == 0:
+        return float("inf")
+    # Only a divisor equal to float("inf") maps to nan; every other divisor uses plain division.
+    return float("nan") if y == float("inf") else x / y
+
+
+@torch.no_grad()
+def all_reduce_mean(x: float, world_size: int) -> float:
+    """Average x over ranks: all-reduce then divide by world_size; x is returned as-is when world_size is 1."""
+    if world_size == 1:
+        return x
+    reduced = torch.tensor([x], device=torch.cuda.current_device())
+    dist.all_reduce(reduced)
+    return (reduced / world_size).item()
+
+
+class Timer:
+    """Stopwatch that adds the wall-clock time of each start()/end() pair to `duration`."""
+
+    def __init__(self) -> None:
+        self.duration: float = 0.0
+        self.start_time: Optional[float] = None
+
+    def start(self) -> None:
+        self.start_time = time()
+
+    def end(self) -> None:
+        assert self.start_time is not None
+        self.duration, self.start_time = self.duration + (time() - self.start_time), None
+
+    def reset(self) -> None:
+        self.duration = 0.0
+
+
+class PerformanceEvaluator:
+    """
+    Callback that times training steps and reports throughput and estimated TFLOPS per GPU.
+    Args:
+        model_numel: Parameter count of the model, used in the FLOP estimate.
+        enable_grad_checkpoint: Whether gradient checkpointing is on (adds one pass to the FLOP estimate).
+        ignore_steps: Steps whose index is below this value are neither timed nor counted.
+        dp_world_size: Data-parallel size; falls back to the global world size when None.
+    """
+
+    def __init__(
+        self,
+        model_numel: int,
+        enable_grad_checkpoint: bool = False,
+        ignore_steps: int = 0,
+        dp_world_size: Optional[int] = None,
+    ) -> None:
+        self.model_numel = model_numel
+        self.enable_grad_checkpoint = enable_grad_checkpoint
+        self.ignore_steps = ignore_steps
+        self.world_size = dist.get_world_size()
+        self.dp_world_size = dp_world_size if dp_world_size is not None else self.world_size
+        self.disable: bool = False
+        self.timer = Timer()
+        self.num_samples: int = 0
+        self.flop: int = 0
+
+    def on_step_start(self, step: int) -> None:
+        """Start timing `step`, unless it falls within the first `ignore_steps` steps."""
+        self.disable = self.ignore_steps > 0 and step < self.ignore_steps
+        if self.disable:
+            return
+        torch.cuda.synchronize()
+        self.timer.start()
+
+    def on_step_end(self, input_ids: Tensor, **kwargs) -> None:
+        """Stop the timer and add this step's samples and estimated FLOPs; no-op for ignored steps."""
+        if self.disable:
+            return
+        torch.cuda.synchronize()
+        self.timer.end()
+        batch_size, seq_len = input_ids.shape
+        self.num_samples += batch_size
+        # NOTE(review): presumably 2 FLOPs/param/token, x3 for forward+backward, +1 pass when recomputing.
+        self.flop += batch_size * seq_len * self.model_numel * 2 * (3 + int(self.enable_grad_checkpoint))
+
+    def on_fit_end(self) -> None:
+        """Average the timed duration across ranks and print throughput and TFLOPS per GPU on rank 0."""
+        avg_duration = all_reduce_mean(self.timer.duration, self.world_size)
+        avg_throughput = self.num_samples * self.dp_world_size / (avg_duration + 1e-12)
+        mp_world_size = self.world_size // self.dp_world_size
+        avg_tflops_per_gpu = self.flop / 1e12 / (avg_duration + 1e-12) / mp_world_size
+        if dist.get_rank() == 0:
+            print(
+                f"num_samples: {self.num_samples}, dp_world_size: {self.dp_world_size}, flop: {self.flop}, avg_duration: {avg_duration}, "
+                f"avg_throughput: {avg_throughput}")
+            print(f"Throughput: {avg_throughput:.2f} samples/sec, TFLOPS per GPU: {avg_tflops_per_gpu:.2f}")