From 79510f41066b307d65946c7c11143d65bb1f762e Mon Sep 17 00:00:00 2001
From: haze188 <haze188@qq.com>
Date: Mon, 1 Jul 2024 06:10:10 +0000
Subject: [PATCH 1/8] [misc] fix leftover Mixtral names in deepseek policy

---
 colossalai/shardformer/policies/deepseek.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/colossalai/shardformer/policies/deepseek.py b/colossalai/shardformer/policies/deepseek.py
index f8f39e66c121..1d64c643ebdb 100644
--- a/colossalai/shardformer/policies/deepseek.py
+++ b/colossalai/shardformer/policies/deepseek.py
@@ -39,11 +39,11 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
         if self.shard_config.enable_sequence_parallelism:
             self.shard_config.enable_sequence_parallelism = False
             raise NotImplementedError(
-                "Mixtral dosen't support sequence parallelism now, will ignore the sequence parallelism flag."
+                "Deepseek dosen't support sequence parallelism now, will ignore the sequence parallelism flag."
             )
 
         if self.shard_config.enable_tensor_parallelism:
-            raise NotImplementedError("Tensor parallelism is not supported for Mixtral model now.")
+            raise NotImplementedError("Tensor parallelism is not supported for Deepseek model now.")
         if getattr(self.shard_config, "ep_group", None) is None:
             raise ValueError("You must pass in ep_group via shard_config for expert parallel!")
 
@@ -117,7 +117,7 @@ def get_held_layers(self) -> List[Module]:
         """Get pipeline layers for current stage."""
         assert self.pipeline_stage_manager is not None
 
-        if self.model.__class__.__name__ == "MixtralModel":
+        if self.model.__class__.__name__ == "DeepseekModel":
             module = self.model
         else:
             module = self.model.model
@@ -145,7 +145,7 @@ def module_policy(self):
             # set None as default
             self.set_pipeline_forward(
                 model_cls=DeepseekModel,
-                new_forward=DeepseekPipelineForwards.mixtral_model_forward,
+                new_forward=DeepseekPipelineForwards.deepseek_model_forward,
                 policy=policy,
             )
         return policy

From e9bf95ef286da4f68a10a3fcac5de024c7049531 Mon Sep 17 00:00:00 2001
From: haze188 <haze188@qq.com>
Date: Thu, 4 Jul 2024 08:03:42 +0000
Subject: [PATCH 2/8] [Feature] deepseek support via auto model, remove
 modeling file

---
 colossalai/cluster/process_group_mesh.py    |  2 +-
 colossalai/shardformer/modeling/deepseek.py | 51 +++++++++++++--------
 colossalai/shardformer/policies/deepseek.py | 17 +++----
 tests/test_moe/test_deepseek_layer.py       | 21 +++++----
 tests/test_moe/test_moe_checkpoint.py       | 15 +-----
 5 files changed, 53 insertions(+), 53 deletions(-)

diff --git a/colossalai/cluster/process_group_mesh.py b/colossalai/cluster/process_group_mesh.py
index 1319a4529093..b6aff0d72fe6 100644
--- a/colossalai/cluster/process_group_mesh.py
+++ b/colossalai/cluster/process_group_mesh.py
@@ -147,7 +147,7 @@ def get_group(self, ranks_in_group: List[int], backend: Optional[str] = None) ->
             ProcessGroup: The process group with the given ranks.
         """
         ranks_in_group = sorted(ranks_in_group)
-        if tuple(ranks_in_group) not in self._group_to_ranks:
+        if tuple(ranks_in_group) not in self._ranks_to_group:
             group = dist.new_group(ranks_in_group, backend=backend)
             self._ranks_to_group[tuple(ranks_in_group)] = group
             self._group_to_ranks[group] = tuple(ranks_in_group)
diff --git a/colossalai/shardformer/modeling/deepseek.py b/colossalai/shardformer/modeling/deepseek.py
index 91391639dd50..6e79ce144cc8 100644
--- a/colossalai/shardformer/modeling/deepseek.py
+++ b/colossalai/shardformer/modeling/deepseek.py
@@ -2,32 +2,47 @@
 
 import torch
 import torch.distributed as dist
+import torch.nn as nn
 from torch.distributed import ProcessGroup
 
 # from colossalai.tensor.moe_tensor.moe_info import MoeParallelInfo
 from torch.nn import CrossEntropyLoss
 from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.utils import is_flash_attn_2_available, logging
 
 from colossalai.lazy import LazyInitContext
 from colossalai.moe._operation import MoeInGradScaler, MoeOutGradScaler, all_to_all_uneven
 from colossalai.pipeline.stage_manager import PipelineStageManager
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.configuration_deepseek import DeepseekConfig
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import (
-    AddAuxiliaryLoss,
-    CausalLMOutputWithPast,
-    DeepseekForCausalLM,
-    DeepseekMLP,
-    DeepseekModel,
-    DeepseekMoE,
-)
 from colossalai.shardformer.shard import ShardConfig
 from colossalai.shardformer.shard.utils import set_tensors_to_none
 
 
-class EPDeepseekMoE(DeepseekMoE):
-    def __init__(self, config: DeepseekConfig):
-        super().__init__(config)
+# Copied from modeling_deepseek.py so that this module no longer has to import it.
+class AddAuxiliaryLoss(torch.autograd.Function):
+    """
+    Identity on ``x`` in forward that takes a one-element auxiliary (aux) ``loss`` as a second input;
+    backward passes ``grad_output`` through for ``x`` and returns a gradient of ones for ``loss``.
+    """
+
+    @staticmethod
+    def forward(ctx, x, loss):
+        assert loss.numel() == 1  # the aux loss must hold exactly one element
+        ctx.dtype = loss.dtype  # kept so backward can build a gradient of the same dtype
+        ctx.required_aux_loss = loss.requires_grad  # backward skips the aux gradient when this is False
+        return x  # x is returned as-is; loss is only used by backward
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        grad_loss = None  # stays None when loss did not require grad
+        if ctx.required_aux_loss:
+            grad_loss = torch.ones(1, dtype=ctx.dtype, device=grad_output.device)  # ones, on grad_output's device
+        return grad_output, grad_loss  # one gradient per forward input: (x, loss)
+
+
+class EPDeepseekMoE(nn.Module):
+    def __init__(self):
+        super(EPDeepseekMoE, self).__init__()
 
     def setup_ep(self, ep_group: ProcessGroup):
         ep_group = ep_group
@@ -44,9 +59,9 @@ def setup_ep(self, ep_group: ProcessGroup):
             p.ep_group = ep_group
 
     @staticmethod
-    def from_native_module(module: Union[DeepseekMoE, DeepseekMLP], *args, **kwargs) -> "EPDeepseekMoE":
+    def from_native_module(module: Union["DeepseekMoE", "DeepseekMLP"], *args, **kwargs) -> "EPDeepseekMoE":
         LazyInitContext.materialize(module)
-        if isinstance(module, DeepseekMLP):
+        if module.__class__.__name__ == "DeepseekMLP":
             return module
         module.__class__ = EPDeepseekMoE
         assert "ep_group" in kwargs, "You should pass ep_group in SubModuleReplacementDescription via shard_config!!"
@@ -120,7 +135,7 @@ class DeepseekPipelineForwards:
 
     @staticmethod
     def deepseek_model_forward(
-        self: DeepseekModel,
+        self: "DeepseekModel",
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
@@ -147,9 +162,9 @@ def deepseek_model_forward(
         Example:
 
         ```python
-        >>> from transformers import AutoTokenizer, DeepseekForCausalLM
+        >>> from transformers import AutoTokenizer, AutoModelForCausalLM
 
-        >>> model = DeepseekForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> model = AutoModelForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
 
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
@@ -303,7 +318,7 @@ def custom_forward(*inputs):
 
     @staticmethod
     def deepseek_for_causal_lm_forward(
-        self: DeepseekForCausalLM,
+        self: "DeepseekForCausalLM",
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
diff --git a/colossalai/shardformer/policies/deepseek.py b/colossalai/shardformer/policies/deepseek.py
index 1d64c643ebdb..07b86cd638c8 100644
--- a/colossalai/shardformer/policies/deepseek.py
+++ b/colossalai/shardformer/policies/deepseek.py
@@ -7,11 +7,6 @@
 
 from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col
 from colossalai.shardformer.modeling.deepseek import DeepseekPipelineForwards, EPDeepseekMoE
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import (
-    DeepseekDecoderLayer,
-    DeepseekForCausalLM,
-    DeepseekModel,
-)
 from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
 
 __all__ = ["DeepseekPolicy", "DeepseekForCausalLMPolicy"]
@@ -57,7 +52,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 )
             ],
             policy=policy,
-            target_key=DeepseekDecoderLayer,
+            target_key="DeepseekDecoderLayer",
         )
 
         # optimization configuration
@@ -74,7 +69,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     ),
                 ],
                 policy=policy,
-                target_key=DeepseekDecoderLayer,
+                target_key="DeepseekDecoderLayer",
             )
 
             self.append_or_create_submodule_replacement(
@@ -83,7 +78,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     target_module=FusedRMSNorm,
                 ),
                 policy=policy,
-                target_key=DeepseekModel,
+                target_key="DeepseekModel",
             )
 
         if self.shard_config.enable_flash_attention:
@@ -144,7 +139,7 @@ def module_policy(self):
         if self.pipeline_stage_manager:
             # set None as default
             self.set_pipeline_forward(
-                model_cls=DeepseekModel,
+                model_cls="DeepseekModel",
                 new_forward=DeepseekPipelineForwards.deepseek_model_forward,
                 policy=policy,
             )
@@ -167,7 +162,7 @@ def module_policy(self):
         if self.shard_config.enable_tensor_parallelism:
             # add a new item for casual lm
             new_item = {
-                DeepseekForCausalLM: ModulePolicyDescription(
+                "DeepseekForCausalLM": ModulePolicyDescription(
                     sub_module_replacement=[
                         SubModuleReplacementDescription(
                             suffix="lm_head",
@@ -182,7 +177,7 @@ def module_policy(self):
         if self.pipeline_stage_manager:
             # set None as default
             self.set_pipeline_forward(
-                model_cls=DeepseekForCausalLM,
+                model_cls="DeepseekForCausalLM",
                 new_forward=DeepseekPipelineForwards.deepseek_for_causal_lm_forward,
                 policy=policy,
             )
diff --git a/tests/test_moe/test_deepseek_layer.py b/tests/test_moe/test_deepseek_layer.py
index 06dfbfe3b515..328ffb1de5f8 100644
--- a/tests/test_moe/test_deepseek_layer.py
+++ b/tests/test_moe/test_deepseek_layer.py
@@ -4,12 +4,11 @@
 import torch
 import torch.distributed as dist
 from torch.testing import assert_close
+from transformers import AutoConfig, AutoModel
 
 import colossalai
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.shardformer.modeling.deepseek import EPDeepseekMoE
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.configuration_deepseek import DeepseekConfig
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import DeepseekMoE
 from colossalai.testing.utils import spawn
 
 tokens, n_experts = 7, 4
@@ -25,14 +24,18 @@ def check_deepseek_moe_layer():
         pp_size=1,
         ep_size=dist.get_world_size(),
     )
-    config = DeepseekConfig(
-        hidden_size=hidden_size,
-        intermediate_size=hidden_size * 2,
-        n_routed_experts=n_experts,
-        num_experts_per_tok=top_k,
-    )
+
+    config = AutoConfig.from_pretrained("deepseek-ai/deepseek-moe-16b-base", trust_remote_code=True)
+    config.num_hidden_layers = 1
+    config.n_routed_experts = n_experts
+    config.num_experts_per_tok = top_k
+    config.hidden_size = hidden_size
+    config.intermediate_size = hidden_size * 2
+    config.first_k_dense_replace = 0
+    config.num_attention_heads = 2
     torch.manual_seed(0)
-    orig_model = DeepseekMoE(config).cuda()
+    # get the moe layer in auto model
+    orig_model = AutoModel.from_config(config, trust_remote_code=True).layers[0].mlp.cuda()
     x = torch.rand(1, tokens, hidden_size, requires_grad=True).cuda()
     orig_output = orig_model(x)
     model = deepcopy(orig_model)
diff --git a/tests/test_moe/test_moe_checkpoint.py b/tests/test_moe/test_moe_checkpoint.py
index f3c5726ea0ae..8113b32d0411 100644
--- a/tests/test_moe/test_moe_checkpoint.py
+++ b/tests/test_moe/test_moe_checkpoint.py
@@ -14,8 +14,6 @@
 from colossalai.booster import Booster
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.checkpoint_io import MoECheckpointIO
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.configuration_deepseek import DeepseekConfig
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import DeepseekForCausalLM
 from colossalai.tensor.moe_tensor.api import is_moe_tensor
 from colossalai.testing import parameterize, spawn
 from colossalai.testing.utils import spawn
@@ -91,21 +89,10 @@ def check_optimizer_snapshot_equal(snapshot1, snapshot2, param2name, moe_dp_grou
                 num_experts_per_tok=top_k,
                 num_attention_heads=2,
                 num_key_value_heads=2,
+                num_hidden_layers=4,
             ),
             MixtralForCausalLM,
         ],
-        [
-            DeepseekConfig(
-                hidden_size=hidden_size,
-                intermediate_size=hidden_size * 2,
-                n_routed_experts=n_experts,
-                num_experts_per_tok=top_k,
-                num_attention_heads=2,
-                num_key_value_heads=2,
-                first_k_dense_replace=4,
-            ),
-            DeepseekForCausalLM,
-        ],
     ],
 )
 def check_moe_checkpoint(test_config):

From 4030aa6ea039948b9b58307b45861acf150859d8 Mon Sep 17 00:00:00 2001
From: haze188 <haze188@qq.com>
Date: Thu, 4 Jul 2024 08:17:45 +0000
Subject: [PATCH 3/8] [misc] delete useless file

---
 examples/language/llama/scripts/benchmark_7B/hosts.txt | 2 --
 1 file changed, 2 deletions(-)
 delete mode 100644 examples/language/llama/scripts/benchmark_7B/hosts.txt

diff --git a/examples/language/llama/scripts/benchmark_7B/hosts.txt b/examples/language/llama/scripts/benchmark_7B/hosts.txt
deleted file mode 100644
index c9c165ebb978..000000000000
--- a/examples/language/llama/scripts/benchmark_7B/hosts.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-10.20.1.170
-10.20.1.83

From c6abfbc64637fd0cf2edcc79039220921d84c717 Mon Sep 17 00:00:00 2001
From: haze188 <haze188@qq.com>
Date: Thu, 4 Jul 2024 08:19:21 +0000
Subject: [PATCH 4/8] [misc] remove num_hidden_layers from Mixtral test config

---
 tests/test_moe/test_moe_checkpoint.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_moe/test_moe_checkpoint.py b/tests/test_moe/test_moe_checkpoint.py
index 8113b32d0411..164301695865 100644
--- a/tests/test_moe/test_moe_checkpoint.py
+++ b/tests/test_moe/test_moe_checkpoint.py
@@ -89,7 +89,6 @@ def check_optimizer_snapshot_equal(snapshot1, snapshot2, param2name, moe_dp_grou
                 num_experts_per_tok=top_k,
                 num_attention_heads=2,
                 num_key_value_heads=2,
-                num_hidden_layers=4,
             ),
             MixtralForCausalLM,
         ],

From f0555716a428d0f01a3d6abd9d8e6f4e98c73ad1 Mon Sep 17 00:00:00 2001
From: haze188 <haze188@qq.com>
Date: Thu, 4 Jul 2024 08:29:26 +0000
Subject: [PATCH 5/8] [misc] remove redundant code

---
 colossalai/shardformer/policies/auto_policy.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/colossalai/shardformer/policies/auto_policy.py b/colossalai/shardformer/policies/auto_policy.py
index 24cd0a800f4e..ae9f3603c96e 100644
--- a/colossalai/shardformer/policies/auto_policy.py
+++ b/colossalai/shardformer/policies/auto_policy.py
@@ -167,12 +167,6 @@ class PolicyLocation:
     "transformers_modules.modeling_deepseek.DeepseekForCausalLM": PolicyLocation(
         file_name="deepseek", class_name="DeepseekForCausalLMPolicy"
     ),
-    "colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek.DeepseekModel": PolicyLocation(
-        file_name="deepseek", class_name="DeepseekModelPolicy"
-    ),
-    "colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek.DeepseekForCausalLM": PolicyLocation(
-        file_name="deepseek", class_name="DeepseekForCausalLMPolicy"
-    ),
     # Falcon
     "transformers.models.falcon.modeling_falcon.FalconModel": PolicyLocation(
         file_name="falcon", class_name="FalconModelPolicy"

From aac2fb207e8dcd71b6ac43a21e5c8b20e07d6051 Mon Sep 17 00:00:00 2001
From: haze188 <haze188@qq.com>
Date: Fri, 5 Jul 2024 05:54:42 +0000
Subject: [PATCH 6/8] [misc] mv module replacement into if branch

---
 colossalai/shardformer/policies/deepseek.py | 29 ++++++++++-----------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/colossalai/shardformer/policies/deepseek.py b/colossalai/shardformer/policies/deepseek.py
index 07b86cd638c8..c3b518929ea5 100644
--- a/colossalai/shardformer/policies/deepseek.py
+++ b/colossalai/shardformer/policies/deepseek.py
@@ -39,21 +39,20 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
 
         if self.shard_config.enable_tensor_parallelism:
             raise NotImplementedError("Tensor parallelism is not supported for Deepseek model now.")
-        if getattr(self.shard_config, "ep_group", None) is None:
-            raise ValueError("You must pass in ep_group via shard_config for expert parallel!")
-
-        # expert parallel
-        self.append_or_create_submodule_replacement(
-            description=[
-                SubModuleReplacementDescription(
-                    suffix="mlp",
-                    target_module=EPDeepseekMoE,
-                    kwargs={"ep_group": self.shard_config.ep_group},
-                )
-            ],
-            policy=policy,
-            target_key="DeepseekDecoderLayer",
-        )
+
+        if getattr(self.shard_config, "ep_group", None) is not None:
+            # expert parallel
+            self.append_or_create_submodule_replacement(
+                description=[
+                    SubModuleReplacementDescription(
+                        suffix="mlp",
+                        target_module=EPDeepseekMoE,
+                        kwargs={"ep_group": self.shard_config.ep_group},
+                    )
+                ],
+                policy=policy,
+                target_key="DeepseekDecoderLayer",
+            )
 
         # optimization configuration
         if self.shard_config.enable_fused_normalization:

From 0f5d9d47b527849484aa3cffa4386d0fde7b391f Mon Sep 17 00:00:00 2001
From: haze188 <haze188@qq.com>
Date: Fri, 5 Jul 2024 06:04:49 +0000
Subject: [PATCH 7/8] [misc] add some warning message and modify some code in
 unit test

---
 colossalai/shardformer/policies/deepseek.py |  6 +++++-
 tests/test_moe/test_deepseek_layer.py       | 19 +++++++++++--------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/colossalai/shardformer/policies/deepseek.py b/colossalai/shardformer/policies/deepseek.py
index c3b518929ea5..a27c236902de 100644
--- a/colossalai/shardformer/policies/deepseek.py
+++ b/colossalai/shardformer/policies/deepseek.py
@@ -1,3 +1,4 @@
+import warnings
 from functools import partial
 from typing import Callable, Dict, List, Union
 
@@ -81,7 +82,10 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
             )
 
         if self.shard_config.enable_flash_attention:
-            raise NotImplementedError("Flash attention has already been replaced in deepseek.")
+            warnings.warn(
+                "Flash attention has already been replaced in deepseek, and now set enable_flash_attention = True."
+            )
+            self.shard_config.enable_flash_attention = False
 
         return policy
 
diff --git a/tests/test_moe/test_deepseek_layer.py b/tests/test_moe/test_deepseek_layer.py
index 328ffb1de5f8..85cc986959fd 100644
--- a/tests/test_moe/test_deepseek_layer.py
+++ b/tests/test_moe/test_deepseek_layer.py
@@ -25,14 +25,17 @@ def check_deepseek_moe_layer():
         ep_size=dist.get_world_size(),
     )
 
-    config = AutoConfig.from_pretrained("deepseek-ai/deepseek-moe-16b-base", trust_remote_code=True)
-    config.num_hidden_layers = 1
-    config.n_routed_experts = n_experts
-    config.num_experts_per_tok = top_k
-    config.hidden_size = hidden_size
-    config.intermediate_size = hidden_size * 2
-    config.first_k_dense_replace = 0
-    config.num_attention_heads = 2
+    config = AutoConfig.from_pretrained(
+        "deepseek-ai/deepseek-moe-16b-base",
+        num_hidden_layers=1,
+        n_routed_experts=n_experts,
+        num_experts_per_tok=top_k,
+        hidden_size=hidden_size,
+        intermediate_size=hidden_size * 2,
+        first_k_dense_replace=0,
+        num_attention_heads=2,
+        trust_remote_code=True,
+    )
     torch.manual_seed(0)
     # get the moe layer in auto model
     orig_model = AutoModel.from_config(config, trust_remote_code=True).layers[0].mlp.cuda()

From 5115ee20ab62d423c1cfa2ef9172f1e299499b59 Mon Sep 17 00:00:00 2001
From: haze188 <haze188@qq.com>
Date: Fri, 5 Jul 2024 06:08:20 +0000
Subject: [PATCH 8/8] [misc] fix flash attention warning message

---
 colossalai/shardformer/policies/deepseek.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/shardformer/policies/deepseek.py b/colossalai/shardformer/policies/deepseek.py
index a27c236902de..8ebda357b380 100644
--- a/colossalai/shardformer/policies/deepseek.py
+++ b/colossalai/shardformer/policies/deepseek.py
@@ -83,7 +83,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
 
         if self.shard_config.enable_flash_attention:
             warnings.warn(
-                "Flash attention has already been replaced in deepseek, and now set enable_flash_attention = True."
+                "Flash attention has already been replaced in deepseek, and now set enable_flash_attention = False."
             )
             self.shard_config.enable_flash_attention = False