From 79510f41066b307d65946c7c11143d65bb1f762e Mon Sep 17 00:00:00 2001 From: haze188 Date: Mon, 1 Jul 2024 06:10:10 +0000 Subject: [PATCH 1/8] [misc] fix typos --- colossalai/shardformer/policies/deepseek.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/colossalai/shardformer/policies/deepseek.py b/colossalai/shardformer/policies/deepseek.py index f8f39e66c121..1d64c643ebdb 100644 --- a/colossalai/shardformer/policies/deepseek.py +++ b/colossalai/shardformer/policies/deepseek.py @@ -39,11 +39,11 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]: if self.shard_config.enable_sequence_parallelism: self.shard_config.enable_sequence_parallelism = False raise NotImplementedError( - "Mixtral dosen't support sequence parallelism now, will ignore the sequence parallelism flag." + "Deepseek dosen't support sequence parallelism now, will ignore the sequence parallelism flag." ) if self.shard_config.enable_tensor_parallelism: - raise NotImplementedError("Tensor parallelism is not supported for Mixtral model now.") + raise NotImplementedError("Tensor parallelism is not supported for Deepseek model now.") if getattr(self.shard_config, "ep_group", None) is None: raise ValueError("You must pass in ep_group via shard_config for expert parallel!") @@ -117,7 +117,7 @@ def get_held_layers(self) -> List[Module]: """Get pipeline layers for current stage.""" assert self.pipeline_stage_manager is not None - if self.model.__class__.__name__ == "MixtralModel": + if self.model.__class__.__name__ == "DeepseekModel": module = self.model else: module = self.model.model @@ -145,7 +145,7 @@ def module_policy(self): # set None as default self.set_pipeline_forward( model_cls=DeepseekModel, - new_forward=DeepseekPipelineForwards.mixtral_model_forward, + new_forward=DeepseekPipelineForwards.deepseek_model_forward, policy=policy, ) return policy From e9bf95ef286da4f68a10a3fcac5de024c7049531 Mon Sep 17 00:00:00 2001 From: haze188 Date: Thu, 4 Jul 2024 08:03:42 +0000 Subject: [PATCH 2/8] [Feature] deepseek support via auto model, remove modeling file --- colossalai/cluster/process_group_mesh.py | 2 +- colossalai/shardformer/modeling/deepseek.py | 51 +++++++++++++-------- colossalai/shardformer/policies/deepseek.py | 17 +++---- tests/test_moe/test_deepseek_layer.py | 21 +++++---- tests/test_moe/test_moe_checkpoint.py | 15 +----- 5 files changed, 53 insertions(+), 53 deletions(-) diff --git a/colossalai/cluster/process_group_mesh.py b/colossalai/cluster/process_group_mesh.py index 1319a4529093..b6aff0d72fe6 100644 --- a/colossalai/cluster/process_group_mesh.py +++ b/colossalai/cluster/process_group_mesh.py @@ -147,7 +147,7 @@ def get_group(self, ranks_in_group: List[int], backend: Optional[str] = None) -> ProcessGroup: The process group with the given ranks. """ ranks_in_group = sorted(ranks_in_group) - if tuple(ranks_in_group) not in self._group_to_ranks: + if tuple(ranks_in_group) not in self._ranks_to_group: group = dist.new_group(ranks_in_group, backend=backend) self._ranks_to_group[tuple(ranks_in_group)] = group self._group_to_ranks[group] = tuple(ranks_in_group) diff --git a/colossalai/shardformer/modeling/deepseek.py b/colossalai/shardformer/modeling/deepseek.py index 91391639dd50..6e79ce144cc8 100644 --- a/colossalai/shardformer/modeling/deepseek.py +++ b/colossalai/shardformer/modeling/deepseek.py @@ -2,32 +2,47 @@ import torch import torch.distributed as dist +import torch.nn as nn from torch.distributed import ProcessGroup # from colossalai.tensor.moe_tensor.moe_info import MoeParallelInfo from torch.nn import CrossEntropyLoss from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask +from transformers.modeling_outputs import CausalLMOutputWithPast from transformers.utils import is_flash_attn_2_available, logging from colossalai.lazy import LazyInitContext from colossalai.moe._operation import MoeInGradScaler, MoeOutGradScaler, all_to_all_uneven from colossalai.pipeline.stage_manager import PipelineStageManager -from colossalai.shardformer.modeling.deepseek_moe_16b_base.configuration_deepseek import DeepseekConfig -from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import ( - AddAuxiliaryLoss, - CausalLMOutputWithPast, - DeepseekForCausalLM, - DeepseekMLP, - DeepseekModel, - DeepseekMoE, -) from colossalai.shardformer.shard import ShardConfig from colossalai.shardformer.shard.utils import set_tensors_to_none -class EPDeepseekMoE(DeepseekMoE): - def __init__(self, config: DeepseekConfig): - super().__init__(config) +# copied from modeling_deepseek.py +class AddAuxiliaryLoss(torch.autograd.Function): + """ + The trick function of adding auxiliary (aux) loss, + which includes the gradient of the aux loss during backpropagation. + """ + + @staticmethod + def forward(ctx, x, loss): + assert loss.numel() == 1 + ctx.dtype = loss.dtype + ctx.required_aux_loss = loss.requires_grad + return x + + @staticmethod + def backward(ctx, grad_output): + grad_loss = None + if ctx.required_aux_loss: + grad_loss = torch.ones(1, dtype=ctx.dtype, device=grad_output.device) + return grad_output, grad_loss + + +class EPDeepseekMoE(nn.Module): + def __init__(self): + super(EPDeepseekMoE, self).__init__() def setup_ep(self, ep_group: ProcessGroup): ep_group = ep_group @@ -44,9 +59,9 @@ def setup_ep(self, ep_group: ProcessGroup): p.ep_group = ep_group @staticmethod - def from_native_module(module: Union[DeepseekMoE, DeepseekMLP], *args, **kwargs) -> "EPDeepseekMoE": + def from_native_module(module: Union["DeepseekMoE", "DeepseekMLP"], *args, **kwargs) -> "EPDeepseekMoE": LazyInitContext.materialize(module) - if isinstance(module, DeepseekMLP): + if module.__class__.__name__ == "DeepseekMLP": return module module.__class__ = EPDeepseekMoE assert "ep_group" in kwargs, "You should pass ep_group in SubModuleReplacementDescription via shard_config!!" @@ -120,7 +135,7 @@ class DeepseekPipelineForwards: @staticmethod def deepseek_model_forward( - self: DeepseekModel, + self: "DeepseekModel", input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, @@ -147,9 +162,9 @@ def deepseek_model_forward( Example: ```python - >>> from transformers import AutoTokenizer, DeepseekForCausalLM + >>> from transformers import AutoTokenizer, AutoModelForCausalLM - >>> model = DeepseekForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> model = AutoModelForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) >>> prompt = "Hey, are you conscious? Can you talk to me?" @@ -303,7 +318,7 @@ def custom_forward(*inputs): @staticmethod def deepseek_for_causal_lm_forward( - self: DeepseekForCausalLM, + self: "DeepseekForCausalLM", input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, diff --git a/colossalai/shardformer/policies/deepseek.py b/colossalai/shardformer/policies/deepseek.py index 1d64c643ebdb..07b86cd638c8 100644 --- a/colossalai/shardformer/policies/deepseek.py +++ b/colossalai/shardformer/policies/deepseek.py @@ -7,11 +7,6 @@ from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col from colossalai.shardformer.modeling.deepseek import DeepseekPipelineForwards, EPDeepseekMoE -from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import ( - DeepseekDecoderLayer, - DeepseekForCausalLM, - DeepseekModel, -) from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription __all__ = ["DeepseekPolicy", "DeepseekForCausalLMPolicy"] @@ -57,7 +52,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]: ) ], policy=policy, - target_key=DeepseekDecoderLayer, + target_key="DeepseekDecoderLayer", ) # optimization configuration @@ -74,7 +69,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]: ), ], policy=policy, - target_key=DeepseekDecoderLayer, + target_key="DeepseekDecoderLayer", ) self.append_or_create_submodule_replacement( @@ -83,7 +78,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]: target_module=FusedRMSNorm, ), policy=policy, - target_key=DeepseekModel, + target_key="DeepseekModel", ) if self.shard_config.enable_flash_attention: @@ -144,7 +139,7 @@ def module_policy(self): if self.pipeline_stage_manager: # set None as default self.set_pipeline_forward( - model_cls=DeepseekModel, + model_cls="DeepseekModel", new_forward=DeepseekPipelineForwards.deepseek_model_forward, policy=policy, ) @@ -167,7 +162,7 @@ def module_policy(self): if self.shard_config.enable_tensor_parallelism: # add a new item for casual lm new_item = { - DeepseekForCausalLM: ModulePolicyDescription( + "DeepseekForCausalLM": ModulePolicyDescription( sub_module_replacement=[ SubModuleReplacementDescription( suffix="lm_head", @@ -182,7 +177,7 @@ def module_policy(self): if self.pipeline_stage_manager: # set None as default self.set_pipeline_forward( - model_cls=DeepseekForCausalLM, + model_cls="DeepseekForCausalLM", new_forward=DeepseekPipelineForwards.deepseek_for_causal_lm_forward, policy=policy, ) diff --git a/tests/test_moe/test_deepseek_layer.py b/tests/test_moe/test_deepseek_layer.py index 06dfbfe3b515..328ffb1de5f8 100644 --- a/tests/test_moe/test_deepseek_layer.py +++ b/tests/test_moe/test_deepseek_layer.py @@ -4,12 +4,11 @@ import torch import torch.distributed as dist from torch.testing import assert_close +from transformers import AutoConfig, AutoModel import colossalai from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin from colossalai.shardformer.modeling.deepseek import EPDeepseekMoE -from colossalai.shardformer.modeling.deepseek_moe_16b_base.configuration_deepseek import DeepseekConfig -from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import DeepseekMoE from colossalai.testing.utils import spawn tokens, n_experts = 7, 4 @@ -25,14 +24,18 @@ def check_deepseek_moe_layer(): pp_size=1, ep_size=dist.get_world_size(), ) - config = DeepseekConfig( - hidden_size=hidden_size, - intermediate_size=hidden_size * 2, - n_routed_experts=n_experts, - num_experts_per_tok=top_k, - ) + + config = AutoConfig.from_pretrained("deepseek-ai/deepseek-moe-16b-base", trust_remote_code=True) + config.num_hidden_layers = 1 + config.n_routed_experts = n_experts + config.num_experts_per_tok = top_k + config.hidden_size = hidden_size + config.intermediate_size = hidden_size * 2 + config.first_k_dense_replace = 0 + config.num_attention_heads = 2 torch.manual_seed(0) - orig_model = DeepseekMoE(config).cuda() + # get the moe layer in auto model + orig_model = AutoModel.from_config(config, trust_remote_code=True).layers[0].mlp.cuda() x = torch.rand(1, tokens, hidden_size, requires_grad=True).cuda() orig_output = orig_model(x) model = deepcopy(orig_model) diff --git a/tests/test_moe/test_moe_checkpoint.py b/tests/test_moe/test_moe_checkpoint.py index f3c5726ea0ae..8113b32d0411 100644 --- a/tests/test_moe/test_moe_checkpoint.py +++ b/tests/test_moe/test_moe_checkpoint.py @@ -14,8 +14,6 @@ from colossalai.booster import Booster from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin from colossalai.checkpoint_io import MoECheckpointIO -from colossalai.shardformer.modeling.deepseek_moe_16b_base.configuration_deepseek import DeepseekConfig -from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import DeepseekForCausalLM from colossalai.tensor.moe_tensor.api import is_moe_tensor from colossalai.testing import parameterize, spawn from colossalai.testing.utils import spawn @@ -91,21 +89,10 @@ def check_optimizer_snapshot_equal(snapshot1, snapshot2, param2name, moe_dp_grou num_experts_per_tok=top_k, num_attention_heads=2, num_key_value_heads=2, + num_hidden_layers=4, ), MixtralForCausalLM, ], - [ - DeepseekConfig( - hidden_size=hidden_size, - intermediate_size=hidden_size * 2, - n_routed_experts=n_experts, - num_experts_per_tok=top_k, - num_attention_heads=2, - num_key_value_heads=2, - first_k_dense_replace=4, - ), - DeepseekForCausalLM, - ], ], ) def check_moe_checkpoint(test_config): From 4030aa6ea039948b9b58307b45861acf150859d8 Mon Sep 17 00:00:00 2001 From: haze188 Date: Thu, 4 Jul 2024 08:17:45 +0000 Subject: [PATCH 3/8] [misc] delete useless file --- examples/language/llama/scripts/benchmark_7B/hosts.txt | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 examples/language/llama/scripts/benchmark_7B/hosts.txt diff --git a/examples/language/llama/scripts/benchmark_7B/hosts.txt b/examples/language/llama/scripts/benchmark_7B/hosts.txt deleted file mode 100644 index c9c165ebb978..000000000000 --- a/examples/language/llama/scripts/benchmark_7B/hosts.txt +++ /dev/null @@ -1,2 +0,0 @@ -10.20.1.170 -10.20.1.83 From c6abfbc64637fd0cf2edcc79039220921d84c717 Mon Sep 17 00:00:00 2001 From: haze188 Date: Thu, 4 Jul 2024 08:19:21 +0000 Subject: [PATCH 4/8] [misc] fix typos --- tests/test_moe/test_moe_checkpoint.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_moe/test_moe_checkpoint.py b/tests/test_moe/test_moe_checkpoint.py index 8113b32d0411..164301695865 100644 --- a/tests/test_moe/test_moe_checkpoint.py +++ b/tests/test_moe/test_moe_checkpoint.py @@ -89,7 +89,6 @@ def check_optimizer_snapshot_equal(snapshot1, snapshot2, param2name, moe_dp_grou num_experts_per_tok=top_k, num_attention_heads=2, num_key_value_heads=2, - num_hidden_layers=4, ), MixtralForCausalLM, ], From f0555716a428d0f01a3d6abd9d8e6f4e98c73ad1 Mon Sep 17 00:00:00 2001 From: haze188 Date: Thu, 4 Jul 2024 08:29:26 +0000 Subject: [PATCH 5/8] [misc] remove redundant code --- colossalai/shardformer/policies/auto_policy.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/colossalai/shardformer/policies/auto_policy.py b/colossalai/shardformer/policies/auto_policy.py index 24cd0a800f4e..ae9f3603c96e 100644 --- a/colossalai/shardformer/policies/auto_policy.py +++ b/colossalai/shardformer/policies/auto_policy.py @@ -167,12 +167,6 @@ class PolicyLocation: "transformers_modules.modeling_deepseek.DeepseekForCausalLM": PolicyLocation( file_name="deepseek", class_name="DeepseekForCausalLMPolicy" ), - "colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek.DeepseekModel": PolicyLocation( - file_name="deepseek", class_name="DeepseekModelPolicy" - ), - "colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek.DeepseekForCausalLM": PolicyLocation( - file_name="deepseek", class_name="DeepseekForCausalLMPolicy" - ), # Falcon "transformers.models.falcon.modeling_falcon.FalconModel": PolicyLocation( file_name="falcon", class_name="FalconModelPolicy" From aac2fb207e8dcd71b6ac43a21e5c8b20e07d6051 Mon Sep 17 00:00:00 2001 From: haze188 Date: Fri, 5 Jul 2024 05:54:42 +0000 Subject: [PATCH 6/8] [misc] mv module replacement into if branch --- colossalai/shardformer/policies/deepseek.py | 29 ++++++++++----------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/colossalai/shardformer/policies/deepseek.py b/colossalai/shardformer/policies/deepseek.py index 07b86cd638c8..c3b518929ea5 100644 --- a/colossalai/shardformer/policies/deepseek.py +++ b/colossalai/shardformer/policies/deepseek.py @@ -39,21 +39,20 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]: if self.shard_config.enable_tensor_parallelism: raise NotImplementedError("Tensor parallelism is not supported for Deepseek model now.") - if getattr(self.shard_config, "ep_group", None) is None: - raise ValueError("You must pass in ep_group via shard_config for expert parallel!") - - # expert parallel - self.append_or_create_submodule_replacement( - description=[ - SubModuleReplacementDescription( - suffix="mlp", - target_module=EPDeepseekMoE, - kwargs={"ep_group": self.shard_config.ep_group}, - ) - ], - policy=policy, - target_key="DeepseekDecoderLayer", - ) + + if getattr(self.shard_config, "ep_group", None) is not None: + # expert parallel + self.append_or_create_submodule_replacement( + description=[ + SubModuleReplacementDescription( + suffix="mlp", + target_module=EPDeepseekMoE, + kwargs={"ep_group": self.shard_config.ep_group}, + ) + ], + policy=policy, + target_key="DeepseekDecoderLayer", + ) # optimization configuration if self.shard_config.enable_fused_normalization: From 0f5d9d47b527849484aa3cffa4386d0fde7b391f Mon Sep 17 00:00:00 2001 From: haze188 Date: Fri, 5 Jul 2024 06:04:49 +0000 Subject: [PATCH 7/8] [misc] add some warning message and modify some code in unit test --- colossalai/shardformer/policies/deepseek.py | 6 +++++- tests/test_moe/test_deepseek_layer.py | 19 +++++++++++-------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/colossalai/shardformer/policies/deepseek.py b/colossalai/shardformer/policies/deepseek.py index c3b518929ea5..a27c236902de 100644 --- a/colossalai/shardformer/policies/deepseek.py +++ b/colossalai/shardformer/policies/deepseek.py @@ -1,3 +1,4 @@ +import warnings from functools import partial from typing import Callable, Dict, List, Union @@ -81,7 +82,10 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]: ) if self.shard_config.enable_flash_attention: - raise NotImplementedError("Flash attention has already been replaced in deepseek.") + warnings.warn( + "Flash attention has already been replaced in deepseek, and now set enable_flash_attention = True." + ) + self.shard_config.enable_flash_attention = False return policy diff --git a/tests/test_moe/test_deepseek_layer.py b/tests/test_moe/test_deepseek_layer.py index 328ffb1de5f8..85cc986959fd 100644 --- a/tests/test_moe/test_deepseek_layer.py +++ b/tests/test_moe/test_deepseek_layer.py @@ -25,14 +25,17 @@ def check_deepseek_moe_layer(): ep_size=dist.get_world_size(), ) - config = AutoConfig.from_pretrained("deepseek-ai/deepseek-moe-16b-base", trust_remote_code=True) - config.num_hidden_layers = 1 - config.n_routed_experts = n_experts - config.num_experts_per_tok = top_k - config.hidden_size = hidden_size - config.intermediate_size = hidden_size * 2 - config.first_k_dense_replace = 0 - config.num_attention_heads = 2 + config = AutoConfig.from_pretrained( + "deepseek-ai/deepseek-moe-16b-base", + num_hidden_layers=1, + n_routed_experts=n_experts, + num_experts_per_tok=top_k, + hidden_size=hidden_size, + intermediate_size=hidden_size * 2, + first_k_dense_replace=0, + num_attention_heads=2, + trust_remote_code=True, + ) torch.manual_seed(0) # get the moe layer in auto model orig_model = AutoModel.from_config(config, trust_remote_code=True).layers[0].mlp.cuda() From 5115ee20ab62d423c1cfa2ef9172f1e299499b59 Mon Sep 17 00:00:00 2001 From: haze188 Date: Fri, 5 Jul 2024 06:08:20 +0000 Subject: [PATCH 8/8] [misc] fix typos --- colossalai/shardformer/policies/deepseek.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/colossalai/shardformer/policies/deepseek.py b/colossalai/shardformer/policies/deepseek.py index a27c236902de..8ebda357b380 100644 --- a/colossalai/shardformer/policies/deepseek.py +++ b/colossalai/shardformer/policies/deepseek.py @@ -83,7 +83,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]: if self.shard_config.enable_flash_attention: warnings.warn( - "Flash attention has already been replaced in deepseek, and now set enable_flash_attention = True." + "Flash attention has already been replaced in deepseek, and now set enable_flash_attention = False." ) self.shard_config.enable_flash_attention = False