-
Notifications
You must be signed in to change notification settings - Fork 4.5k
[moe] refactor code to better adapt to llm #4469
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
15 commits
Select commit
Hold shift + click to select a range
7d7ddcc
polish code
oahzxl 764f65d
rename
oahzxl 0032286
refactor code
oahzxl e0ceb6b
fix test
oahzxl 057f6c2
refactor code
oahzxl 5a572b0
update flash attention version
oahzxl b0d9c6a
Support TP (#6)
oahzxl 290e5d1
remove fa dependency
oahzxl f38e77d
update dependency
oahzxl dfbc2a6
update softmax
oahzxl e3c4d9a
update checkpointio
oahzxl 31e23b5
update processgroupmesh
oahzxl 9892701
update name
oahzxl 5f9d943
update param
oahzxl 76652cd
add keep vars
oahzxl File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,10 +1,10 @@ | ||
| from .checkpoint import load_moe_model, save_moe_model | ||
| from .experts import Experts, FFNExperts, TPExperts | ||
| from .layers import MoeLayer, MoeModule | ||
| from .checkpoint import MoeCheckpintIO | ||
| from .experts import EPMLPExperts, TPMLPExperts | ||
| from .layers import MoeLayer, MoeModule, SparseMLP | ||
| from .routers import MoeRouter, Top1Router, Top2Router | ||
| from .utils import NormalNoiseGenerator, UniformNoiseGenerator, build_ffn_experts | ||
|
|
||
| __all__ = [ | ||
| 'Experts', 'FFNExperts', 'TPExperts', 'Top1Router', 'Top2Router', 'MoeLayer', 'NormalNoiseGenerator', | ||
| 'UniformNoiseGenerator', 'build_ffn_experts', 'MoeModule', 'MoeRouter', 'save_moe_model', 'load_moe_model' | ||
| 'EPMLPExperts', 'TPMLPExperts', 'Top1Router', 'Top2Router', 'MoeLayer', 'MoeModule', 'NormalNoiseGenerator', | ||
| 'UniformNoiseGenerator', 'build_ffn_experts', 'SparseMLP', 'MoeRouter', 'MoeCheckpintIO' | ||
| ] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,40 +1,61 @@ | ||
| from pathlib import Path | ||
| from typing import Optional | ||
|
|
||
| import torch | ||
| import torch.distributed as dist | ||
| import torch.nn as nn | ||
| from torch.optim import Optimizer | ||
|
|
||
| from colossalai.checkpoint_io import CheckpointIO | ||
| from colossalai.tensor.moe_tensor.api import get_ep_group | ||
|
|
||
|
|
||
| class MoeCheckpintIO(CheckpointIO): | ||
|
|
||
| def __init__(self) -> None: | ||
| super().__init__() | ||
|
|
||
| def load_unsharded_model(self, model: nn.Module, checkpoint: str, strict: bool): | ||
| state_dict = torch.load(checkpoint) | ||
| for name, param in model.named_parameters(): | ||
| if '.experts.' in name: | ||
| ep_rank = dist.get_rank(get_ep_group(param)) | ||
| ep_size = dist.get_world_size(get_ep_group(param)) | ||
| for rank in range(ep_size): | ||
| new_name = name.replace('.experts.', f'.experts.{rank}.') | ||
| if rank == ep_rank: | ||
| state_dict[name] = state_dict.pop(new_name) | ||
| else: | ||
| state_dict.pop(new_name) | ||
|
|
||
| from .experts import MoeExperts | ||
| model.load_state_dict(state_dict, strict=strict) | ||
|
|
||
| def save_unsharded_model(self, model: nn.Module, checkpoint: str, gather_dtensor: bool, use_safetensors: bool): | ||
| state_dict = model.state_dict() | ||
| if dist.get_rank() == 0: | ||
| torch.save(state_dict, checkpoint) | ||
| dist.barrier() | ||
|
|
||
| def save_moe_model(model: nn.Module, save_path: str): | ||
| state_dict = model.state_dict() | ||
| if dist.get_rank() == 0: | ||
| torch.save(state_dict, save_path) | ||
| dist.barrier() | ||
| def load_sharded_model(self, model: nn.Module, index_file_path: str, strict: bool): | ||
| raise NotImplementedError() | ||
|
|
||
| def save_sharded_model(self, model: nn.Module, checkpoint: str, gather_dtensor: bool, prefix: Optional[str], | ||
| size_per_shard: int, use_safetensors: bool): | ||
| raise NotImplementedError() | ||
|
|
||
| def load_moe_model(model: nn.Module, load_path: str): | ||
| state_dict = torch.load(load_path) | ||
| # ======================================================== | ||
| # Abstract methods for optimizer loading/saving implementation | ||
| # ======================================================== | ||
|
|
||
| for prefix, module in model.named_modules(): | ||
| if prefix.endswith('.moe_layer.experts'): | ||
| # this module should be an Experts instance | ||
| assert isinstance(module, MoeExperts) | ||
| def load_sharded_optimizer(self, optimizer: Optimizer, index_file_path: str, prefix: str): | ||
| raise NotImplementedError() | ||
|
|
||
| ep_rank = dist.get_rank(module.dist_info.ep_group) | ||
| num_local = module.num_local_experts | ||
| for i in range(num_local): | ||
| expert_id = ep_rank * num_local + i | ||
| for name, _ in module.experts[i].named_parameters(): | ||
| cur_key = f'{prefix}.experts.{i}.{name}' | ||
| param_key = f'{prefix}.experts.{expert_id}.{name}' | ||
| load_param = state_dict[param_key] | ||
| state_dict[cur_key] = load_param | ||
| def load_unsharded_optimizer(self, optimizer: Optimizer, checkpoint: Path): | ||
| raise NotImplementedError() | ||
|
|
||
| for name, _ in module.experts[0].named_parameters(): | ||
| pop_pre = f'{prefix}.experts.' | ||
| pop_suf = f'.{name}' | ||
| for i in range(num_local, module.num_total_experts): | ||
| pop_key = f'{pop_pre}{i}{pop_suf}' | ||
| state_dict.pop(pop_key) | ||
| def save_sharded_optimizer(self, optimizer: Optimizer, checkpoint: Path, gather_dtensor: bool, prefix: str, | ||
| size_per_shard: int): | ||
| raise NotImplementedError() | ||
|
|
||
| model.load_state_dict(state_dict) | ||
| def save_unsharded_optimizer(self, optimizer: Optimizer, checkpoint: Path, gather_dtensor: bool): | ||
| raise NotImplementedError() | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
This suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.