Merged
Changes from all commits
24 commits
d49fd63  add mixtral auto policy & move pipeline forward code to modeling folder (Hz188, May 31, 2024)
d2e07fc  [moe refactor] modify kernel test without Route Class (Hz188, Jun 4, 2024)
7556b8f  [moe refactor] add moe tensor test path environment variable to githu… (Hz188, Jun 4, 2024)
16329d5  fix typos (Hz188, Jun 4, 2024)
b934437  fix moe test bug due to the code rebase (Hz188, Jun 5, 2024)
a792e83  [moe refactor] fix moe zero test, and a little bug in low level zero (Hz188, Jun 6, 2024)
d203ba8  fix typo (Hz188, Jun 6, 2024)
55c7416  add moe tensor path to github workflow (Hz188, Jun 6, 2024)
8915e9d  remove some useless code (Hz188, Jun 6, 2024)
7963fb0  fix typo & unify global variable XX_AXIS logic without using -1 (Hz188, Jun 7, 2024)
32ced74  fix typo & prettify the code (Hz188, Jun 7, 2024)
3100c1b  remove print code & support zero 2 test (Hz188, Jun 7, 2024)
928ee39  remove useless code (Hz188, Jun 7, 2024)
6dc0cfc  rename function (Hz188, Jun 7, 2024)
4417840  fix typo (Hz188, Jun 7, 2024)
eb35655  fix typo (Hz188, Jun 7, 2024)
d1d446b  Further improve the test code (Hz188, Jun 7, 2024)
09a5188  remove print code (Hz188, Jun 7, 2024)
4c6ea42  [moe refactor] change test model from fake moe model to mixtral moe l… (Hz188, Jun 11, 2024)
80b6586  [moe refactor] skip some unit tests which will be refactored later (Hz188, Jun 11, 2024)
7d06220  [moe refactor] fix unit import error (Hz188, Jun 11, 2024)
fb41f42  [moe refactor] fix circular import issues (Hz188, Jun 11, 2024)
e99b69c  [moe refactor] remove debug code (Hz188, Jun 11, 2024)
af9ade6  [moe refactor] update github workflow (Hz188, Jun 12, 2024)
3 changes: 2 additions & 1 deletion .github/workflows/build_on_pr.yml
@@ -90,7 +90,7 @@ jobs:
     runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
-      options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      options: --gpus all --rm -v /dev/shm -v /data/scratch:/data/scratch
     timeout-minutes: 90
     defaults:
       run:
@@ -165,6 +165,7 @@ jobs:
         env:
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
+          MOE_TENSOR_PATH: /data/scratch/moe_tensors

       - name: Collate artifact
         env:
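Each workflow below gains the same MOE_TENSOR_PATH variable, pointing the refactored MoE tensor tests at pre-staged data on the self-hosted runners (the container now mounts all of /data/scratch instead of just llama-tiny). A rough sketch of how a test could consume it, mirroring the existing LLAMA_PATH convention; the test name and checkpoint layout are illustrative assumptions, not code from this PR:

import os

import pytest

# Assumed convention: resolve the fixture directory from the environment
# and skip when it is absent (e.g. on runners without the /data/scratch mount).
MOE_TENSOR_PATH = os.environ.get("MOE_TENSOR_PATH")


@pytest.mark.skipif(MOE_TENSOR_PATH is None, reason="MOE_TENSOR_PATH not set")
def test_moe_tensor_fixtures_present():
    ckpt_dir = os.path.join(MOE_TENSOR_PATH, "mixtral-tiny")  # hypothetical layout
    assert os.path.isdir(ckpt_dir)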
1 change: 1 addition & 0 deletions .github/workflows/build_on_schedule.yml
@@ -69,6 +69,7 @@ jobs:
         env:
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
+          MOE_TENSOR_PATH: /data/scratch/moe_tensors

       - name: Notify Lark
         id: message-preparation
1 change: 1 addition & 0 deletions .github/workflows/compatiblity_test_on_dispatch.yml
@@ -92,3 +92,4 @@ jobs:
           DATA: /data/scratch/cifar-10
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
+          MOE_TENSOR_PATH: /data/scratch/moe_tensors
1 change: 1 addition & 0 deletions .github/workflows/compatiblity_test_on_pr.yml
@@ -87,3 +87,4 @@ jobs:
           DATA: /data/scratch/cifar-10
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
+          MOE_TENSOR_PATH: /data/scratch/moe_tensors
1 change: 1 addition & 0 deletions .github/workflows/compatiblity_test_on_schedule.yml
@@ -85,6 +85,7 @@ jobs:
           DATA: /data/scratch/cifar-10
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
+          MOE_TENSOR_PATH: /data/scratch/moe_tensors

       - name: Notify Lark
         id: message-preparation
2 changes: 0 additions & 2 deletions applications/ColossalMoE/infer.py
@@ -10,7 +10,6 @@
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.moe.checkpoint import MoECheckpointIO
-from colossalai.shardformer.policies.mixtral import MixtralForCausalLMPolicy


 def parse_args():
@@ -70,7 +69,6 @@ def main():
         ep_size=ep_size,
         zero_stage=1,
         precision=args.precision,
-        custom_policy=MixtralForCausalLMPolicy(),
         checkpoint_io=MoECheckpointIO,
         enable_fused_normalization=args.use_layernorm_kernel,
         enable_jit_fused=args.use_kernel,
177 changes: 0 additions & 177 deletions applications/ColossalMoE/tests/test_moe_checkpoint.py

This file was deleted.

2 changes: 0 additions & 2 deletions applications/ColossalMoE/train.py
@@ -15,7 +15,6 @@
 from colossalai.moe.checkpoint import MoECheckpointIO
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.shardformer.policies.mixtral import MixtralForCausalLMPolicy
 from colossalai.utils import get_current_device


@@ -155,7 +154,6 @@ def main():
         pp_size=args.pp_size,
         ep_size=args.ep_size,
         microbatch_size=args.microbatch_size,
-        custom_policy=MixtralForCausalLMPolicy(),
         enable_fused_normalization=args.use_layernorm_kernel,
         enable_jit_fused=args.use_kernel,
         precision=args.precision,
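Both call sites drop custom_policy because the first commit (d49fd63) registers a Mixtral auto policy with shardformer, so the plugin resolves the policy from the model class on its own. A minimal sketch of the simplified construction, using only keyword arguments that appear in these diffs plus an assumed tp_size; all values are illustrative:

from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
from colossalai.moe.checkpoint import MoECheckpointIO

# No custom_policy: shardformer now looks up MixtralForCausalLMPolicy
# automatically when it shards a Mixtral model.
plugin = MoeHybridParallelPlugin(
    tp_size=1,
    pp_size=1,
    ep_size=2,
    zero_stage=1,
    precision="bf16",
    checkpoint_io=MoECheckpointIO,
)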
6 changes: 4 additions & 2 deletions colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -30,7 +30,7 @@
 from colossalai.shardformer.policies.base_policy import Policy
 from colossalai.zero.low_level import LowLevelZeroOptimizer

-PP_AXIS, DP_AXIS, EP_AXIS, TP_AXIS = 0, 1, 2, -1
+PP_AXIS, DP_AXIS, EP_AXIS, TP_AXIS = 0, 1, 2, 3


 class HybridParallelZeroOptimizer(LowLevelZeroOptimizer):
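Per commit 7963fb0, TP_AXIS moves from the sentinel -1 to the explicit index 3, so every constant names a real position in the four-dimensional process-group mesh. A sketch of the indexing this unifies, assuming the mesh is laid out as (pp, dp, ep, tp); the group sizes are placeholders and torch.distributed must be initialized for this to actually run:

from colossalai.cluster import ProcessGroupMesh

PP_AXIS, DP_AXIS, EP_AXIS, TP_AXIS = 0, 1, 2, 3

# With -1, TP_AXIS only worked because tp happened to be the last axis and
# relied on Python's negative indexing; an explicit 3 keeps axis arithmetic
# and comparisons unambiguous.
pp_size, dp_size, ep_size, tp_size = 2, 2, 1, 2  # illustrative sizes
mesh = ProcessGroupMesh(pp_size, dp_size, ep_size, tp_size)
tp_group = mesh.get_group_along_axis(TP_AXIS)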
@@ -352,7 +352,9 @@ def seed_worker(worker_id):

     def get_checkpoint_io(self) -> MoECheckpointIO:
         if self.checkpoint_io is None:
-            self.checkpoint_io = MoECheckpointIO(self.global_dp_group, self.pp_group, self.tp_group, self.zero_stage)
+            self.checkpoint_io = MoECheckpointIO(
+                self.global_dp_group, self.pp_group, self.tp_group, self.ep_group, self.moe_dp_group, self.zero_stage
+            )
         else:
             self.checkpoint_io = self.checkpoint_io(
                 self.global_dp_group,
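MoECheckpointIO now also receives ep_group and moe_dp_group, which it needs to gather and shard expert weights along the right process groups. A hedged sketch of how this is typically reached through the Booster flow; the model, optimizer, and output path are placeholders:

from colossalai.booster import Booster

booster = Booster(plugin=plugin)  # plugin as constructed above
model, optimizer, *_ = booster.boost(model, optimizer)

# save_model goes through the plugin's get_checkpoint_io(), so the
# MoECheckpointIO it returns already carries ep_group and moe_dp_group.
booster.save_model(model, "outputs/mixtral-moe", shard=True)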
2 changes: 1 addition & 1 deletion colossalai/moe/load_balance.py
@@ -7,8 +7,8 @@
 from torch.distributed import ProcessGroup

 from colossalai.cluster import ProcessGroupMesh
-from colossalai.moe.experts import MLPExperts
 from colossalai.moe.manager import MOE_MANAGER
+from colossalai.shardformer.layer.moe.layers import MLPExperts
 from colossalai.zero.low_level import LowLevelZeroOptimizer
2 changes: 1 addition & 1 deletion colossalai/shardformer/layer/moe/experts.py
@@ -9,7 +9,7 @@
 from colossalai.moe.manager import MOE_MANAGER
 from colossalai.moe.utils import get_activation
 from colossalai.shardformer.layer.utils import Randomizer
-from colossalai.tensor.moe_tensor.api import get_ep_rank, get_ep_size, set_moe_tensor_info
+from colossalai.tensor.moe_tensor.api import get_ep_rank, get_ep_size

 if HAS_TRITON:
     from colossalai.kernel.triton.llama_act_combine_kernel import LlamaActCombine
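set_moe_tensor_info drops out of the import because expert parameters now arrive with their parallel metadata already attached; experts.py only reads it back. A sketch of the read side, under the assumption that get_ep_size and get_ep_rank take a MoE-annotated parameter; the helper below is hypothetical:

from colossalai.tensor.moe_tensor.api import get_ep_rank, get_ep_size


def local_expert_indices(param, num_experts: int) -> range:
    # `param` is assumed to be an expert weight whose MoE info was set
    # upstream when the experts were built.
    ep_size = get_ep_size(param)  # ranks sharing the expert dimension
    ep_rank = get_ep_rank(param)  # this rank's position along it
    per_rank = num_experts // ep_size
    return range(ep_rank * per_rank, (ep_rank + 1) * per_rank)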
1 change: 0 additions & 1 deletion colossalai/shardformer/layer/moe/layers.py
@@ -11,7 +11,6 @@
 from colossalai.moe.load_balance import LoadBalancer
 from colossalai.moe.utils import create_ep_hierarchical_group, get_noise_generator
 from colossalai.shardformer.layer.moe import MLPExperts
-from colossalai.shardformer.layer.moe.routers import MoeRouter, get_router_cls
 from colossalai.tensor.moe_tensor.api import get_dp_group, get_ep_group, get_ep_group_ranks, get_ep_size
2 changes: 1 addition & 1 deletion colossalai/shardformer/layer/moe/routers.py
@@ -9,7 +9,7 @@
 from colossalai.moe.manager import MOE_MANAGER
 from colossalai.moe.utils import get_activation
 from colossalai.shardformer.layer.utils import Randomizer
-from colossalai.tensor.moe_tensor.api import get_ep_rank, get_ep_size, set_moe_tensor_info
+from colossalai.tensor.moe_tensor.api import get_ep_rank, get_ep_size

 if HAS_TRITON:
     from colossalai.kernel.triton.llama_act_combine_kernel import LlamaActCombine