Merged
5 changes: 2 additions & 3 deletions applications/ColossalMoE/infer.py
@@ -2,15 +2,15 @@

 import torch
 import torch.distributed as dist
-from colossal_moe.models.mixtral_checkpoint import MixtralMoEHybridParallelCheckpointIO
-from colossal_moe.models.mixtral_policy import MixtralForCausalLMPolicy
+from mixtral_checkpoint import MixtralMoEHybridParallelCheckpointIO
 from transformers import AutoTokenizer
 from transformers.models.mixtral import MixtralConfig, MixtralForCausalLM

 import colossalai
 from colossalai.booster import Booster
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.cluster import DistCoordinator
+from colossalai.shardformer.policies.mixtral import MixtralForCausalLMPolicy


 def parse_args():
@@ -106,6 +106,5 @@ def main():
     print(f"[{coordinator.rank}] {outputs}")


-
 if __name__ == "__main__":
     main()
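The visible change here is pure import relocation: MixtralForCausalLMPolicy now comes from colossalai.shardformer.policies.mixtral instead of the removed colossal_moe package, and the checkpoint IO is a local mixtral_checkpoint module. The plugin construction itself sits outside the hunk, so the following is only a sketch of how these names typically wire into the booster; the argument values, and whether infer.py still passes custom_policy at all, are assumptions based on the kept imports:

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
from colossalai.shardformer.policies.mixtral import MixtralForCausalLMPolicy  # new import path
from mixtral_checkpoint import MixtralMoEHybridParallelCheckpointIO  # local module in this app

colossalai.launch_from_torch({})  # config-dict signature as of this PR's era; must run under torchrun

plugin = MoeHybridParallelPlugin(
    tp_size=1,                                           # illustrative; infer.py reads these from CLI args
    pp_size=1,
    ep_size=2,
    custom_policy=MixtralForCausalLMPolicy(),            # plausible, given the import is kept here
    checkpoint_io=MixtralMoEHybridParallelCheckpointIO,  # the class itself, not an instance
    precision="bf16",
)
booster = Booster(plugin=plugin)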
3 changes: 2 additions & 1 deletion applications/ColossalMoE/infer.sh
@@ -1,5 +1,6 @@
 NUM_GPU=2
-MODEL="mistralai/Mixtral-8x7B-v0.1"
+# MODEL="mistralai/Mixtral-8x7B-v0.1"
+MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1"

 # ep
 torchrun --standalone --nproc_per_node $NUM_GPU infer.py \
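The launch script now defaults to the instruction-tuned checkpoint and keeps the base model behind a comment. On the Python side this is just a different name handed to transformers; a minimal sketch, where the [INST] prompt format is Mixtral-Instruct's convention and the generation arguments are invented for illustration (infer.py itself shards the model across GPUs rather than loading it whole like this):

import torch
from transformers import AutoTokenizer
from transformers.models.mixtral import MixtralForCausalLM

model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # new default from infer.sh

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = MixtralForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)  # very large download

inputs = tokenizer("[INST] What is expert parallelism? [/INST]", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))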
2 changes: 1 addition & 1 deletion applications/ColossalMoE/tests/test_mixtral_layer.py
@@ -3,13 +3,13 @@
 import pytest
 import torch
 import torch.distributed as dist
-from colossal_moe.models.mixtral_layer import EPMixtralSparseMoeBlock
 from torch.testing import assert_close
 from transformers.models.mixtral.configuration_mixtral import MixtralConfig
 from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock

 import colossalai
 from colossalai.moe import MOE_MANAGER
+from colossalai.shardformer.modeling.mixtral import EPMixtralSparseMoeBlock
 from colossalai.testing.utils import spawn

 tokens, n_experts = 7, 4
4 changes: 1 addition & 3 deletions applications/ColossalMoE/tests/test_moe_checkpoint.py
@@ -3,8 +3,7 @@
 import pytest
 import torch
 import torch.distributed as dist
-from colossal_moe.models.mixtral_checkpoint import MixtralMoEHybridParallelCheckpointIO
-from colossal_moe.models.mixtral_policy import MixtralForCausalLMPolicy
+from mixtral_checkpoint import MixtralMoEHybridParallelCheckpointIO
 from torch.optim import Adam
 from transformers.models.mixtral.configuration_mixtral import MixtralConfig
 from transformers.models.mixtral.modeling_mixtral import MixtralForCausalLM
@@ -81,7 +80,6 @@ def check_mixtral_moe_layer():
         tp_size=1,
         pp_size=2,
         ep_size=2,
-        custom_policy=MixtralForCausalLMPolicy(),
         checkpoint_io=MixtralMoEHybridParallelCheckpointIO,
         microbatch_size=1,
         zero_stage=1,
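With custom_policy gone, the test relies on the plugin resolving the Mixtral policy internally and only injects the checkpoint IO class. A sketch of the round trip this test exercises; the tiny MixtralConfig and the checkpoint path are made up for illustration, and the whole thing must run under a distributed launcher (the test uses spawn), not as a plain script:

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
from mixtral_checkpoint import MixtralMoEHybridParallelCheckpointIO
from torch.optim import Adam
from transformers.models.mixtral.configuration_mixtral import MixtralConfig
from transformers.models.mixtral.modeling_mixtral import MixtralForCausalLM

colossalai.launch_from_torch({})  # distributed init, as the test harness does via spawn

config = MixtralConfig(                 # deliberately tiny; real tests use a small config too
    hidden_size=128, intermediate_size=256, num_hidden_layers=2,
    num_attention_heads=4, num_key_value_heads=4,
    num_local_experts=4, num_experts_per_tok=2,
)
model = MixtralForCausalLM(config)
optimizer = Adam(model.parameters(), lr=1e-3)

plugin = MoeHybridParallelPlugin(
    tp_size=1,
    pp_size=2,
    ep_size=2,
    checkpoint_io=MixtralMoEHybridParallelCheckpointIO,  # injected class; no custom_policy
    microbatch_size=1,
    zero_stage=1,
)
booster = Booster(plugin=plugin)
model, optimizer, *_ = booster.boost(model, optimizer)

booster.save_model(model, "./mixtral_ckpt", shard=True)  # sharded save through the custom IO
booster.load_model(model, "./mixtral_ckpt")              # reload before the test's equality checks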
14 changes: 2 additions & 12 deletions applications/ColossalMoE/train.py
@@ -2,13 +2,12 @@

 import torch
 import torch.distributed as dist
-from colossal_moe.models.mixtral_checkpoint import MixtralMoEHybridParallelCheckpointIO
-from colossal_moe.models.mixtral_policy import MixtralForCausalLMPolicy
-from colossal_moe.utils import load_checkpoint, move_to_cuda, save_checkpoint
+from mixtral_checkpoint import MixtralMoEHybridParallelCheckpointIO
 from torch.utils.data import Dataset
 from tqdm import tqdm
 from transformers import AutoTokenizer
 from transformers.models.mixtral import MixtralForCausalLM
+from utils import load_checkpoint, move_to_cuda, save_checkpoint

 import colossalai
 from colossalai.booster import Booster
@@ -155,7 +154,6 @@ def main():
         pp_size=args.pp_size,
         ep_size=args.ep_size,
         microbatch_size=args.microbatch_size,
-        custom_policy=MixtralForCausalLMPolicy(),
         enable_fused_normalization=args.use_layernorm_kernel,
         enable_jit_fused=args.use_kernel,
         precision=args.precision,
@@ -260,14 +258,6 @@ def main():
                 lr_scheduler.step()
                 optimizer.zero_grad()

-                # Apply load balance
-                # if (
-                #     args.load_balance
-                #     and args.load_balance_interval > 0
-                #     and (step + 1) % args.load_balance_interval == 0
-                # ):
-                #     coordinator.print_on_master(f"Apply load balance")
-                #     apply_load_balance(model, optimizer)
                 # save checkpoint
                 if (step + 1) % args.save_interval == 0:
                     coordinator.print_on_master(f"Saving model checkpoint to {args.output_path}")
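Alongside the import moves, this hunk deletes the long-commented load-balance block, which is consistent with apply_load_balance disappearing from colossalai.moe below. The checkpoint cadence is kept as-is; for clarity, the retained condition in isolation (a self-contained sketch — in train.py the branch then calls save_checkpoint from the local utils module, whose argument list is outside the visible hunk):

def should_save(step: int, save_interval: int) -> bool:
    # Cadence kept by train.py: fire on every save_interval-th step (steps are 0-indexed).
    return save_interval > 0 and (step + 1) % save_interval == 0

assert should_save(999, save_interval=1000)       # step 999 is the 1000th step, so it fires
assert not should_save(1000, save_interval=1000)  # the step right after does not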
13 changes: 0 additions & 13 deletions colossalai/moe/__init__.py
@@ -1,20 +1,7 @@
 from .checkpoint import MoECheckpointIO
-from .experts import MLPExperts
-from .layers import SparseMLP, apply_load_balance
 from .manager import MOE_MANAGER
-from .routers import MoeRouter, Top1Router, Top2Router, TopKRouter
-from .utils import NormalNoiseGenerator, UniformNoiseGenerator

 __all__ = [
-    "MLPExperts",
-    "MoeRouter",
-    "Top1Router",
-    "Top2Router",
-    "TopKRouter",
-    "NormalNoiseGenerator",
-    "UniformNoiseGenerator",
-    "SparseMLP",
     "MoECheckpointIO",
     "MOE_MANAGER",
-    "apply_load_balance",
 ]
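After this PR, colossalai.moe exposes only the checkpoint IO and the manager; the router, expert, and layer exports are gone, and the experts module itself is deleted below. Code written against the old surface has to migrate roughly as follows — the "before" line is reconstructed from the deleted exports, and the new paths are the ones the diffs above actually use:

# Before this PR (now broken):
# from colossalai.moe import MLPExperts, SparseMLP, Top2Router, apply_load_balance

# Still available after this PR:
from colossalai.moe import MOE_MANAGER, MoECheckpointIO

# Mixtral-specific MoE pieces now live under shardformer:
from colossalai.shardformer.modeling.mixtral import EPMixtralSparseMoeBlock
from colossalai.shardformer.policies.mixtral import MixtralForCausalLMPolicy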
161 changes: 0 additions & 161 deletions colossalai/moe/experts.py

This file was deleted.
