diff --git a/applications/Chat/.gitignore b/applications/Chat/.gitignore index 2b9b4f345d0f..5fa068105e26 100644 --- a/applications/Chat/.gitignore +++ b/applications/Chat/.gitignore @@ -145,4 +145,4 @@ docs/.build # wandb log example/wandb/ -examples/awesome-chatgpt-prompts/ \ No newline at end of file +examples/awesome-chatgpt-prompts/ diff --git a/applications/Chat/coati/dataset/sft_dataset.py b/applications/Chat/coati/dataset/sft_dataset.py index 3702d00cc609..3038fbe071db 100644 --- a/applications/Chat/coati/dataset/sft_dataset.py +++ b/applications/Chat/coati/dataset/sft_dataset.py @@ -74,15 +74,10 @@ def __getitem__(self, idx): return dict(input_ids=self.input_ids[idx], labels=self.labels[idx]) -def _tokenize_fn(strings: Sequence[str], - tokenizer: transformers.PreTrainedTokenizer, - max_length: int - ) -> Dict[str, torch.Tensor]: +def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer, + max_length: int) -> Dict[str, torch.Tensor]: """Tokenize a list of strings.""" - tokenized_list = tokenizer( - strings, return_tensors="pt", padding="longest", - max_length=max_length, truncation=True - ) + tokenized_list = tokenizer(strings, return_tensors="pt", padding="longest", max_length=max_length, truncation=True) input_ids = labels = tokenized_list["input_ids"] input_ids_lens = labels_lens = \ tokenized_list["input_ids"].ne(tokenizer.pad_token_id).sum(dim=-1) @@ -103,8 +98,7 @@ def preprocess( """Preprocess the data by tokenizing.""" examples = [s + t for s, t in zip(sources, targets)] examples_tokenized, sources_tokenized = [ - _tokenize_fn(strings, tokenizer, max_length) - for strings in (examples, sources) + _tokenize_fn(strings, tokenizer, max_length) for strings in (examples, sources) ] input_ids = examples_tokenized["input_ids"] labels = copy.deepcopy(input_ids) @@ -116,7 +110,11 @@ def preprocess( class SupervisedDataset(Dataset): """Dataset for supervised fine-tuning.""" - def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, max_datasets_size: int = None, max_length: int = 512): + def __init__(self, + data_path: str, + tokenizer: transformers.PreTrainedTokenizer, + max_datasets_size: int = None, + max_length: int = 512): super(SupervisedDataset, self).__init__() logger.info("Loading data...") list_data_dict = jload(data_path) diff --git a/applications/Chat/coati/models/base/actor.py b/applications/Chat/coati/models/base/actor.py index 2034d5cc81d4..6842f81d9b87 100644 --- a/applications/Chat/coati/models/base/actor.py +++ b/applications/Chat/coati/models/base/actor.py @@ -21,16 +21,13 @@ def __init__(self, model: nn.Module, lora_rank: int = 0, lora_train_bias: str = self.model = model self.convert_to_lora() - def forward(self, - input_ids: torch.LongTensor, - attention_mask: Optional[torch.Tensor] = None, - **model_kwargs, # HACK: `generate` method may pass more kwargs - ) -> torch.Tensor: + def forward( + self, + input_ids: torch.LongTensor, + attention_mask: Optional[torch.Tensor] = None, + **model_kwargs, # HACK: `generate` method may pass more kwargs + ) -> torch.Tensor: """Returns model output. """ - output = self.model( - input_ids, - attention_mask=attention_mask, - **model_kwargs - ) + output = self.model(input_ids, attention_mask=attention_mask, **model_kwargs) return output diff --git a/applications/Chat/coati/models/generation.py b/applications/Chat/coati/models/generation.py index 0156e2284e52..d96ad78a89ce 100644 --- a/applications/Chat/coati/models/generation.py +++ b/applications/Chat/coati/models/generation.py @@ -5,7 +5,6 @@ import torch.nn as nn import torch.nn.functional as F - try: from transformers.generation_logits_process import ( LogitsProcessorList, @@ -148,12 +147,12 @@ def generate(model: nn.Module, @torch.no_grad() -def generate_with_actor(actor_model: nn.Module, - input_ids: torch.Tensor, - return_action_mask: bool = True, - **kwargs - ) -> Union[Tuple[torch.LongTensor, torch.LongTensor], - Tuple[torch.LongTensor, torch.LongTensor, torch.BoolTensor]]: +def generate_with_actor( + actor_model: nn.Module, + input_ids: torch.Tensor, + return_action_mask: bool = True, + **kwargs +) -> Union[Tuple[torch.LongTensor, torch.LongTensor], Tuple[torch.LongTensor, torch.LongTensor, torch.BoolTensor]]: """Generate token sequence with actor model. Refer to `generate` for more details. """ # generate sequences diff --git a/applications/Chat/coati/models/utils.py b/applications/Chat/coati/models/utils.py index b9f15f894a1f..772bfc32982a 100644 --- a/applications/Chat/coati/models/utils.py +++ b/applications/Chat/coati/models/utils.py @@ -46,10 +46,7 @@ def log_probs_from_logits(logits: torch.Tensor, labels: torch.Tensor) -> torch.T return log_probs_labels.squeeze(-1) -def calc_action_log_probs(output: torch.Tensor, - sequences: torch.LongTensor, - num_actions: int - ) -> torch.Tensor: +def calc_action_log_probs(output: torch.Tensor, sequences: torch.LongTensor, num_actions: int) -> torch.Tensor: """Calculate action log probs. Args: diff --git a/applications/Chat/coati/trainer/base.py b/applications/Chat/coati/trainer/base.py index 13571cdcc23a..b4d168a563d9 100644 --- a/applications/Chat/coati/trainer/base.py +++ b/applications/Chat/coati/trainer/base.py @@ -25,12 +25,13 @@ class SLTrainer(ABC): optim (Optimizer): the optimizer to use for training """ - def __init__(self, - strategy: Strategy, - max_epochs: int, - model: nn.Module, - optimizer: Optimizer, - ) -> None: + def __init__( + self, + strategy: Strategy, + max_epochs: int, + model: nn.Module, + optimizer: Optimizer, + ) -> None: super().__init__() self.strategy = strategy self.max_epochs = max_epochs @@ -50,10 +51,7 @@ def _before_fit(self): def fit(self, *args, **kwargs): self._before_fit(*args, **kwargs) - for epoch in tqdm.trange(self.max_epochs, - desc="Epochs", - disable=not is_rank_0() or self.no_epoch_bar - ): + for epoch in tqdm.trange(self.max_epochs, desc="Epochs", disable=not is_rank_0() or self.no_epoch_bar): self._train(epoch) self._eval(epoch) @@ -75,8 +73,7 @@ def __init__(self, buffer: NaiveReplayBuffer, sample_buffer: bool, dataloader_pin_memory: bool, - callbacks: List[Callback] = [] - ) -> None: + callbacks: List[Callback] = []) -> None: super().__init__() self.strategy = strategy self.buffer = buffer @@ -138,7 +135,7 @@ def _make_experience(self, collect_step: int): @abstractmethod def _learn(self, update_step: int): """ - Implement this method to learn from experience, either + Implement this method to learn from experience, either sample from buffer or transform buffer into dataloader. """ raise NotImplementedError() @@ -154,13 +151,14 @@ def _update_phase(self, update_step: int): self._learn(update_step) self._on_learn_epoch_end(update_step) - def fit(self, - prompt_dataloader: DataLoader, - pretrain_dataloader: DataLoader, - num_episodes: int, - num_collect_steps: int, - num_update_steps: int, - ): + def fit( + self, + prompt_dataloader: DataLoader, + pretrain_dataloader: DataLoader, + num_episodes: int, + num_collect_steps: int, + num_update_steps: int, + ): """ The main training loop of on-policy rl trainers. @@ -175,23 +173,16 @@ def fit(self, self.pretrain_dataloader = CycledDataLoader(pretrain_dataloader) with self._fit_ctx(): - for episode in tqdm.trange(num_episodes, - desc="Episodes", - disable=not is_rank_0()): + for episode in tqdm.trange(num_episodes, desc="Episodes", disable=not is_rank_0()): with self._episode_ctx(episode): - for collect_step in tqdm.trange(num_collect_steps, - desc="Collect steps", - disable=not is_rank_0()): + for collect_step in tqdm.trange(num_collect_steps, desc="Collect steps", disable=not is_rank_0()): self._collect_phase(collect_step) if not self.sample_buffer: # HACK(cwher): according to the design of boost API, dataloader should also be boosted, # but it is impractical to adapt this pattern in RL training. Thus, I left dataloader unboosted. # I only call strategy.setup_dataloader() to setup dataloader. - self.dataloader = self.strategy.setup_dataloader(self.buffer, - self.dataloader_pin_memory) - for update_step in tqdm.trange(num_update_steps, - desc="Update steps", - disable=not is_rank_0()): + self.dataloader = self.strategy.setup_dataloader(self.buffer, self.dataloader_pin_memory) + for update_step in tqdm.trange(num_update_steps, desc="Update steps", disable=not is_rank_0()): self._update_phase(update_step) # NOTE: this is for on-policy algorithms self.buffer.clear() diff --git a/applications/Chat/coati/trainer/strategies/base.py b/applications/Chat/coati/trainer/strategies/base.py index 80bc3272872e..3d1dfaf784cf 100644 --- a/applications/Chat/coati/trainer/strategies/base.py +++ b/applications/Chat/coati/trainer/strategies/base.py @@ -79,8 +79,7 @@ def prepare(self, *boost_args: _BoostArgSpec) -> Union[List[_BoostArgSpec], _Boo model, optimizer = arg except ValueError: raise RuntimeError(f'Expect (model, optimizer) pair, got a tuple with size "{len(arg)}"') - model, optimizer, *_ = self.booster.boost(model=model, - optimizer=optimizer) + model, optimizer, *_ = self.booster.boost(model=model, optimizer=optimizer) rets.append((model, optimizer)) elif isinstance(arg, Dict): model, optimizer, criterion, dataloader, lr_scheduler = self.booster.boost(**arg) @@ -90,10 +89,7 @@ def prepare(self, *boost_args: _BoostArgSpec) -> Union[List[_BoostArgSpec], _Boo dataloader=dataloader, lr_scheduler=lr_scheduler) # remove None values - boost_result = { - key: value - for key, value in boost_result.items() if value is not None - } + boost_result = {key: value for key, value in boost_result.items() if value is not None} rets.append(boost_result) else: raise RuntimeError(f'Type {type(arg)} is not supported') @@ -112,23 +108,13 @@ def unwrap_model(model: nn.Module) -> nn.Module: """ return model - def save_model(self, - model: nn.Module, - path: str, - only_rank0: bool = True, - **kwargs - ) -> None: + def save_model(self, model: nn.Module, path: str, only_rank0: bool = True, **kwargs) -> None: self.booster.save_model(model, path, shard=not only_rank0, **kwargs) def load_model(self, model: nn.Module, path: str, strict: bool = True) -> None: self.booster.load_model(model, path, strict) - def save_optimizer(self, - optimizer: Optimizer, - path: str, - only_rank0: bool = False, - **kwargs - ) -> None: + def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False, **kwargs) -> None: self.booster.save_optimizer(optimizer, path, shard=not only_rank0, **kwargs) def load_optimizer(self, optimizer: Optimizer, path: str) -> None: diff --git a/applications/Chat/coati/trainer/utils.py b/applications/Chat/coati/trainer/utils.py index c9fc8d0fe19f..4d45061bab09 100644 --- a/applications/Chat/coati/trainer/utils.py +++ b/applications/Chat/coati/trainer/utils.py @@ -14,9 +14,10 @@ class CycledDataLoader: NOTE: next(iter(dataloader)) is not equivalent to for batch in dataloader: break, it causes slightly different behavior. """ - def __init__(self, - dataloader: DataLoader, - ) -> None: + def __init__( + self, + dataloader: DataLoader, + ) -> None: self.dataloader = dataloader self.count = 0 diff --git a/applications/Chat/examples/ray/mmmt_prompt.py b/applications/Chat/examples/ray/mmmt_prompt.py index 60f049bd5b70..76929c9d0144 100644 --- a/applications/Chat/examples/ray/mmmt_prompt.py +++ b/applications/Chat/examples/ray/mmmt_prompt.py @@ -87,8 +87,8 @@ def model_fn(): kl_coef=0.1, debug=args.debug, update_lora_weights=not (args.lora_rank == 0), - # sync_models_from_trainers=True, - # generation kwargs: + # sync_models_from_trainers=True, + # generation kwargs: max_length=512, do_sample=True, temperature=1.0, @@ -161,12 +161,10 @@ def tokenize_fn(texts): parser.add_argument('--prompt_path', type=str, default=None) parser.add_argument('--num_makers', type=int, default=1) parser.add_argument('--num_trainers', type=int, default=1) - parser.add_argument('--trainer_strategy', - choices=[ - 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', - 'colossalai_zero2_cpu' - ], - default='ddp') + parser.add_argument( + '--trainer_strategy', + choices=['ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', 'colossalai_zero2_cpu'], + default='ddp') parser.add_argument('--maker_strategy', choices=['naive'], default='naive') parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py index 5b1b8d3d16b2..fb9802e38542 100644 --- a/applications/Chat/examples/train_reward_model.py +++ b/applications/Chat/examples/train_reward_model.py @@ -150,9 +150,7 @@ def train(args): pin_memory=True) lr_scheduler = CosineAnnealingLR(optim, train_dataloader.__len__() // 100) - strategy_dict = strategy.prepare( - dict(model=model, optimizer=optim, lr_scheduler=lr_scheduler) - ) + strategy_dict = strategy.prepare(dict(model=model, optimizer=optim, lr_scheduler=lr_scheduler)) model = strategy_dict['model'] optim = strategy_dict['optimizer'] lr_scheduler = strategy_dict['lr_scheduler'] @@ -163,9 +161,7 @@ def train(args): loss_fn=loss_fn, max_epochs=args.max_epochs) - trainer.fit(train_dataloader=train_dataloader, - valid_dataloader=valid_dataloader, - eval_dataloader=eval_dataloader) + trainer.fit(train_dataloader=train_dataloader, valid_dataloader=valid_dataloader, eval_dataloader=eval_dataloader) # save model checkpoint after fitting on only rank0 strategy.save_model(model, args.save_path, only_rank0=True) # save optimizer checkpoint on all ranks diff --git a/applications/Chat/inference/requirements.txt b/applications/Chat/inference/requirements.txt index 511fe1a4f1f3..cb6275361736 100644 --- a/applications/Chat/inference/requirements.txt +++ b/applications/Chat/inference/requirements.txt @@ -10,4 +10,4 @@ uvicorn git+https://github.com/huggingface/transformers accelerate bitsandbytes -jieba \ No newline at end of file +jieba diff --git a/applications/Chat/inference/server.py b/applications/Chat/inference/server.py index b4627299397e..e23f0fceb2fa 100644 --- a/applications/Chat/inference/server.py +++ b/applications/Chat/inference/server.py @@ -14,7 +14,7 @@ from slowapi.util import get_remote_address from sse_starlette.sse import EventSourceResponse from transformers import AutoTokenizer, GenerationConfig, LlamaForCausalLM -from utils import ChatPromptProcessor, Dialogue, LockedIterator, sample_streamingly, update_model_kwargs_fn, load_json +from utils import ChatPromptProcessor, Dialogue, LockedIterator, load_json, sample_streamingly, update_model_kwargs_fn CONTEXT = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.' MAX_LEN = 512 @@ -145,7 +145,9 @@ def generate_no_stream(data: GenerationTaskReq, request: Request): help='Group size for GPTQ. This is only useful when quantization mode is 4bit. Default: 128.') parser.add_argument('--http_host', default='0.0.0.0') parser.add_argument('--http_port', type=int, default=7070) - parser.add_argument('--profanity_file', default=None, help='Path to profanity words list. It should be a JSON file containing a list of words.') + parser.add_argument('--profanity_file', + default=None, + help='Path to profanity words list. It should be a JSON file containing a list of words.') args = parser.parse_args() if args.quant == '4bit': diff --git a/colossalai/auto_parallel/offload/amp_optimizer.py b/colossalai/auto_parallel/offload/amp_optimizer.py index a79e5006e7d2..19d85b80dd3d 100644 --- a/colossalai/auto_parallel/offload/amp_optimizer.py +++ b/colossalai/auto_parallel/offload/amp_optimizer.py @@ -1,24 +1,25 @@ -from typing import Dict, Tuple from enum import Enum +from typing import Dict, Tuple + import torch from torch.optim import Optimizer +from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler from colossalai.logging import get_dist_logger from colossalai.nn.optimizer import ColossalaiOptimizer -from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler from colossalai.utils import get_current_device from .base_offload_module import BaseOffloadModule -from .region_manager import RegionManager from .region import Region +from .region_manager import RegionManager class OptimState(Enum): SCALED = 0 UNSCALED = 1 -class AMPOptimizer(ColossalaiOptimizer): +class AMPOptimizer(ColossalaiOptimizer): """ A wrapper for Optimizer. Code reference: https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/optimizer/zero_optimizer.py @@ -174,4 +175,4 @@ def __init__optimizer(self): # Leverage state_dict() and load_state_dict() to # recast preexisting per-param state tensors - self.optim.load_state_dict(self.optim.state_dict()) \ No newline at end of file + self.optim.load_state_dict(self.optim.state_dict()) diff --git a/colossalai/auto_parallel/passes/runtime_preparation_pass.py b/colossalai/auto_parallel/passes/runtime_preparation_pass.py index 9a2314826448..1a6dc7815176 100644 --- a/colossalai/auto_parallel/passes/runtime_preparation_pass.py +++ b/colossalai/auto_parallel/passes/runtime_preparation_pass.py @@ -55,7 +55,7 @@ def size_processing(size: Union[int, torch.Size], def solution_annotation_pass(gm: torch.fx.GraphModule, solution: List[int], - strategies_constructor: StrategiesConstructor): + strategies_constructor: StrategiesConstructor): """ This method is used to stick the solution strategy to the nodes and add the information required in runtime into graph as placeholder nodes. diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/unary_elementwise_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/unary_elementwise_generator.py index b867a30686eb..39799a67c5a0 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/unary_elementwise_generator.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/unary_elementwise_generator.py @@ -1,7 +1,7 @@ import copy from typing import List -from colossalai.auto_parallel.tensor_shard.sharding_strategy import (MemoryCost, ShardingStrategy, TrainCycleItem) +from colossalai.auto_parallel.tensor_shard.sharding_strategy import MemoryCost, ShardingStrategy, TrainCycleItem from .strategy_generator import FollowingStrategyGenerator diff --git a/colossalai/booster/mixed_precision/mixed_precision_base.py b/colossalai/booster/mixed_precision/mixed_precision_base.py index 8caa34e505e1..a86fdfc17eaf 100644 --- a/colossalai/booster/mixed_precision/mixed_precision_base.py +++ b/colossalai/booster/mixed_precision/mixed_precision_base.py @@ -13,10 +13,11 @@ class MixedPrecision(ABC): """ @abstractmethod - def configure(self, - model: nn.Module, - optimizer: Optional[Optimizer] = None, - criterion: Optional[Callable] = None, - ) -> Tuple[nn.Module, OptimizerWrapper, Callable]: + def configure( + self, + model: nn.Module, + optimizer: Optional[Optimizer] = None, + criterion: Optional[Callable] = None, + ) -> Tuple[nn.Module, OptimizerWrapper, Callable]: # TODO: implement this method pass diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py index 94d722080367..3ec0d34092a4 100644 --- a/colossalai/booster/plugin/low_level_zero_plugin.py +++ b/colossalai/booster/plugin/low_level_zero_plugin.py @@ -208,10 +208,7 @@ def configure( if optimizer is not None and \ not isinstance(optimizer, OptimizerWrapper): - optimizer = LowLevelZeroOptimizer(model.unwrap(), - optimizer, - self.zero_optim_config, - self.optim_kwargs, + optimizer = LowLevelZeroOptimizer(model.unwrap(), optimizer, self.zero_optim_config, self.optim_kwargs, self.verbose) return model, optimizer, criterion, dataloader, lr_scheduler diff --git a/colossalai/cli/benchmark/utils.py b/colossalai/cli/benchmark/utils.py index 825b795f21f6..ee7d92d6ea6a 100644 --- a/colossalai/cli/benchmark/utils.py +++ b/colossalai/cli/benchmark/utils.py @@ -1,10 +1,11 @@ import math import time +from typing import Callable, Dict, List, Tuple + import torch +from colossalai.context import Config, ParallelMode from colossalai.utils import MultiTimer -from colossalai.context import ParallelMode, Config -from typing import List, Dict, Tuple, Callable def get_time_stamp() -> int: @@ -25,8 +26,8 @@ def get_memory_states() -> Tuple[float]: Return the memory statistics. Returns: - max_allocated (float): the allocated CUDA memory - max_cached (float): the cached CUDA memory + max_allocated (float): the allocated CUDA memory + max_cached (float): the cached CUDA memory """ max_allocated = torch.cuda.max_memory_allocated() / (1024**3) @@ -101,7 +102,7 @@ def profile_model(model: torch.nn.Module, warmup_steps: int, profile_steps: int, profile_steps (int): the number of steps for profiling data_func (Callable): a function to generate random data timer (colossalai.utils.Multitimer): a timer instance for time recording - + Returns: fwd_time (float): the average forward time taken by forward pass in second bwd_time (float): the average backward time taken by forward pass in second diff --git a/colossalai/communication/p2p.py b/colossalai/communication/p2p.py index 1f20fca4f74d..d28d140168fd 100644 --- a/colossalai/communication/p2p.py +++ b/colossalai/communication/p2p.py @@ -1,16 +1,18 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +import operator +from functools import reduce from typing import List, Tuple, Union + import torch import torch.distributed as dist from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.utils import get_current_device -from functools import reduce -import operator -from .utils import split_tensor_into_1d_equal_chunks, gather_split_1d_tensor + +from .utils import gather_split_1d_tensor, split_tensor_into_1d_equal_chunks TensorShape = Union[torch.Size, List[int], Tuple[int]] @@ -260,7 +262,7 @@ def send_forward_recv_backward(output_tensor, next_rank=None, dtype=torch.float, scatter_gather_tensors=False) -> Union[torch.Tensor, List[torch.Tensor]]: - """Batched communication operation. Sends the input tensor to the + """Batched communication operation. Sends the input tensor to the next stage in pipeline, while receives the gradient tensor from the next stage in pipeline as the input gradient tensor of this stage. @@ -319,7 +321,7 @@ def send_forward_recv_forward(output_tensor, next_rank=None, dtype=torch.float, scatter_gather_tensors=False) -> Union[torch.Tensor, List[torch.Tensor]]: - """Batched communication operation. Sends the input tensor to the + """Batched communication operation. Sends the input tensor to the next stage in pipeline, while receives the output tensor from the previous stage in pipeline as the input of this stage. diff --git a/colossalai/communication/utils.py b/colossalai/communication/utils.py index ef9eceea847d..1516df356278 100644 --- a/colossalai/communication/utils.py +++ b/colossalai/communication/utils.py @@ -1,10 +1,11 @@ +from typing import List, Tuple, Union + import torch import torch.distributed as dist from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.utils import get_current_device -from typing import Union, List, Tuple TensorShape = Union[torch.Size, List[int], Tuple[int]] diff --git a/examples/language/llama/test_ci.sh b/examples/language/llama/test_ci.sh new file mode 100755 index 000000000000..e69de29bb2d1