From 21d35afa635a1fa0bfbe26f3b73ff5cd6c5cdfe2 Mon Sep 17 00:00:00 2001 From: CWHer Date: Sun, 25 Jun 2023 18:09:32 +0800 Subject: [PATCH 01/10] feat: remove on_learn_epoch fn as not used --- applications/Chat/coati/trainer/callbacks/base.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/applications/Chat/coati/trainer/callbacks/base.py b/applications/Chat/coati/trainer/callbacks/base.py index f5616048855b..4351c0079b8d 100644 --- a/applications/Chat/coati/trainer/callbacks/base.py +++ b/applications/Chat/coati/trainer/callbacks/base.py @@ -26,12 +26,6 @@ def on_make_experience_start(self) -> None: def on_make_experience_end(self, experience: Experience) -> None: pass - def on_learn_epoch_start(self, epoch: int) -> None: - pass - - def on_learn_epoch_end(self, epoch: int) -> None: - pass - def on_learn_batch_start(self) -> None: pass From d171634202322e1bb5da2a2338b32718b4d1b1b0 Mon Sep 17 00:00:00 2001 From: CWHer Date: Mon, 26 Jun 2023 11:50:09 +0800 Subject: [PATCH 02/10] revert: add _on_learn_epoch fn --- applications/Chat/coati/trainer/callbacks/base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/applications/Chat/coati/trainer/callbacks/base.py b/applications/Chat/coati/trainer/callbacks/base.py index 4351c0079b8d..f5616048855b 100644 --- a/applications/Chat/coati/trainer/callbacks/base.py +++ b/applications/Chat/coati/trainer/callbacks/base.py @@ -26,6 +26,12 @@ def on_make_experience_start(self) -> None: def on_make_experience_end(self, experience: Experience) -> None: pass + def on_learn_epoch_start(self, epoch: int) -> None: + pass + + def on_learn_epoch_end(self, epoch: int) -> None: + pass + def on_learn_batch_start(self) -> None: pass From eb344862652b082582fafcfd27e6bafdb6de677d Mon Sep 17 00:00:00 2001 From: CWHer Date: Mon, 26 Jun 2023 15:06:32 +0800 Subject: [PATCH 03/10] to: remove the use of NaiveStrategy --- applications/Chat/README.md | 2 +- applications/Chat/coati/trainer/strategies/__init__.py | 3 +-- applications/Chat/examples/README.md | 6 +++--- .../Chat/examples/community/peft/train_peft_prompts.py | 10 ++++------ .../Chat/examples/community/peft/train_peft_sft.py | 10 ++++------ applications/Chat/examples/train_prompts.py | 8 +++----- applications/Chat/examples/train_reward_model.py | 8 +++----- applications/Chat/examples/train_sft.py | 8 +++----- 8 files changed, 22 insertions(+), 33 deletions(-) diff --git a/applications/Chat/README.md b/applications/Chat/README.md index 082cbb22b587..016272ed8c89 100644 --- a/applications/Chat/README.md +++ b/applications/Chat/README.md @@ -287,7 +287,7 @@ If you only have a single 24G GPU, you can use the following script. `batch_size torchrun --standalone --nproc_per_node=1 train_sft.py \ --pretrain "/path/to/LLaMa-7B/" \ --model 'llama' \ - --strategy naive \ + --strategy ddp \ --log_interval 10 \ --save_path /path/to/Coati-7B \ --dataset /path/to/data.json \ diff --git a/applications/Chat/coati/trainer/strategies/__init__.py b/applications/Chat/coati/trainer/strategies/__init__.py index f258c9b8a873..2755648fb0ed 100644 --- a/applications/Chat/coati/trainer/strategies/__init__.py +++ b/applications/Chat/coati/trainer/strategies/__init__.py @@ -1,6 +1,5 @@ from .base import Strategy from .colossalai import ColossalAIStrategy from .ddp import DDPStrategy -from .naive import NaiveStrategy -__all__ = ['Strategy', 'NaiveStrategy', 'DDPStrategy', 'ColossalAIStrategy'] +__all__ = ['Strategy', 'DDPStrategy', 'ColossalAIStrategy'] diff --git a/applications/Chat/examples/README.md b/applications/Chat/examples/README.md index 3e9d9c4325d8..56e4cc992c17 100644 --- a/applications/Chat/examples/README.md +++ b/applications/Chat/examples/README.md @@ -69,7 +69,7 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \ --grad_checkpoint ``` ### Arg List -- --strategy: the strategy using for training, choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2' +- --strategy: the strategy using for training, choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2' - --model: model type, choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom' - --pretrain: pretrain model, type=str, default=None - --max_datasets_size: the max size of dataset, type=int, default=None @@ -118,7 +118,7 @@ Model performance in [Anthropics paper](https://arxiv.org/abs/2204.05862):
We also train the reward model based on LLaMA-7B, which reaches the ACC of 72.06% after 1 epoch, performing almost the same as Anthropic's best RM. ### Arg List -- --strategy: the strategy using for training, choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2' +- --strategy: the strategy using for training, choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2' - --model: model type, choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom' - --pretrain: pretrain model, type=str, default=None - --model_path: the path of rm model(if continue to train), type=str, default=None @@ -160,7 +160,7 @@ Prompt dataset: the instruction dataset mentioned in the above figure which incl Pretrain dataset: the pretrain dataset including the instruction and corresponding response, e.g. you can use the [InstructWild Data](https://github.com/XueFuzhao/InstructionWild/tree/main/data) in stage 1 supervised instructs tuning. ### Arg List -- --strategy: the strategy using for training, choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2' +- --strategy: the strategy using for training, choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2' - --model: model type of actor, choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom' - --pretrain: pretrain model, type=str, default=None - --rm_model: reward model type, type=str, choices=['gpt2', 'bloom', 'opt', 'llama'], default=None diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py index 00ed7aa36257..83b68a777474 100644 --- a/applications/Chat/examples/community/peft/train_peft_prompts.py +++ b/applications/Chat/examples/community/peft/train_peft_prompts.py @@ -9,7 +9,7 @@ from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM from coati.models.opt import OPTRM, OPTActor, OPTCritic from coati.trainer import PPOTrainer -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy +from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy from coati.utils import prepare_llama_tokenizer_and_embedding from easy_dataset import EasyPromptsDataset, EasySupervisedDataset from easy_models import BLOOMActor @@ -24,9 +24,7 @@ def main(args): # configure strategy - if args.strategy == 'naive': - strategy = NaiveStrategy() - elif args.strategy == 'ddp': + if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': strategy = ColossalAIStrategy(stage=3, placement_policy='cpu', initial_scale=2**5) @@ -202,8 +200,8 @@ def tokenize_fn(texts): parser.add_argument('--prompt_path', type=str, default=None, help='path to the prompt dataset') parser.add_argument('--pretrain_dataset', type=str, default=None, help='path to the pretrained dataset') parser.add_argument('--strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive', + choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], + default='ddp', help='strategy to use') parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) parser.add_argument('--pretrain', type=str, default=None) diff --git a/applications/Chat/examples/community/peft/train_peft_sft.py b/applications/Chat/examples/community/peft/train_peft_sft.py index d2b08b72ca95..bdbbc5766802 100644 --- a/applications/Chat/examples/community/peft/train_peft_sft.py +++ b/applications/Chat/examples/community/peft/train_peft_sft.py @@ -11,7 +11,7 @@ from coati.models.llama import LlamaLM from coati.models.opt import OPTLM from coati.trainer import SFTTrainer -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy +from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy from coati.utils import prepare_llama_tokenizer_and_embedding from datasets import load_dataset from easy_dataset import EasyDataset @@ -30,9 +30,7 @@ def train(args): # configure strategy - if args.strategy == 'naive': - strategy = NaiveStrategy() - elif args.strategy == 'ddp': + if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': strategy = ColossalAIStrategy(stage=3, placement_policy='cuda') @@ -170,8 +168,8 @@ def train(args): if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') + choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], + default='ddp') parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom') parser.add_argument('--pretrain', type=str, default=None) parser.add_argument('--dataset', type=str, default=None) diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py index a9bc0e532e5d..b8b32a8176e5 100644 --- a/applications/Chat/examples/train_prompts.py +++ b/applications/Chat/examples/train_prompts.py @@ -9,7 +9,7 @@ from coati.models.opt import OPTRM, OPTActor, OPTCritic from coati.models.roberta import RoBERTaActor, RoBERTaCritic, RoBERTaRM from coati.trainer import PPOTrainer -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy +from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy from coati.utils import prepare_llama_tokenizer_and_embedding from torch.optim import Adam from torch.utils.data import DataLoader @@ -21,9 +21,7 @@ def main(args): # configure strategy - if args.strategy == 'naive': - strategy = NaiveStrategy() - elif args.strategy == 'ddp': + if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5) @@ -208,7 +206,7 @@ def main(args): parser.add_argument('--prompt_dataset', type=str, default=None, help='path to the prompt dataset') parser.add_argument('--pretrain_dataset', type=str, default=None, help='path to the pretrained dataset') parser.add_argument('--strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], + choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2', help='strategy to use') parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama', 'roberta']) diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py index 4a6851ab5b24..d1ab807b0461 100644 --- a/applications/Chat/examples/train_reward_model.py +++ b/applications/Chat/examples/train_reward_model.py @@ -14,7 +14,7 @@ from coati.models.opt import OPTRM from coati.models.roberta import RoBERTaRM from coati.trainer import RewardModelTrainer -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy +from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy from coati.utils import prepare_llama_tokenizer_and_embedding from datasets import load_dataset from torch.optim import Adam @@ -29,9 +29,7 @@ def train(args): # configure strategy - if args.strategy == 'naive': - strategy = NaiveStrategy() - elif args.strategy == 'ddp': + if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': strategy = ColossalAIStrategy(stage=3, placement_policy='cuda') @@ -195,7 +193,7 @@ def train(args): if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], + choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2') parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'deberta', 'llama', 'roberta'], default='bloom') parser.add_argument('--pretrain', type=str, default=None) diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py index 967b7c277c6a..451349b48f96 100644 --- a/applications/Chat/examples/train_sft.py +++ b/applications/Chat/examples/train_sft.py @@ -8,7 +8,7 @@ from coati.dataset import DataCollatorForSupervisedDataset, SFTDataset, SupervisedDataset from coati.models import convert_to_lora_module from coati.trainer import SFTTrainer -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy +from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy from coati.utils import prepare_llama_tokenizer_and_embedding from datasets import load_dataset from torch.optim import Adam @@ -29,9 +29,7 @@ def train(args): # configure strategy - if args.strategy == 'naive': - strategy = NaiveStrategy() - elif args.strategy == 'ddp': + if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': raise NotImplementedError( @@ -190,7 +188,7 @@ def train(args): if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_zero2_cpu'], + choices=['ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_zero2_cpu'], default='colossalai_zero2') parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom') parser.add_argument('--pretrain', type=str, default=None) From b3cd6468326acaae471fb77594a1d48ca6f20210 Mon Sep 17 00:00:00 2001 From: CWHer Date: Mon, 26 Jun 2023 15:07:56 +0800 Subject: [PATCH 04/10] test: remove NaiveStrategy tests --- applications/Chat/examples/test_ci.sh | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/applications/Chat/examples/test_ci.sh b/applications/Chat/examples/test_ci.sh index 4bf5524afb01..29cd2cb1a904 100755 --- a/applications/Chat/examples/test_ci.sh +++ b/applications/Chat/examples/test_ci.sh @@ -49,13 +49,13 @@ wandb init -m offline # - roberta-*: RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)` SKIPPED_TESTS=( "gpt2-ddp" - "llama-naive" "llama-ddp" "llama-colossalai_gemini" "llama-colossalai_zero2" - "roberta-naive" "roberta-ddp" "roberta-colossalai_gemini" "roberta-colossalai_zero2" + "llama-ddp" "llama-colossalai_gemini" "llama-colossalai_zero2" + "roberta-ddp" "roberta-colossalai_gemini" "roberta-colossalai_zero2" ) # These tests are quick and do not have any dependencies for model in 'gpt2' 'bloom' 'opt' 'llama' 'roberta'; do - for strategy in 'naive' 'ddp' 'colossalai_gemini' 'colossalai_zero2'; do + for strategy in 'ddp' 'colossalai_gemini' 'colossalai_zero2'; do if [[ " ${SKIPPED_TESTS[*]} " =~ " ${model}-${strategy} " ]]; then echo "[Test]: Skipped $model-$strategy" continue @@ -91,12 +91,6 @@ torchrun --standalone --nproc_per_node=4 ${BASE}/train_sft.py --pretrain 'gpt2' --model 'gpt2' --strategy ddp --lora_rank 4 \ --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \ --save_path ${BASE}/output - -# torchrun --standalone --nproc_per_node=4 ${BASE}/train_sft.py --pretrain 'facebook/opt-350m' \ -# --model 'opt' --strategy naive \ -# --dataset $SFT_DATASET --max_datasets_size 512 --max_epochs 1 \ -# --save_path ${BASE}/output - rm -rf ${BASE}/output # train rm From 03e536092ecc73b49660215b324bc4254a3716ae Mon Sep 17 00:00:00 2001 From: CWHer Date: Mon, 26 Jun 2023 15:15:50 +0800 Subject: [PATCH 05/10] feat: remove NaiveStrategy --- .../Chat/coati/trainer/strategies/ddp.py | 70 ++++++++++-- .../Chat/coati/trainer/strategies/naive.py | 103 ------------------ 2 files changed, 62 insertions(+), 111 deletions(-) delete mode 100644 applications/Chat/coati/trainer/strategies/naive.py diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py index 42867645290c..2fdfebd2a6a8 100644 --- a/applications/Chat/coati/trainer/strategies/ddp.py +++ b/applications/Chat/coati/trainer/strategies/ddp.py @@ -1,4 +1,6 @@ +import os import random +from collections import OrderedDict from typing import Callable, Optional import numpy as np @@ -6,18 +8,27 @@ import torch.distributed as dist import torch.nn as nn from coati.replay_buffer import ReplayBuffer -from torch.optim import Optimizer from torch.utils.data import DataLoader +from transformers.modeling_utils import PreTrainedModel from transformers.tokenization_utils_base import PreTrainedTokenizerBase from colossalai.booster.plugin import TorchDDPPlugin from colossalai.booster.plugin.torch_ddp_plugin import TorchDDPModel -from .naive import NaiveStrategy +from .base import Strategy from .sampler import DistributedSampler -class DDPStrategy(NaiveStrategy): +# TODO Move this to a util.py (Moving to ray.util introduces ringed import) +def get_grad_required_state_dict(model: nn.Module): + state_dict = OrderedDict() + for name, parameter in model.named_parameters(): + if parameter.requires_grad: + state_dict[name] = parameter.detach() + return state_dict + + +class DDPStrategy(Strategy): """ Strategy for distributed training using torch.distributed. """ @@ -29,12 +40,30 @@ def __init__(self, self.seed = seed super().__init__(plugin_initializer) + def _try_init_dist(self, force: bool = False) -> None: + try: + rank = int(os.environ['RANK']) + local_rank = int(os.environ['LOCAL_RANK']) + world_size = int(os.environ['WORLD_SIZE']) + host = os.environ['MASTER_ADDR'] + port = int(os.environ['MASTER_PORT']) + dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank) + torch.cuda.set_device(local_rank) + except KeyError as e: + if force: + raise RuntimeError( + f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch" + ) + except Exception as e: + if force: + raise e + def _post_init(self) -> None: assert isinstance(self.plugin, TorchDDPPlugin), \ f'{type(self).__name__}\'s plugin is not initialized properly.' def setup_distributed(self) -> None: - self._try_init_dist(force=True) + self._try_init_dist(force=False) self.set_seed(self.seed) def set_seed(self, seed: int) -> None: @@ -42,9 +71,6 @@ def set_seed(self, seed: int) -> None: np.random.seed(seed) torch.manual_seed(seed) - def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: Optimizer, **kwargs) -> None: - self.booster.backward(loss, optimizer) - def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False) -> DataLoader: return self.plugin.prepare_dataloader(replay_buffer, batch_size=replay_buffer.sample_batch_size, @@ -68,4 +94,32 @@ def save_pretrained(self, tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: if only_rank0 and dist.get_rank() != 0: return - super().save_pretrained(model, path, only_rank0, tokenizer) + unwrapped_model = self.unwrap_model(model) + assert isinstance(unwrapped_model, PreTrainedModel) + unwrapped_model.save_pretrained(path) + if tokenizer is not None: + tokenizer.save_pretrained(path) + + def get_model_state_dict_shard(self, model: nn.Module, **config): + # TODO: implement sharding on naive strategy + model = self.unwrap_model(model) + if 'requires_grad_only' in config and config['requires_grad_only'] == True: + state_dict = get_grad_required_state_dict(model) + else: + state_dict = model.state_dict() + + if 'shard_size' in config: + shard_size = config['shard_size'] + accumulate_size = 0 + state_dict_shard = OrderedDict() + for name, param in state_dict.items(): + state_dict_shard[name] = param + accumulate_size += param.numel() * param.element_size() + if accumulate_size >= shard_size: + accumulate_size = 0 + yield state_dict_shard + state_dict_shard = OrderedDict() + if accumulate_size > 0: + yield state_dict_shard + else: + yield state_dict diff --git a/applications/Chat/coati/trainer/strategies/naive.py b/applications/Chat/coati/trainer/strategies/naive.py deleted file mode 100644 index d121237a68ea..000000000000 --- a/applications/Chat/coati/trainer/strategies/naive.py +++ /dev/null @@ -1,103 +0,0 @@ -import os -from collections import OrderedDict -from typing import Optional - -import torch -import torch.distributed as dist -import torch.nn as nn -from coati.replay_buffer import ReplayBuffer -from torch.optim import Optimizer -from torch.utils.data import DataLoader -from transformers.modeling_utils import PreTrainedModel -from transformers.tokenization_utils_base import PreTrainedTokenizerBase - -from .base import Strategy - - -# TODO Move this to a util.py (Moving to ray.util introduces ringed import) -def get_grad_required_state_dict(model: nn.Module): - state_dict = OrderedDict() - for name, parameter in model.named_parameters(): - if parameter.requires_grad: - state_dict[name] = parameter.detach() - return state_dict - - -class NaiveStrategy(Strategy): - """ - Strategy for single GPU. No parallelism is used. - """ - - def _post_init(self) -> None: - assert self.plugin is None, \ - f'{type(self).__name__}\'s plugin is not initialized properly.' - - def setup_distributed(self) -> None: - self._try_init_dist(force=False) - - def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: Optimizer, **kwargs) -> None: - # HACK: self.booster.backward(loss, optimizer) can't work if plugin is None, - # it would run `optimizer.backward(loss)`, which is not compatible with torch.optim.Optimizer - assert self.plugin is None, "DO NOT call this method if plugin is not None" - loss.backward() - - def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False) -> DataLoader: - return DataLoader(replay_buffer, - batch_size=replay_buffer.sample_batch_size, - shuffle=True, - drop_last=True, - pin_memory=pin_memory, - collate_fn=replay_buffer.collate_fn) - - def save_pretrained(self, - model: nn.Module, - path: str, - only_rank0: bool = True, - tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: - unwrapped_model = self.unwrap_model(model) - assert isinstance(unwrapped_model, PreTrainedModel) - unwrapped_model.save_pretrained(path) - if tokenizer is not None: - tokenizer.save_pretrained(path) - - def get_model_state_dict_shard(self, model: nn.Module, **config): - # TODO: implement sharding on naive strategy - model = self.unwrap_model(model) - if 'requires_grad_only' in config and config['requires_grad_only'] == True: - state_dict = get_grad_required_state_dict(model) - else: - state_dict = model.state_dict() - - if 'shard_size' in config: - shard_size = config['shard_size'] - accumulate_size = 0 - state_dict_shard = OrderedDict() - for name, param in state_dict.items(): - state_dict_shard[name] = param - accumulate_size += param.numel() * param.element_size() - if accumulate_size >= shard_size: - accumulate_size = 0 - yield state_dict_shard - state_dict_shard = OrderedDict() - if accumulate_size > 0: - yield state_dict_shard - else: - yield state_dict - - def _try_init_dist(self, force: bool = False) -> None: - try: - rank = int(os.environ['RANK']) - local_rank = int(os.environ['LOCAL_RANK']) - world_size = int(os.environ['WORLD_SIZE']) - host = os.environ['MASTER_ADDR'] - port = int(os.environ['MASTER_PORT']) - dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank) - torch.cuda.set_device(local_rank) - except KeyError as e: - if force: - raise RuntimeError( - f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch" - ) - except Exception as e: - if force: - raise e From 52a0c53465ee041450053185e090799047506102 Mon Sep 17 00:00:00 2001 From: CWHer Date: Tue, 27 Jun 2023 11:46:31 +0800 Subject: [PATCH 06/10] style: modify comments and params --- applications/Chat/examples/test_ci.sh | 2 +- applications/Chat/examples/train_sft.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/applications/Chat/examples/test_ci.sh b/applications/Chat/examples/test_ci.sh index 29cd2cb1a904..dec1f7c036c8 100755 --- a/applications/Chat/examples/test_ci.sh +++ b/applications/Chat/examples/test_ci.sh @@ -138,9 +138,9 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_reward_model.py \ --dataset 'Anthropic/hh-rlhf' --subset 'harmless-base' \ --test True --lora_rank 4 \ --save_path ${BASE}/rm_ckpt.pt - rm -rf ${BASE}/rm_ckpt.pt +# train rl torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py \ --prompt_dataset $PROMPT_PATH --pretrain_dataset $PRETRAIN_DATASET \ --strategy colossalai_zero2 --num_episodes 1 \ diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py index 451349b48f96..cee798bac2b7 100644 --- a/applications/Chat/examples/train_sft.py +++ b/applications/Chat/examples/train_sft.py @@ -64,7 +64,7 @@ def train(args): tokenizer = GPT2Tokenizer.from_pretrained('gpt2') tokenizer.pad_token = tokenizer.eos_token elif args.model == 'bloom': - tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain) + tokenizer = BloomTokenizerFast.from_pretrained('bigscience/bloom-560m') tokenizer.pad_token = tokenizer.eos_token elif args.model == 'opt': tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") From 3cfcb875c04c55bebf3dcf4eeab53d396e0b8c9f Mon Sep 17 00:00:00 2001 From: CWHer Date: Tue, 27 Jun 2023 14:53:51 +0800 Subject: [PATCH 07/10] feat: split ColossalAIStrategy into LowLevelZeroStrategy and GeminiStrategy --- .../Chat/coati/trainer/strategies/__init__.py | 7 +- .../coati/trainer/strategies/colossalai.py | 259 ++++++++++-------- 2 files changed, 157 insertions(+), 109 deletions(-) diff --git a/applications/Chat/coati/trainer/strategies/__init__.py b/applications/Chat/coati/trainer/strategies/__init__.py index 2755648fb0ed..b49a2c742db3 100644 --- a/applications/Chat/coati/trainer/strategies/__init__.py +++ b/applications/Chat/coati/trainer/strategies/__init__.py @@ -1,5 +1,8 @@ from .base import Strategy -from .colossalai import ColossalAIStrategy +from .colossalai import GeminiStrategy, LowLevelZeroStrategy from .ddp import DDPStrategy -__all__ = ['Strategy', 'DDPStrategy', 'ColossalAIStrategy'] +__all__ = [ + 'Strategy', 'DDPStrategy', + 'LowLevelZeroStrategy', 'GeminiStrategy' +] diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py index e5a69f3351cb..1b59d704eec3 100644 --- a/applications/Chat/coati/trainer/strategies/colossalai.py +++ b/applications/Chat/coati/trainer/strategies/colossalai.py @@ -18,13 +18,96 @@ from .ddp import DDPStrategy -class ColossalAIStrategy(DDPStrategy): +class LowLevelZeroStrategy(DDPStrategy): + """ + The strategy for training with ColossalAI. + + Args: + stage(int): The stage to use in ZeRO. Choose in (1, 2) + precision(str): The precision to use. Choose in ('fp32', 'fp16'). + seed(int): The seed for the random number generator. + placement_policy(str): The placement policy for gemini. Choose in ('cpu', 'cuda') + If it is “cpu”, parameters, gradients and optimizer states will be offloaded to CPU, + If it is “cuda”, they will not be offloaded, which means max CUDA memory will be used. It is the fastest. + reduce_bucket_size(int): The reduce bucket size in bytes. Only for ZeRO-1 and ZeRO-2. + overlap_communication(bool): Whether to overlap communication and computation. Only for ZeRO-1 and ZeRO-2. + initial_scale(float): The initial scale for the optimizer. + growth_factor(float): The growth factor for the optimizer. + backoff_factor(float): The backoff factor for the optimizer. + growth_interval(int): The growth interval for the optimizer. + hysteresis(int): The hysteresis for the optimizer. + min_scale(float): The minimum scale for the optimizer. + max_scale(float): The maximum scale for the optimizer. + max_norm(float): The maximum norm for the optimizer. + norm_type(float): The norm type for the optimizer. + + """ + + def __init__(self, + stage: int = 3, + precision: str = 'fp16', + seed: int = 42, + placement_policy: str = 'cuda', + reduce_bucket_size: int = 12 * 1024**2, # only for stage 1&2 + overlap_communication: bool = True, # only for stage 1&2 + initial_scale: float = 2**16, + growth_factor: float = 2, + backoff_factor: float = 0.5, + growth_interval: int = 1000, + hysteresis: int = 2, + min_scale: float = 1, + max_scale: float = 2**32, + max_norm: float = 0.0, + norm_type: float = 2.0 + ) -> None: + + assert stage in (1, 2), f'Unsupported stage "{stage}"' + assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"' + assert precision in ('fp32', 'fp16'), f'Unsupported precision "{precision}"' + + plugin_initializer = lambda: LowLevelZeroPlugin( + # zero_config + stage=stage, + precision=precision, + # zero_optim_config + reduce_bucket_size_in_m=reduce_bucket_size, + overlap_communication=overlap_communication, + cpu_offload=(placement_policy == 'cpu'), + # optim_config + initial_scale=initial_scale, + growth_factor=growth_factor, + backoff_factor=backoff_factor, + growth_interval=growth_interval, + hysteresis=hysteresis, + min_scale=min_scale, + max_scale=max_scale, + max_norm=max_norm, + norm_type=norm_type + ) + + super().__init__(seed, plugin_initializer) + + def _post_init(self) -> None: + assert isinstance(self.plugin, LowLevelZeroPlugin), \ + f'{type(self).__name__}\'s plugin is not initialized properly.' + + def setup_distributed(self) -> None: + colossalai.launch_from_torch({}, seed=self.seed) + + def unwrap_model(self, model: nn.Module) -> nn.Module: + assert isinstance(model, LowLevelZeroModel) + return model.module + + def get_model_state_dict_shard(self, model: nn.Module, **config): + assert isinstance(model, LowLevelZeroModel) + yield from model.state_dict_shard(max_shard_size=1024, only_rank_0=False) + + +class GeminiStrategy(DDPStrategy): """ The strategy for training with ColossalAI. Args: - stage(int): The stage to use in ZeRO. Choose in (1, 2, 3) - precision(str): The precision to use. Choose in ('fp32', 'fp16'). Stage 3 only supports fp16. seed(int): The seed for the random number generator. shard_init(bool): Whether to shard the model parameters during initialization. Only for ZeRO-3. This is not compatible with `from_pretrained()`. We temporarily disable this and will support it in the future. @@ -37,8 +120,6 @@ class ColossalAIStrategy(DDPStrategy): hidden_dim(optional, int): The hidden dimension for the gemini. Only for ZeRO-3. min_chunk_size_m(float): The minimum chunk size divided by 2^20. Only for ZeRO-3. gpu_margin_mem_ratio(float): The margin memory ratio for the GPU. Only for ZeRO-3. - reduce_bucket_size(int): The reduce bucket size in bytes. Only for ZeRO-1 and ZeRO-2. - overlap_communication(bool): Whether to overlap communication and computation. Only for ZeRO-1 and ZeRO-2. initial_scale(float): The initial scale for the optimizer. growth_factor(float): The growth factor for the optimizer. backoff_factor(float): The backoff factor for the optimizer. @@ -51,132 +132,96 @@ class ColossalAIStrategy(DDPStrategy): """ - def __init__( - self, - stage: int = 3, - precision: str = 'fp16', - seed: int = 42, - shard_init: bool = False, # only for stage 3 - placement_policy: str = 'cuda', - pin_memory: bool = True, # only for stage 3 - force_outputs_fp32: bool = False, # only for stage 3 - search_range_m: int = 32, # only for stage 3 - hidden_dim: Optional[int] = None, # only for stage 3 - min_chunk_size_m: float = 32, # only for stage 3 - gpu_margin_mem_ratio: float = 0.0, # only for stage 3 - reduce_bucket_size: int = 12 * 1024**2, # only for stage 1&2 - overlap_communication: bool = True, # only for stage 1&2 - initial_scale: float = 2**16, - growth_factor: float = 2, - backoff_factor: float = 0.5, - growth_interval: int = 1000, - hysteresis: int = 2, - min_scale: float = 1, - max_scale: float = 2**32, - max_norm: float = 0.0, - norm_type: float = 2.0) -> None: - - assert stage in (1, 2, 3), f'Unsupported stage "{stage}"' + def __init__(self, + seed: int = 42, + shard_init: bool = False, # only for stage 3 + placement_policy: str = 'cuda', + pin_memory: bool = True, # only for stage 3 + force_outputs_fp32: bool = False, # only for stage 3 + search_range_m: int = 32, # only for stage 3 + hidden_dim: Optional[int] = None, # only for stage 3 + min_chunk_size_m: float = 32, # only for stage 3 + gpu_margin_mem_ratio: float = 0.0, # only for stage 3 + initial_scale: float = 2**16, + growth_factor: float = 2, + backoff_factor: float = 0.5, + growth_interval: int = 1000, + hysteresis: int = 2, + min_scale: float = 1, + max_scale: float = 2**32, + max_norm: float = 0.0, + norm_type: float = 2.0 + ) -> None: + assert placement_policy in ('cpu', 'cuda'), f'Unsupported placement policy "{placement_policy}"' - assert precision in ('fp32', 'fp16'), f'Unsupported precision "{precision}"' # TODO(ver217): support shard_init when using from_pretrained() if shard_init: - warnings.warn(f'Shard init is not supported model.from_pretrained() yet. ' - 'Please load weights after strategy.prepare()') - if stage == 3 and precision == 'fp32': - warnings.warn(f'Stage 3 only supports fp16. Precision is set to fp16.') - precision = 'fp16' - self.precision = precision + warnings.warn( + f'Shard init is not supported model.from_pretrained() yet. ' + 'Please load weights after strategy.prepare()' + ) self.shard_init = shard_init - optim_kwargs = dict(initial_scale=initial_scale, - growth_factor=growth_factor, - backoff_factor=backoff_factor, - growth_interval=growth_interval, - hysteresis=hysteresis, - min_scale=min_scale, - max_scale=max_scale, - max_norm=max_norm, - norm_type=norm_type) + warnings.warn(f'Stage 3 only supports fp16. Precision is set to fp16.') + # NOTE: dist should be initialized before calling get_current_device() - if stage == 3: - plugin_initializer = lambda: GeminiPlugin( - # gemini_config - device=get_current_device(), - placement_policy=placement_policy, - precision=precision, - pin_memory=pin_memory, - force_outputs_fp32=force_outputs_fp32, - strict_ddp_mode=shard_init, - search_range_m=search_range_m, - hidden_dim=hidden_dim, - min_chunk_size_m=min_chunk_size_m, - # zero_optim_config - gpu_margin_mem_ratio=gpu_margin_mem_ratio, - # optim_config - **optim_kwargs) - else: - plugin_initializer = lambda: LowLevelZeroPlugin( - # zero_config - stage=stage, - precision=precision, - # zero_optim_config - reduce_bucket_size_in_m=reduce_bucket_size, - overlap_communication=overlap_communication, - cpu_offload=(placement_policy == 'cpu'), - # optim_config - **optim_kwargs) + plugin_initializer = lambda: GeminiPlugin( + # gemini_config + device=get_current_device(), + placement_policy=placement_policy, + precision='fp16', + pin_memory=pin_memory, + force_outputs_fp32=force_outputs_fp32, + strict_ddp_mode=shard_init, + search_range_m=search_range_m, + hidden_dim=hidden_dim, + min_chunk_size_m=min_chunk_size_m, + # zero_optim_config + gpu_margin_mem_ratio=gpu_margin_mem_ratio, + # optim_config + initial_scale=initial_scale, + growth_factor=growth_factor, + backoff_factor=backoff_factor, + growth_interval=growth_interval, + hysteresis=hysteresis, + min_scale=min_scale, + max_scale=max_scale, + max_norm=max_norm, + norm_type=norm_type + ) super().__init__(seed, plugin_initializer) def _post_init(self) -> None: - assert isinstance(self.plugin, (LowLevelZeroPlugin, GeminiPlugin)), \ + assert isinstance(self.plugin, GeminiPlugin), \ f'{type(self).__name__}\'s plugin is not initialized properly.' def setup_distributed(self) -> None: colossalai.launch_from_torch({}, seed=self.seed) def model_init_context(self): - if isinstance(self.plugin, GeminiPlugin): - world_size = dist.get_world_size() - shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None - default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None - return ColoInitContext(device=get_current_device(), - dtype=torch.half, - default_pg=shard_pg, - default_dist_spec=default_dist_spec) - return super().model_init_context() + world_size = dist.get_world_size() + shard_pg = ProcessGroup(tp_degree=world_size) if self.shard_init else None + default_dist_spec = ShardSpec([-1], [world_size]) if self.shard_init else None + return ColoInitContext(device=get_current_device(), + dtype=torch.half, + default_pg=shard_pg, + default_dist_spec=default_dist_spec) def unwrap_model(self, model: nn.Module) -> nn.Module: - if isinstance(self.plugin, GeminiPlugin): - assert isinstance(model, GeminiModel) - ddp_model = model.unwrap() - assert isinstance(ddp_model, GeminiDDP) - return ddp_model.module - elif isinstance(self.plugin, LowLevelZeroPlugin): - assert isinstance(model, LowLevelZeroModel) - return model.module - else: - raise RuntimeError(f'Unsupported plugin {type(self.plugin)}') + assert isinstance(model, GeminiModel) + ddp_model = model.unwrap() + assert isinstance(ddp_model, GeminiDDP) + return ddp_model.module def save_pretrained(self, model: nn.Module, path: str, only_rank0: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: - if isinstance(self.plugin, GeminiPlugin): - raise RuntimeError('ColossalAI strategy with stage-3 does not support save_pretrained() now') - super().save_pretrained(model, path, only_rank0, tokenizer) + raise RuntimeError('ColossalAI strategy with stage-3 does not support save_pretrained() now') def get_model_state_dict_shard(self, model: nn.Module, **config): - if not isinstance(self.plugin, GeminiPlugin): - yield from super().get_model_state_dict_shard(model, **config) - else: - # unwrapped_model = self._unwrap_model(model) - # for module in unwrapped_model.modules(): - # if isinstance(module, LoraLinear): - # module.merge_weights = True - # module.eval() - assert isinstance(model, LowLevelZeroModel) - yield from model.state_dict_shard(max_shard_size=1024, only_rank_0=False) + assert isinstance(self.plugin, GeminiPlugin) + yield from super().get_model_state_dict_shard(model, **config) From 80e71e7c28bcec36202d6fd167e50c3f6e6ef3ad Mon Sep 17 00:00:00 2001 From: CWHer Date: Tue, 27 Jun 2023 15:13:13 +0800 Subject: [PATCH 08/10] fix: remove naive --- applications/Chat/benchmarks/ray/1mmt_dummy.py | 8 ++++---- applications/Chat/benchmarks/ray/mmmt_dummy.py | 8 ++++---- applications/Chat/coati/ray/utils.py | 4 +--- .../Chat/examples/community/ray/train_prompts_on_ray.py | 8 +++----- applications/Chat/examples/ray/1mmt_prompt.py | 8 ++++---- applications/Chat/examples/ray/mmmt_prompt.py | 8 ++++---- 6 files changed, 20 insertions(+), 24 deletions(-) diff --git a/applications/Chat/benchmarks/ray/1mmt_dummy.py b/applications/Chat/benchmarks/ray/1mmt_dummy.py index 9e8f36cefc4f..7fc990448805 100644 --- a/applications/Chat/benchmarks/ray/1mmt_dummy.py +++ b/applications/Chat/benchmarks/ray/1mmt_dummy.py @@ -83,8 +83,8 @@ def model_fn(): env_info=env_info_maker, kl_coef=0.1, debug=args.debug, - # sync_models_from_trainers=True, - # generation kwargs: + # sync_models_from_trainers=True, + # generation kwargs: max_length=512, do_sample=True, temperature=1.0, @@ -153,10 +153,10 @@ def build_dataloader(size): parser.add_argument('--num_trainers', type=int, default=1) parser.add_argument('--trainer_strategy', choices=[ - 'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', + 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', 'colossalai_zero2_cpu' ], - default='naive') + default='ddp') parser.add_argument('--maker_strategy', choices=['naive'], default='naive') parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) diff --git a/applications/Chat/benchmarks/ray/mmmt_dummy.py b/applications/Chat/benchmarks/ray/mmmt_dummy.py index 46a0062893b8..ca1df22070fc 100644 --- a/applications/Chat/benchmarks/ray/mmmt_dummy.py +++ b/applications/Chat/benchmarks/ray/mmmt_dummy.py @@ -87,8 +87,8 @@ def model_fn(): env_info=env_info_maker, kl_coef=0.1, debug=args.debug, - # sync_models_from_trainers=True, - # generation kwargs: + # sync_models_from_trainers=True, + # generation kwargs: max_length=512, do_sample=True, temperature=1.0, @@ -164,10 +164,10 @@ def build_dataloader(size): parser.add_argument('--num_trainers', type=int, default=1) parser.add_argument('--trainer_strategy', choices=[ - 'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', + 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', 'colossalai_zero2_cpu' ], - default='naive') + default='ddp') parser.add_argument('--maker_strategy', choices=['naive'], default='naive') parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py index 4361ee236771..3f116487a9a3 100644 --- a/applications/Chat/coati/ray/utils.py +++ b/applications/Chat/coati/ray/utils.py @@ -76,9 +76,7 @@ def get_reward_model_from_args(model: str, pretrained: str = None, config=None): def get_strategy_from_args(strategy: str): - if strategy == 'naive': - strategy_ = NaiveStrategy() - elif strategy == 'ddp': + if strategy == 'ddp': strategy_ = DDPStrategy() elif strategy == 'colossalai_gemini': strategy_ = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5) diff --git a/applications/Chat/examples/community/ray/train_prompts_on_ray.py b/applications/Chat/examples/community/ray/train_prompts_on_ray.py index 289330ad8415..933fe03ef23e 100644 --- a/applications/Chat/examples/community/ray/train_prompts_on_ray.py +++ b/applications/Chat/examples/community/ray/train_prompts_on_ray.py @@ -99,9 +99,7 @@ def make_experience(self, experience_computation_ref: ExperienceCompositionRefs) def _init_strategy(self, strategy: str): # configure strategy - if strategy == 'naive': - self._strategy = NaiveStrategy() - elif strategy == 'ddp': + if strategy == 'ddp': self._strategy = DDPStrategy() elif strategy == 'colossalai_gemini': self._strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5) @@ -534,8 +532,8 @@ def main(args): parser = argparse.ArgumentParser() parser.add_argument('--prompt_csv_url', type=str) parser.add_argument('--strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') + choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], + default='ddp') parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt']) parser.add_argument('--pretrain', type=str, default='gpt2') parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt') diff --git a/applications/Chat/examples/ray/1mmt_prompt.py b/applications/Chat/examples/ray/1mmt_prompt.py index afdd6a922cc7..5dd52f1790e6 100644 --- a/applications/Chat/examples/ray/1mmt_prompt.py +++ b/applications/Chat/examples/ray/1mmt_prompt.py @@ -103,8 +103,8 @@ def model_fn(): kl_coef=0.1, debug=args.debug, update_lora_weights=not (args.lora_rank == 0), - # sync_models_from_trainers=True, - # generation kwargs: + # sync_models_from_trainers=True, + # generation kwargs: max_length=512, do_sample=True, temperature=1.0, @@ -150,10 +150,10 @@ def tokenize_fn(texts): parser.add_argument('--num_trainers', type=int, default=1) parser.add_argument('--trainer_strategy', choices=[ - 'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', + 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', 'colossalai_zero2_cpu' ], - default='naive') + default='ddp') parser.add_argument('--maker_strategy', choices=['naive'], default='naive') parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) diff --git a/applications/Chat/examples/ray/mmmt_prompt.py b/applications/Chat/examples/ray/mmmt_prompt.py index fa7b2bd7edfd..60f049bd5b70 100644 --- a/applications/Chat/examples/ray/mmmt_prompt.py +++ b/applications/Chat/examples/ray/mmmt_prompt.py @@ -87,8 +87,8 @@ def model_fn(): kl_coef=0.1, debug=args.debug, update_lora_weights=not (args.lora_rank == 0), - # sync_models_from_trainers=True, - # generation kwargs: + # sync_models_from_trainers=True, + # generation kwargs: max_length=512, do_sample=True, temperature=1.0, @@ -163,10 +163,10 @@ def tokenize_fn(texts): parser.add_argument('--num_trainers', type=int, default=1) parser.add_argument('--trainer_strategy', choices=[ - 'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', + 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', 'colossalai_zero2_cpu' ], - default='naive') + default='ddp') parser.add_argument('--maker_strategy', choices=['naive'], default='naive') parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) From 2e259c3b39b428e1c0bce76b81a361bd23ed7ba0 Mon Sep 17 00:00:00 2001 From: CWHer Date: Tue, 27 Jun 2023 15:20:44 +0800 Subject: [PATCH 09/10] fix: align with modified colossal strategy --- .../benchmarks/benchmark_opt_lora_dummy.py | 20 +++++++++---------- .../Chat/coati/ray/detached_trainer_ppo.py | 4 ++-- applications/Chat/coati/ray/utils.py | 12 +++++------ .../trainer/callbacks/save_checkpoint.py | 4 ++-- applications/Chat/coati/trainer/ppo.py | 7 +++---- applications/Chat/coati/trainer/sft.py | 7 +++---- .../community/peft/train_peft_prompts.py | 6 +++--- .../examples/community/peft/train_peft_sft.py | 16 +++++++-------- .../community/ray/train_prompts_on_ray.py | 8 ++++---- applications/Chat/examples/train_prompts.py | 6 +++--- .../Chat/examples/train_reward_model.py | 6 +++--- applications/Chat/examples/train_sft.py | 8 ++++---- applications/Chat/tests/test_checkpoint.py | 6 +++--- applications/Chat/tests/test_data.py | 4 ++-- 14 files changed, 55 insertions(+), 59 deletions(-) diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py index 39f2f28eca16..90471ed727b0 100644 --- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py +++ b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py @@ -8,7 +8,7 @@ from coati.models.opt import OPTActor, OPTCritic from coati.trainer import PPOTrainer from coati.trainer.callbacks import PerformanceEvaluator -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, Strategy +from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy, Strategy from torch.optim import Adam from torch.utils.data import DataLoader from transformers import AutoTokenizer @@ -19,10 +19,8 @@ def get_model_numel(model: nn.Module, strategy: Strategy) -> int: numel = sum(p.numel() for p in model.parameters()) - if isinstance(strategy, ColossalAIStrategy): - from colossalai.booster.plugin import GeminiPlugin - if isinstance(strategy.plugin, GeminiPlugin) and strategy.shard_init: - numel *= dist.get_world_size() + if isinstance(strategy, GeminiStrategy) and strategy.shard_init: + numel *= dist.get_world_size() return numel @@ -78,17 +76,17 @@ def main(args): if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': - strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5) + strategy = GeminiStrategy(placement_policy='cuda', initial_scale=2**5) elif args.strategy == 'colossalai_gemini_cpu': - strategy = ColossalAIStrategy(stage=3, placement_policy='cpu', initial_scale=2**5) + strategy = GeminiStrategy(placement_policy='cpu', initial_scale=2**5) elif args.strategy == 'colossalai_zero2': - strategy = ColossalAIStrategy(stage=2, placement_policy='cuda') + strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda') elif args.strategy == 'colossalai_zero2_cpu': - strategy = ColossalAIStrategy(stage=2, placement_policy='cpu') + strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu') elif args.strategy == 'colossalai_zero1': - strategy = ColossalAIStrategy(stage=1, placement_policy='cuda') + strategy = LowLevelZeroStrategy(stage=1, placement_policy='cuda') elif args.strategy == 'colossalai_zero1_cpu': - strategy = ColossalAIStrategy(stage=1, placement_policy='cpu') + strategy = LowLevelZeroStrategy(stage=1, placement_policy='cpu') else: raise ValueError(f'Unsupported strategy "{args.strategy}"') diff --git a/applications/Chat/coati/ray/detached_trainer_ppo.py b/applications/Chat/coati/ray/detached_trainer_ppo.py index 5f0032716f93..2f2aa0e29579 100644 --- a/applications/Chat/coati/ray/detached_trainer_ppo.py +++ b/applications/Chat/coati/ray/detached_trainer_ppo.py @@ -6,7 +6,7 @@ from coati.models.base import Actor, Critic from coati.models.loss import PolicyLoss, ValueLoss from coati.trainer.callbacks import Callback -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy, Strategy +from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy, Strategy from torch.optim import Adam from colossalai.nn.optimizer import HybridAdam @@ -85,7 +85,7 @@ def __init__( evaluator = TrainerPerformanceEvaluator(actor_numel, critic_numel) callbacks = callbacks + [evaluator] - if isinstance(self.strategy, ColossalAIStrategy): + if isinstance(self.strategy, (LowLevelZeroStrategy, GeminiStrategy)): self.actor_optim = HybridAdam(self.actor.parameters(), lr=1e-7) self.critic_optim = HybridAdam(self.critic.parameters(), lr=1e-7) else: diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py index 3f116487a9a3..4f8e0b8a87e9 100644 --- a/applications/Chat/coati/ray/utils.py +++ b/applications/Chat/coati/ray/utils.py @@ -1,6 +1,6 @@ import os -from typing import Any, Callable, Dict, List, Optional from collections import OrderedDict +from typing import Any, Callable, Dict, List, Optional import torch import torch.distributed as dist @@ -10,7 +10,7 @@ from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM from coati.models.opt import OPTRM, OPTActor, OPTCritic from coati.models.roberta import RoBERTaActor, RoBERTaCritic, RoBERTaRM -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy +from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy from coati.utils import prepare_llama_tokenizer_and_embedding from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer, RobertaTokenizer @@ -79,13 +79,13 @@ def get_strategy_from_args(strategy: str): if strategy == 'ddp': strategy_ = DDPStrategy() elif strategy == 'colossalai_gemini': - strategy_ = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5) + strategy_ = GeminiStrategy(placement_policy='cuda', initial_scale=2**5) elif strategy == 'colossalai_zero2': - strategy_ = ColossalAIStrategy(stage=2, placement_policy='cuda') + strategy_ = LowLevelZeroStrategy(stage=2, placement_policy='cuda') elif strategy == 'colossalai_gemini_cpu': - strategy_ = ColossalAIStrategy(stage=3, placement_policy='cpu', initial_scale=2**5) + strategy_ = GeminiStrategy(placement_policy='cpu', initial_scale=2**5) elif strategy == 'colossalai_zero2_cpu': - strategy_ = ColossalAIStrategy(stage=2, placement_policy='cpu') + strategy_ = LowLevelZeroStrategy(stage=2, placement_policy='cpu') else: raise ValueError(f'Unsupported strategy "{strategy}"') return strategy_ diff --git a/applications/Chat/coati/trainer/callbacks/save_checkpoint.py b/applications/Chat/coati/trainer/callbacks/save_checkpoint.py index d2dcc0dd4c65..f0d77a191a88 100644 --- a/applications/Chat/coati/trainer/callbacks/save_checkpoint.py +++ b/applications/Chat/coati/trainer/callbacks/save_checkpoint.py @@ -1,7 +1,7 @@ import os import torch.distributed as dist -from coati.trainer.strategies import ColossalAIStrategy, Strategy +from coati.trainer.strategies import GeminiStrategy, LowLevelZeroStrategy, Strategy from coati.trainer.utils import is_rank_0 from torch import nn from torch.optim import Optimizer @@ -69,7 +69,7 @@ def on_episode_end(self, episode: int) -> None: # save optimizer if self.model_dict[model][1] is None: continue - only_rank0 = not isinstance(self.strategy, ColossalAIStrategy) + only_rank0 = not isinstance(self.strategy, (LowLevelZeroStrategy, GeminiStrategy)) rank = 0 if is_rank_0() else dist.get_rank() optim_path = os.path.join(base_path, f'{model}-optim-rank-{rank}.pt') self.strategy.save_optimizer(optimizer=self.model_dict[model][1], path=optim_path, only_rank0=only_rank0) diff --git a/applications/Chat/coati/trainer/ppo.py b/applications/Chat/coati/trainer/ppo.py index 451abe2a7438..4c4a1002e96d 100644 --- a/applications/Chat/coati/trainer/ppo.py +++ b/applications/Chat/coati/trainer/ppo.py @@ -15,7 +15,7 @@ from .base import OnPolicyTrainer from .callbacks import Callback -from .strategies import ColossalAIStrategy, Strategy +from .strategies import GeminiStrategy, Strategy from .utils import is_rank_0, to_device @@ -82,9 +82,8 @@ def __init__(self, callbacks: List[Callback] = [], **generate_kwargs ) -> None: - if isinstance(strategy, ColossalAIStrategy): - from colossalai.booster.plugin import GeminiPlugin - assert not (isinstance(strategy.plugin, GeminiPlugin) and offload_inference_models), \ + if isinstance(strategy, GeminiStrategy): + assert not offload_inference_models, \ "GeminiPlugin is not compatible with manual model.to('cpu')" buffer = NaiveReplayBuffer(train_batch_size, buffer_limit, buffer_cpu_offload) diff --git a/applications/Chat/coati/trainer/sft.py b/applications/Chat/coati/trainer/sft.py index 12c51d7a80c3..0812ba165286 100644 --- a/applications/Chat/coati/trainer/sft.py +++ b/applications/Chat/coati/trainer/sft.py @@ -12,7 +12,7 @@ from colossalai.logging import DistributedLogger from .base import SLTrainer -from .strategies import ColossalAIStrategy, Strategy +from .strategies import GeminiStrategy, Strategy from .utils import is_rank_0, to_device @@ -38,9 +38,8 @@ def __init__( max_epochs: int = 2, accumulation_steps: int = 8, ) -> None: - if accumulation_steps > 1 and isinstance(strategy, ColossalAIStrategy): - from colossalai.booster.plugin import GeminiPlugin - assert not isinstance(strategy.plugin, GeminiPlugin), \ + if accumulation_steps > 1: + assert not isinstance(strategy, GeminiStrategy), \ "Accumulation steps are not supported in stage 3 of ColossalAI" super().__init__(strategy, max_epochs, model, optim) diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py index 83b68a777474..9d8dbb38ae5d 100644 --- a/applications/Chat/examples/community/peft/train_peft_prompts.py +++ b/applications/Chat/examples/community/peft/train_peft_prompts.py @@ -9,7 +9,7 @@ from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM from coati.models.opt import OPTRM, OPTActor, OPTCritic from coati.trainer import PPOTrainer -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy +from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy from coati.utils import prepare_llama_tokenizer_and_embedding from easy_dataset import EasyPromptsDataset, EasySupervisedDataset from easy_models import BLOOMActor @@ -27,9 +27,9 @@ def main(args): if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': - strategy = ColossalAIStrategy(stage=3, placement_policy='cpu', initial_scale=2**5) + strategy = GeminiStrategy(placement_policy='cpu', initial_scale=2**5) elif args.strategy == 'colossalai_zero2': - strategy = ColossalAIStrategy(stage=2, placement_policy='cpu') + strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu') else: raise ValueError(f'Unsupported strategy "{args.strategy}"') diff --git a/applications/Chat/examples/community/peft/train_peft_sft.py b/applications/Chat/examples/community/peft/train_peft_sft.py index bdbbc5766802..54fe0ad55049 100644 --- a/applications/Chat/examples/community/peft/train_peft_sft.py +++ b/applications/Chat/examples/community/peft/train_peft_sft.py @@ -11,7 +11,7 @@ from coati.models.llama import LlamaLM from coati.models.opt import OPTLM from coati.trainer import SFTTrainer -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy +from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy from coati.utils import prepare_llama_tokenizer_and_embedding from datasets import load_dataset from easy_dataset import EasyDataset @@ -33,9 +33,9 @@ def train(args): if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': - strategy = ColossalAIStrategy(stage=3, placement_policy='cuda') + strategy = GeminiStrategy(placement_policy='cuda') elif args.strategy == 'colossalai_zero2': - strategy = ColossalAIStrategy(stage=2, placement_policy='cuda') + strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda') else: raise ValueError(f'Unsupported strategy "{args.strategy}"') @@ -43,15 +43,15 @@ def train(args): with strategy.model_init_context(): print('Warning: currently only bloom is tested, gpt2,llama and opt are not tested') model = AutoModelForCausalLM.from_pretrained(args.pretrain).to(torch.cuda.current_device()) - #if the args.save_path exists and args.save_path+'/adapter_config.json' exists, we'll load the adapter_config.json - if os.path.exists(args.save_path) and os.path.exists(args.save_path+'/adapter_config.json') \ - and os.path.exists(args.save_path+'/adapter_model.bin'): + # if the args.save_path exists and args.save_path+'/adapter_config.json' exists, we'll load the adapter_config.json + if os.path.exists(args.save_path) and os.path.exists(args.save_path + '/adapter_config.json') \ + and os.path.exists(args.save_path + '/adapter_model.bin'): print("loading from saved peft model ", args.save_path) model = PeftModel.from_pretrained(model, args.save_path) else: - #we'll use peft lora library to do the lora + # we'll use peft lora library to do the lora lora_rank = args.lora_rank if args.lora_rank > 0 else 32 - #config lora with rank of lora_rank + # config lora with rank of lora_rank lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=lora_rank, diff --git a/applications/Chat/examples/community/ray/train_prompts_on_ray.py b/applications/Chat/examples/community/ray/train_prompts_on_ray.py index 933fe03ef23e..1bba9ad66fbc 100644 --- a/applications/Chat/examples/community/ray/train_prompts_on_ray.py +++ b/applications/Chat/examples/community/ray/train_prompts_on_ray.py @@ -15,7 +15,7 @@ from coati.models.loss import PolicyLoss, ValueLoss from coati.models.opt import OPTActor, OPTCritic from coati.models.utils import compute_reward -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy +from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy from ray.util.placement_group import placement_group from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from torch.optim import Adam @@ -102,14 +102,14 @@ def _init_strategy(self, strategy: str): if strategy == 'ddp': self._strategy = DDPStrategy() elif strategy == 'colossalai_gemini': - self._strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5) + self._strategy = GeminiStrategy(placement_policy='cuda', initial_scale=2**5) elif strategy == 'colossalai_zero2': - self._strategy = ColossalAIStrategy(stage=2, placement_policy='cuda') + self._strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda') else: raise ValueError(f'Unsupported strategy "{strategy}"') def _init_optimizer(self): - if isinstance(self._strategy, ColossalAIStrategy): + if isinstance(self._strategy, (GeminiStrategy, LowLevelZeroStrategy)): self._optimizer = HybridAdam(self._model.parameters(), lr=5e-6) else: self._optimizer = Adam(self._model.parameters(), lr=5e-6) diff --git a/applications/Chat/examples/train_prompts.py b/applications/Chat/examples/train_prompts.py index b8b32a8176e5..c748eeb21065 100644 --- a/applications/Chat/examples/train_prompts.py +++ b/applications/Chat/examples/train_prompts.py @@ -9,7 +9,7 @@ from coati.models.opt import OPTRM, OPTActor, OPTCritic from coati.models.roberta import RoBERTaActor, RoBERTaCritic, RoBERTaRM from coati.trainer import PPOTrainer -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy +from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy from coati.utils import prepare_llama_tokenizer_and_embedding from torch.optim import Adam from torch.utils.data import DataLoader @@ -24,9 +24,9 @@ def main(args): if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': - strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5) + strategy = GeminiStrategy(placement_policy='cuda', initial_scale=2**5) elif args.strategy == 'colossalai_zero2': - strategy = ColossalAIStrategy(stage=2, placement_policy='cuda') + strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda') else: raise ValueError(f'Unsupported strategy "{args.strategy}"') diff --git a/applications/Chat/examples/train_reward_model.py b/applications/Chat/examples/train_reward_model.py index d1ab807b0461..e9618e0c1d5e 100644 --- a/applications/Chat/examples/train_reward_model.py +++ b/applications/Chat/examples/train_reward_model.py @@ -14,7 +14,7 @@ from coati.models.opt import OPTRM from coati.models.roberta import RoBERTaRM from coati.trainer import RewardModelTrainer -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy +from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy from coati.utils import prepare_llama_tokenizer_and_embedding from datasets import load_dataset from torch.optim import Adam @@ -32,9 +32,9 @@ def train(args): if args.strategy == 'ddp': strategy = DDPStrategy() elif args.strategy == 'colossalai_gemini': - strategy = ColossalAIStrategy(stage=3, placement_policy='cuda') + strategy = GeminiStrategy(placement_policy='cuda') elif args.strategy == 'colossalai_zero2': - strategy = ColossalAIStrategy(stage=2, placement_policy='cuda') + strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda') else: raise ValueError(f'Unsupported strategy "{args.strategy}"') diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py index cee798bac2b7..30becd8a68a1 100644 --- a/applications/Chat/examples/train_sft.py +++ b/applications/Chat/examples/train_sft.py @@ -8,7 +8,7 @@ from coati.dataset import DataCollatorForSupervisedDataset, SFTDataset, SupervisedDataset from coati.models import convert_to_lora_module from coati.trainer import SFTTrainer -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy +from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy from coati.utils import prepare_llama_tokenizer_and_embedding from datasets import load_dataset from torch.optim import Adam @@ -34,11 +34,11 @@ def train(args): elif args.strategy == 'colossalai_gemini': raise NotImplementedError( 'Gemini is not supported .from_pretrained() yet. We will update this after checkpoint io is ready.') - strategy = ColossalAIStrategy(stage=3, placement_policy='cuda') + strategy = GeminiStrategy(placement_policy='cuda') elif args.strategy == 'colossalai_zero2': - strategy = ColossalAIStrategy(stage=2, placement_policy='cuda') + strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda') elif args.strategy == 'colossalai_zero2_cpu': - strategy = ColossalAIStrategy(stage=2, placement_policy='cpu') + strategy = LowLevelZeroStrategy(stage=2, placement_policy='cpu') else: raise ValueError(f'Unsupported strategy "{args.strategy}"') diff --git a/applications/Chat/tests/test_checkpoint.py b/applications/Chat/tests/test_checkpoint.py index cfa39e44b476..19338da437ab 100644 --- a/applications/Chat/tests/test_checkpoint.py +++ b/applications/Chat/tests/test_checkpoint.py @@ -7,7 +7,7 @@ import torch.distributed as dist from coati.models.gpt import GPTActor from coati.models.utils import calc_action_log_probs -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy +from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy from transformers.models.gpt2.configuration_gpt2 import GPT2Config from colossalai.nn.optimizer import HybridAdam @@ -28,9 +28,9 @@ def run_test_checkpoint(strategy): if strategy == 'ddp': strategy = DDPStrategy() elif strategy == 'colossalai_gemini': - strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5) + strategy = GeminiStrategy(placement_policy='cuda', initial_scale=2**5) elif strategy == 'colossalai_zero2': - strategy = ColossalAIStrategy(stage=2, placement_policy='cuda') + strategy = LowLevelZeroStrategy(stage=2, placement_policy='cuda') else: raise ValueError(f'Unsupported strategy "{strategy}"') diff --git a/applications/Chat/tests/test_data.py b/applications/Chat/tests/test_data.py index 67016f6ed286..db641a6218b1 100644 --- a/applications/Chat/tests/test_data.py +++ b/applications/Chat/tests/test_data.py @@ -8,7 +8,7 @@ from coati.models.base import RewardModel from coati.models.gpt import GPTActor, GPTCritic from coati.replay_buffer import NaiveReplayBuffer -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy +from coati.trainer.strategies import DDPStrategy, GeminiStrategy from transformers.models.gpt2.configuration_gpt2 import GPT2Config from colossalai.testing import rerun_if_address_is_in_use, spawn @@ -39,7 +39,7 @@ def run_test_data(strategy): if strategy == 'ddp': strategy = DDPStrategy() elif strategy == 'colossalai': - strategy = ColossalAIStrategy(placement_policy='cuda') + strategy = GeminiStrategy(placement_policy='cuda') else: raise ValueError(f'Unsupported strategy "{strategy}"') From f1808a4660007032f20c890ebaffc29784f7b91e Mon Sep 17 00:00:00 2001 From: CWHer Date: Thu, 29 Jun 2023 15:32:59 +0800 Subject: [PATCH 10/10] fix: fix ddp _try_init_dist arg --- applications/Chat/coati/trainer/strategies/ddp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py index 2fdfebd2a6a8..e1c1bbf19f35 100644 --- a/applications/Chat/coati/trainer/strategies/ddp.py +++ b/applications/Chat/coati/trainer/strategies/ddp.py @@ -63,7 +63,7 @@ def _post_init(self) -> None: f'{type(self).__name__}\'s plugin is not initialized properly.' def setup_distributed(self) -> None: - self._try_init_dist(force=False) + self._try_init_dist(force=True) self.set_seed(self.seed) def set_seed(self, seed: int) -> None: