From 4a541aa27c26edde9bc9ef3421e72a5ff6693f04 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Mon, 12 Aug 2024 10:13:03 +0000 Subject: [PATCH 01/42] support pp training --- .../ColossalChat/coati/trainer/base.py | 3 + .../ColossalChat/coati/trainer/sft.py | 131 +++++++++++------- .../examples/training_scripts/train_sft.py | 4 +- 3 files changed, 84 insertions(+), 54 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/base.py b/applications/ColossalChat/coati/trainer/base.py index 63c903a51940..2e63fc5c8971 100755 --- a/applications/ColossalChat/coati/trainer/base.py +++ b/applications/ColossalChat/coati/trainer/base.py @@ -17,6 +17,7 @@ from torch.optim import Optimizer from colossalai.booster import Booster +from colossalai.booster import Plugin from .utils import is_rank_0 @@ -38,6 +39,7 @@ def __init__( max_epochs: int, model: nn.Module, optimizer: Optimizer, + plugin: Plugin, start_epoch: int = 0, ) -> None: super().__init__() @@ -45,6 +47,7 @@ def __init__( self.max_epochs = max_epochs self.model = model self.optimizer = optimizer + self.plugin = plugin self.start_epoch = start_epoch @abstractmethod diff --git a/applications/ColossalChat/coati/trainer/sft.py b/applications/ColossalChat/coati/trainer/sft.py index d37676ada3e0..ebdfd502491f 100755 --- a/applications/ColossalChat/coati/trainer/sft.py +++ b/applications/ColossalChat/coati/trainer/sft.py @@ -6,14 +6,16 @@ from typing import Optional import torch +import torch.distributed as dist from coati.trainer.utils import all_reduce_mean from coati.utils import AccumulativeMeanMeter, save_checkpoint from torch.optim import Optimizer from torch.optim.lr_scheduler import _LRScheduler from torch.utils.data import DataLoader -from tqdm import trange +from tqdm import tqdm, trange from colossalai.booster import Booster +from colossalai.booster.plugin import HybridParallelPlugin, Plugin from colossalai.cluster import DistCoordinator from .base import SLTrainer @@ -40,6 +42,7 @@ def __init__( optim: Optimizer, lr_scheduler: _LRScheduler, max_epochs: int = 2, + plugin: Plugin = None, accumulation_steps: int = 8, apply_loss_mask: bool = True, start_epoch=0, @@ -47,7 +50,7 @@ def __init__( save_dir: str = None, coordinator: Optional[DistCoordinator] = None, ) -> None: - super().__init__(booster, max_epochs, model, optim, start_epoch=start_epoch) + super().__init__(booster, max_epochs, model, optim, plugin, start_epoch=start_epoch) self.accumulation_steps = accumulation_steps self.scheduler = lr_scheduler @@ -94,60 +97,82 @@ def _before_fit( def _train(self, epoch: int): self.model.train() - step_bar = trange( - len(self.train_dataloader) // self.accumulation_steps, - desc=f"Epoch {epoch + 1}/{self.max_epochs}", - disable=not is_rank_0(), - ) - for i, batch in enumerate(self.train_dataloader): - batch = to_device(batch, torch.cuda.current_device()) - batch_size = batch["input_ids"].size(0) - outputs = self.model( - batch["input_ids"], - attention_mask=batch["attention_mask"], - labels=batch["labels"] if self.apply_loss_mask else batch["input_ids"], + if isinstance(self.plugin, HybridParallelPlugin) and self.plugin.pp_size > 1: + data_iter = iter(self.train_dataloader) + step_bar = tqdm( + range(len(self.train_dataloader)), + desc="Step", + disable=not (dist.get_rank() == dist.get_world_size() - 1), ) - loss = outputs.loss - - self.booster.backward(loss=loss, optimizer=self.optimizer) - - loss_mean = all_reduce_mean(tensor=loss) - self.accumulative_meter.add("loss", loss_mean.to(torch.float16).item()) - - # Gradient accumulation - if 
(i + 1) % self.accumulation_steps == 0: + for step in step_bar: + outputs = self.booster.execute_pipeline( + data_iter, + self.model, + criterion=lambda outputs, inputs: outputs[0], + optimizer=self.optimizer, + return_loss=True, + ) + loss = outputs["loss"] + if dist.get_rank() == dist.get_world_size() - 1: + step_bar.set_postfix({"train/loss": loss.item()}) + step_bar.update() self.optimizer.step() self.optimizer.zero_grad() - self.scheduler.step() - - step_bar.set_postfix({"train/loss": self.accumulative_meter.get("loss")}) - if self.writer: - self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), self.num_train_step) - self.writer.add_scalar("train/lr", self.scheduler.get_last_lr()[0], self.num_train_step) - self.num_train_step += 1 - self.accumulative_meter.reset() - step_bar.update() - - # Save checkpoint - if ( - self.save_dir is not None - and self.save_interval is not None - and (self.num_train_step + 1) % self.save_interval == 0 - ): - save_checkpoint( - save_dir=self.save_dir, - booster=self.booster, - model=self.model, - optimizer=self.optimizer, - lr_scheduler=self.scheduler, - epoch=epoch, - step=self.num_train_step + 1, - batch_size=batch_size, - coordinator=self.coordinator, - ) - self.coordinator.print_on_master( - f"Saved checkpoint at epoch {epoch} step {self.num_train_step} at folder {self.save_dir}" - ) + else: + step_bar = trange( + len(self.train_dataloader) // self.accumulation_steps, + desc=f"Epoch {epoch + 1}/{self.max_epochs}", + disable=not is_rank_0(), + ) + for i, batch in enumerate(self.train_dataloader): + batch = to_device(batch, torch.cuda.current_device()) + batch_size = batch["input_ids"].size(0) + outputs = self.model( + batch["input_ids"], + attention_mask=batch["attention_mask"], + labels=batch["labels"] if self.apply_loss_mask else batch["input_ids"], + ) + loss = outputs.loss + + self.booster.backward(loss=loss, optimizer=self.optimizer) + + loss_mean = all_reduce_mean(tensor=loss) + self.accumulative_meter.add("loss", loss_mean.to(torch.float16).item()) + + # Gradient accumulation + if (i + 1) % self.accumulation_steps == 0: + self.optimizer.step() + self.optimizer.zero_grad() + self.scheduler.step() + + step_bar.set_postfix({"train/loss": self.accumulative_meter.get("loss")}) + if self.writer: + self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), self.num_train_step) + self.writer.add_scalar("train/lr", self.scheduler.get_last_lr()[0], self.num_train_step) + self.num_train_step += 1 + self.accumulative_meter.reset() + step_bar.update() + + # Save checkpoint + if ( + self.save_dir is not None + and self.save_interval is not None + and (self.num_train_step + 1) % self.save_interval == 0 + ): + save_checkpoint( + save_dir=self.save_dir, + booster=self.booster, + model=self.model, + optimizer=self.optimizer, + lr_scheduler=self.scheduler, + epoch=epoch, + step=self.num_train_step + 1, + batch_size=batch_size, + coordinator=self.coordinator, + ) + self.coordinator.print_on_master( + f"Saved checkpoint at epoch {epoch} step {self.num_train_step} at folder {self.save_dir}" + ) step_bar.close() def _eval(self, epoch: int): diff --git a/applications/ColossalChat/examples/training_scripts/train_sft.py b/applications/ColossalChat/examples/training_scripts/train_sft.py index c4ef3b783d4d..62acad32f66a 100755 --- a/applications/ColossalChat/examples/training_scripts/train_sft.py +++ b/applications/ColossalChat/examples/training_scripts/train_sft.py @@ -114,7 +114,7 @@ def train(args): parallel_output=False, 
max_norm=args.grad_clip, precision=args.mixed_precision, - microbatch_size=args.batch_size, + microbatch_size=args.microbatch_size, ) else: raise ValueError(f"Unknown plugin {args.plugin}") @@ -269,6 +269,7 @@ def train(args): model=model, booster=booster, optim=optim, + plugin=plugin, lr_scheduler=lr_scheduler, max_epochs=args.max_epochs, accumulation_steps=args.accumulation_steps, @@ -344,6 +345,7 @@ def train(args): parser.add_argument("--use_wandb", default=False, action="store_true") parser.add_argument("--grad_checkpoint", default=False, action="store_true") parser.add_argument("--use_flash_attn", default=False, action="store_true") + parser.add_argument("--microbatch_size", type=int, default=1) args = parser.parse_args() if args.config_file is not None: os.makedirs(os.path.dirname(args.config_file), exist_ok=True) From 515f8e4a438c2520bbdb89561bd502651fa75158 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Aug 2024 10:15:34 +0000 Subject: [PATCH 02/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- applications/ColossalChat/coati/trainer/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/base.py b/applications/ColossalChat/coati/trainer/base.py index 2e63fc5c8971..bef4ccc3e078 100755 --- a/applications/ColossalChat/coati/trainer/base.py +++ b/applications/ColossalChat/coati/trainer/base.py @@ -16,8 +16,7 @@ from coati.experience_maker import Experience from torch.optim import Optimizer -from colossalai.booster import Booster -from colossalai.booster import Plugin +from colossalai.booster import Booster, Plugin from .utils import is_rank_0 From 123107ff288a5a9d95efd26e1f8968a7a6183009 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Mon, 12 Aug 2024 11:27:42 +0000 Subject: [PATCH 03/42] update rm --- applications/ColossalChat/coati/trainer/rm.py | 5 +++-- .../ColossalChat/examples/training_scripts/train_rm.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/rm.py b/applications/ColossalChat/coati/trainer/rm.py index b9e84ef557fa..849a90a27c16 100755 --- a/applications/ColossalChat/coati/trainer/rm.py +++ b/applications/ColossalChat/coati/trainer/rm.py @@ -15,7 +15,7 @@ from torch.utils.data import DataLoader from transformers import PreTrainedTokenizerBase -from colossalai.booster import Booster +from colossalai.booster import Booster, Plugin from colossalai.cluster import DistCoordinator from colossalai.utils import get_current_device @@ -48,6 +48,7 @@ def __init__( model: Any, booster: Booster, optimizer: Optimizer, + plugin: Plugin, lr_scheduler: _LRScheduler, tokenizer: PreTrainedTokenizerBase, loss_fn: Optional[Callable] = None, @@ -59,7 +60,7 @@ def __init__( save_dir: str = None, coordinator: DistCoordinator = None, ) -> None: - super().__init__(booster, max_epochs=max_epochs, model=model, optimizer=optimizer, start_epoch=start_epoch) + super().__init__(booster, max_epochs=max_epochs, model=model, optimizer=optimizer, plugin=plugin, start_epoch=start_epoch) self.actor_scheduler = lr_scheduler self.tokenizer = tokenizer self.loss_fn = loss_fn if loss_fn is not None else LogSigLoss(beta=beta) diff --git a/applications/ColossalChat/examples/training_scripts/train_rm.py b/applications/ColossalChat/examples/training_scripts/train_rm.py index 4c0a782b4766..5ea1a06acc36 100755 --- 
a/applications/ColossalChat/examples/training_scripts/train_rm.py +++ b/applications/ColossalChat/examples/training_scripts/train_rm.py @@ -262,6 +262,7 @@ def train(args): model, booster, optim, + plugin, lr_scheduler, tokenizer, loss_fn=loss_fn, From 2c926141f335ccaef5d630287be50588122587e7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Aug 2024 11:29:21 +0000 Subject: [PATCH 04/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- applications/ColossalChat/coati/trainer/rm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/rm.py b/applications/ColossalChat/coati/trainer/rm.py index 849a90a27c16..82e4625b9c8e 100755 --- a/applications/ColossalChat/coati/trainer/rm.py +++ b/applications/ColossalChat/coati/trainer/rm.py @@ -48,7 +48,7 @@ def __init__( model: Any, booster: Booster, optimizer: Optimizer, - plugin: Plugin, + plugin: Plugin, lr_scheduler: _LRScheduler, tokenizer: PreTrainedTokenizerBase, loss_fn: Optional[Callable] = None, @@ -60,7 +60,9 @@ def __init__( save_dir: str = None, coordinator: DistCoordinator = None, ) -> None: - super().__init__(booster, max_epochs=max_epochs, model=model, optimizer=optimizer, plugin=plugin, start_epoch=start_epoch) + super().__init__( + booster, max_epochs=max_epochs, model=model, optimizer=optimizer, plugin=plugin, start_epoch=start_epoch + ) self.actor_scheduler = lr_scheduler self.tokenizer = tokenizer self.loss_fn = loss_fn if loss_fn is not None else LogSigLoss(beta=beta) From 7d9907f0aef9208a4e933acc041b1346e986574d Mon Sep 17 00:00:00 2001 From: Tong Li Date: Mon, 12 Aug 2024 11:35:14 +0000 Subject: [PATCH 05/42] refactor --- applications/ColossalChat/coati/trainer/dpo.py | 5 +++-- applications/ColossalChat/coati/trainer/kto.py | 5 +++-- applications/ColossalChat/coati/trainer/orpo.py | 5 +++-- .../ColossalChat/examples/training_scripts/train_dpo.py | 1 + .../ColossalChat/examples/training_scripts/train_kto.py | 1 + .../ColossalChat/examples/training_scripts/train_orpo.py | 1 + 6 files changed, 12 insertions(+), 6 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/dpo.py b/applications/ColossalChat/coati/trainer/dpo.py index 24ddca6545c8..063ea233ee39 100755 --- a/applications/ColossalChat/coati/trainer/dpo.py +++ b/applications/ColossalChat/coati/trainer/dpo.py @@ -16,7 +16,7 @@ from tqdm import trange from transformers import PreTrainedTokenizerBase -from colossalai.booster import Booster +from colossalai.booster import Booster, Plugin from colossalai.cluster import DistCoordinator from colossalai.utils import get_current_device @@ -50,6 +50,7 @@ def __init__( ref_model: Any, booster: Booster, actor_optim: Optimizer, + plugin: Plugin, actor_lr_scheduler: _LRScheduler, tokenizer: PreTrainedTokenizerBase, max_epochs: int = 1, @@ -63,7 +64,7 @@ def __init__( save_dir: str = None, coordinator: DistCoordinator = None, ) -> None: - super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, start_epoch=start_epoch) + super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch) self.ref_model = ref_model self.actor_scheduler = actor_lr_scheduler self.tokenizer = tokenizer diff --git a/applications/ColossalChat/coati/trainer/kto.py b/applications/ColossalChat/coati/trainer/kto.py index 6462ba816686..dd7dabfe69d7 100755 --- 
a/applications/ColossalChat/coati/trainer/kto.py +++ b/applications/ColossalChat/coati/trainer/kto.py @@ -17,7 +17,7 @@ from tqdm import trange from transformers import PreTrainedTokenizerBase -from colossalai.booster import Booster +from colossalai.booster import Booster, Plugin from colossalai.cluster import DistCoordinator from colossalai.utils import get_current_device @@ -53,6 +53,7 @@ def __init__( ref_model: Any, booster: Booster, actor_optim: Optimizer, + plugin: Plugin, actor_lr_scheduler: _LRScheduler, tokenizer: PreTrainedTokenizerBase, max_epochs: int = 1, @@ -66,7 +67,7 @@ def __init__( save_dir: str = None, coordinator: DistCoordinator = None, ) -> None: - super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, start_epoch=start_epoch) + super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch) self.ref_model = ref_model self.actor_scheduler = actor_lr_scheduler self.tokenizer = tokenizer diff --git a/applications/ColossalChat/coati/trainer/orpo.py b/applications/ColossalChat/coati/trainer/orpo.py index c2f75771cdff..9a3adcd73150 100644 --- a/applications/ColossalChat/coati/trainer/orpo.py +++ b/applications/ColossalChat/coati/trainer/orpo.py @@ -16,7 +16,7 @@ from tqdm import trange from transformers import PreTrainedTokenizerBase -from colossalai.booster import Booster +from colossalai.booster import Booster, Plugin from colossalai.cluster import DistCoordinator from colossalai.utils import get_current_device @@ -48,6 +48,7 @@ def __init__( actor: Any, booster: Booster, actor_optim: Optimizer, + plugin: Plugin, actor_lr_scheduler: _LRScheduler, tokenizer: PreTrainedTokenizerBase, max_epochs: int = 1, @@ -59,7 +60,7 @@ def __init__( save_dir: str = None, coordinator: DistCoordinator = None, ) -> None: - super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, start_epoch=start_epoch) + super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch) self.actor_scheduler = actor_lr_scheduler self.tokenizer = tokenizer self.odds_ratio_loss_fn = OddsRatioLoss() diff --git a/applications/ColossalChat/examples/training_scripts/train_dpo.py b/applications/ColossalChat/examples/training_scripts/train_dpo.py index d88750aebc8f..3b324ee784e0 100755 --- a/applications/ColossalChat/examples/training_scripts/train_dpo.py +++ b/applications/ColossalChat/examples/training_scripts/train_dpo.py @@ -267,6 +267,7 @@ def train(args): ref_model=ref_model, booster=booster, actor_optim=optim, + plugin=plugin, actor_lr_scheduler=lr_scheduler, tokenizer=tokenizer, max_epochs=args.max_epochs, diff --git a/applications/ColossalChat/examples/training_scripts/train_kto.py b/applications/ColossalChat/examples/training_scripts/train_kto.py index 598fd8062fcf..931c1657710e 100755 --- a/applications/ColossalChat/examples/training_scripts/train_kto.py +++ b/applications/ColossalChat/examples/training_scripts/train_kto.py @@ -286,6 +286,7 @@ def train(args): ref_model=ref_model, booster=booster, actor_optim=optim, + plugin=plugin, actor_lr_scheduler=lr_scheduler, tokenizer=tokenizer, max_epochs=args.max_epochs, diff --git a/applications/ColossalChat/examples/training_scripts/train_orpo.py b/applications/ColossalChat/examples/training_scripts/train_orpo.py index 87860f7ea023..0f2fbfa2ba44 100755 --- a/applications/ColossalChat/examples/training_scripts/train_orpo.py +++ 
b/applications/ColossalChat/examples/training_scripts/train_orpo.py @@ -250,6 +250,7 @@ def train(args): actor=model, booster=booster, actor_optim=optim, + plugin=plugin, actor_lr_scheduler=lr_scheduler, tokenizer=tokenizer, max_epochs=args.max_epochs, From 49f7428cbf5232bc7c3e8cf7bf493adaf0084a25 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Aug 2024 11:36:42 +0000 Subject: [PATCH 06/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- applications/ColossalChat/coati/trainer/dpo.py | 4 +++- applications/ColossalChat/coati/trainer/kto.py | 4 +++- applications/ColossalChat/coati/trainer/orpo.py | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/dpo.py b/applications/ColossalChat/coati/trainer/dpo.py index 063ea233ee39..faa7a90d92de 100755 --- a/applications/ColossalChat/coati/trainer/dpo.py +++ b/applications/ColossalChat/coati/trainer/dpo.py @@ -64,7 +64,9 @@ def __init__( save_dir: str = None, coordinator: DistCoordinator = None, ) -> None: - super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch) + super().__init__( + booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch + ) self.ref_model = ref_model self.actor_scheduler = actor_lr_scheduler self.tokenizer = tokenizer diff --git a/applications/ColossalChat/coati/trainer/kto.py b/applications/ColossalChat/coati/trainer/kto.py index dd7dabfe69d7..f0b23afb667f 100755 --- a/applications/ColossalChat/coati/trainer/kto.py +++ b/applications/ColossalChat/coati/trainer/kto.py @@ -67,7 +67,9 @@ def __init__( save_dir: str = None, coordinator: DistCoordinator = None, ) -> None: - super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch) + super().__init__( + booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch + ) self.ref_model = ref_model self.actor_scheduler = actor_lr_scheduler self.tokenizer = tokenizer diff --git a/applications/ColossalChat/coati/trainer/orpo.py b/applications/ColossalChat/coati/trainer/orpo.py index 9a3adcd73150..761fd305a6ff 100644 --- a/applications/ColossalChat/coati/trainer/orpo.py +++ b/applications/ColossalChat/coati/trainer/orpo.py @@ -60,7 +60,9 @@ def __init__( save_dir: str = None, coordinator: DistCoordinator = None, ) -> None: - super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch) + super().__init__( + booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch + ) self.actor_scheduler = actor_lr_scheduler self.tokenizer = tokenizer self.odds_ratio_loss_fn = OddsRatioLoss() From a8356da3c7125fdda2d4f7c0a944063589a590a5 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 13 Aug 2024 02:45:53 +0000 Subject: [PATCH 07/42] update test case --- applications/ColossalChat/tests/test_train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index c26b25c837e6..621f664498c8 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -30,7 +30,7 @@ MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models 
MODELS_DIR=$TEMP_DIR/models_config # Skip those tests due to CI tests timeout MODELS=('llama') -ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu') # pp is still buggy +ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu', 'tp_pp', 'pp') PLUGINS=('zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu') LORA_RANK=('0') # skip to reduce CI execution time, can pass all locally LORA_CONFIG_ENABLE="--lora_config $BASE_DIR/examples/training_scripts/lora_config.json" From 8ce504d05cc32e625e5112f83790fa558b5a4997 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 13 Aug 2024 02:47:52 +0000 Subject: [PATCH 08/42] fix --- applications/ColossalChat/tests/test_train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index 621f664498c8..f81b31550a60 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -30,7 +30,7 @@ MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models MODELS_DIR=$TEMP_DIR/models_config # Skip those tests due to CI tests timeout MODELS=('llama') -ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu', 'tp_pp', 'pp') +ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu' 'tp_pp' 'pp') PLUGINS=('zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu') LORA_RANK=('0') # skip to reduce CI execution time, can pass all locally LORA_CONFIG_ENABLE="--lora_config $BASE_DIR/examples/training_scripts/lora_config.json" From 4a5bfc55a65e7a54341a4f7ceb32542190a4eeaf Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 13 Aug 2024 04:02:21 +0000 Subject: [PATCH 09/42] change to 4 --- applications/ColossalChat/tests/test_train.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index fd8a5960bc85..8bc895c7fdfd 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -15,7 +15,7 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() { echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" } -set_n_least_used_CUDA_VISIBLE_DEVICES 2 +set_n_least_used_CUDA_VISIBLE_DEVICES 4 set -xu @@ -175,7 +175,7 @@ for lora_rank in ${LORA_RANK[@]}; do for split in $(seq -f "%05g" 0 0); do dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_sft/arrow/part-$split") done - colossalai run --nproc_per_node 2 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \ + colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \ --pretrain $pretrain \ --tokenizer_dir $tokenizer_dir \ --dataset ${dataset[@]} \ From 0b2b454b97d55d1f974c28951fc5465b4ff24a8b Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 13 Aug 2024 06:48:54 +0000 Subject: [PATCH 10/42] fix eval --- .../ColossalChat/coati/trainer/sft.py | 82 +++++++++++++------ 1 file changed, 59 insertions(+), 23 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/sft.py b/applications/ColossalChat/coati/trainer/sft.py index ebdfd502491f..6322cb8df029 100755 --- a/applications/ColossalChat/coati/trainer/sft.py +++ b/applications/ColossalChat/coati/trainer/sft.py @@ -182,27 +182,63 @@ def _eval(self, epoch: int): self.accumulative_meter.reset() self.model.eval() with 
torch.no_grad(): - step_bar = trange( - len(self.eval_dataloader), - desc=f"Epoch {epoch + 1}/{self.max_epochs}", - disable=not is_rank_0(), - ) - for batch in self.eval_dataloader: - batch = to_device(batch, torch.cuda.current_device()) - outputs = self.model( - batch["input_ids"], - attention_mask=batch["attention_mask"], - labels=batch["labels"] if self.apply_loss_mask else batch["input_ids"], + if isinstance(self.plugin, HybridParallelPlugin) and self.plugin.pp_size > 1: + data_iter = iter(self.eval_dataloader) + step_bar = tqdm( + range(len(self.eval_dataloader)), + desc="Step", + disable=not (dist.get_rank() == dist.get_world_size() - 1), ) - loss_mean = all_reduce_mean(tensor=outputs.loss) - self.accumulative_meter.add("loss", loss_mean.item(), count_update=batch["input_ids"].size(0)) - step_bar.update() - loss_mean = self.accumulative_meter.get("loss") - msg = "Evaluation Result:\n" - for tag in ["loss"]: - msg = msg + f"{tag}: {self.accumulative_meter.get(tag)}\n" - self.coordinator.print_on_master(msg) - os.makedirs(self.save_dir, exist_ok=True) - with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f: - f.write(msg) - step_bar.close() + for step in step_bar: + outputs = self.booster.execute_pipeline( + data_iter, + self.model, + criterion=lambda outputs, inputs: outputs[0], + optimizer=self.optimizer, + return_loss=True, + ) + loss = outputs["loss"] + if dist.get_rank() == dist.get_world_size() - 1: + step_bar.set_postfix({"eval/loss": loss.item()}) + self.accumulative_meter.add("loss", loss.item()) + step_bar.update() + + if dist.get_rank() == dist.get_world_size() - 1: + loss_mean = self.accumulative_meter.get("loss") + msg = "Evaluation Result:\n" + for tag in ["loss"]: + msg = msg + f"{tag}: {self.accumulative_meter.get(tag)}\n" + print(msg) + if self.save_dir is not None: + os.makedirs(self.save_dir, exist_ok=True) + with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f: + f.write(msg) + step_bar.close() + + else: + step_bar = trange( + len(self.eval_dataloader), + desc=f"Epoch {epoch + 1}/{self.max_epochs}", + disable=not is_rank_0(), + ) + for batch in self.eval_dataloader: + batch = to_device(batch, torch.cuda.current_device()) + outputs = self.model( + batch["input_ids"], + attention_mask=batch["attention_mask"], + labels=batch["labels"] if self.apply_loss_mask else batch["input_ids"], + ) + loss_mean = all_reduce_mean(tensor=outputs.loss) + self.accumulative_meter.add("loss", loss_mean.item(), count_update=batch["input_ids"].size(0)) + step_bar.update() + + loss_mean = self.accumulative_meter.get("loss") + msg = "Evaluation Result:\n" + for tag in ["loss"]: + msg = msg + f"{tag}: {self.accumulative_meter.get(tag)}\n" + self.coordinator.print_on_master(msg) + if self.save_dir is not None: + os.makedirs(self.save_dir, exist_ok=True) + with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f: + f.write(msg) + step_bar.close() From 74ee10e77dfa9cf242d2df5a321831927db679c8 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 13 Aug 2024 07:32:25 +0000 Subject: [PATCH 11/42] test --- applications/ColossalChat/tests/test_train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index 8bc895c7fdfd..8666b52a556b 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -30,7 +30,7 @@ MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models 
MODELS_DIR=$TEMP_DIR/models_config # Skip those tests due to CI tests timeout MODELS=('llama') -ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu' 'tp_pp' 'pp') +ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu') PLUGINS=('zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu') LORA_RANK=('0') # skip to reduce CI execution time, can pass all locally LORA_CONFIG_ENABLE="--lora_config $BASE_DIR/examples/training_scripts/lora_config.json" From 22218d31e1f7093f4f117418dfa54d5c35db1790 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 13 Aug 2024 07:53:10 +0000 Subject: [PATCH 12/42] add pp --- applications/ColossalChat/tests/test_train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index 8666b52a556b..7b3b4ab4ff61 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -30,7 +30,7 @@ MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models MODELS_DIR=$TEMP_DIR/models_config # Skip those tests due to CI tests timeout MODELS=('llama') -ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu') +ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu' 'pp') PLUGINS=('zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu') LORA_RANK=('0') # skip to reduce CI execution time, can pass all locally LORA_CONFIG_ENABLE="--lora_config $BASE_DIR/examples/training_scripts/lora_config.json" From 2422341d0360900062de317bd31ced22e5bb6b07 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 13 Aug 2024 09:35:03 +0000 Subject: [PATCH 13/42] hotfix --- applications/ColossalChat/tests/test_train.sh | 2 +- colossalai/booster/plugin/hybrid_parallel_plugin.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index 7b3b4ab4ff61..3b06495cb46a 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -30,7 +30,7 @@ MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models MODELS_DIR=$TEMP_DIR/models_config # Skip those tests due to CI tests timeout MODELS=('llama') -ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu' 'pp') +ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu' 'pp' 'tp_pp') PLUGINS=('zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu') LORA_RANK=('0') # skip to reduce CI execution time, can pass all locally LORA_CONFIG_ENABLE="--lora_config $BASE_DIR/examples/training_scripts/lora_config.json" diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py index d2933a4afe7f..faf1f0218b02 100644 --- a/colossalai/booster/plugin/hybrid_parallel_plugin.py +++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py @@ -1328,7 +1328,7 @@ def execute_pipeline( # run with gradients accumulation if model.require_grad_sync == False or ( isinstance(optimizer, HybridParallelZeroOptimizer) and optimizer.require_grad_sync == False - ): + ) or not torch.is_grad_enabled(): return outputs # Synchronize the grads of shared parameters of the model. 
From 2789c9ee6d4e0c3067f42988ce2b595e797876a6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 13 Aug 2024 09:36:22 +0000 Subject: [PATCH 14/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- colossalai/booster/plugin/hybrid_parallel_plugin.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py index faf1f0218b02..e5acdb05172a 100644 --- a/colossalai/booster/plugin/hybrid_parallel_plugin.py +++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py @@ -1326,9 +1326,11 @@ def execute_pipeline( ) # run with gradients accumulation - if model.require_grad_sync == False or ( - isinstance(optimizer, HybridParallelZeroOptimizer) and optimizer.require_grad_sync == False - ) or not torch.is_grad_enabled(): + if ( + model.require_grad_sync == False + or (isinstance(optimizer, HybridParallelZeroOptimizer) and optimizer.require_grad_sync == False) + or not torch.is_grad_enabled() + ): return outputs # Synchronize the grads of shared parameters of the model. From 38c84a1aa0d1ed74e2540611cda9f1a64579d0f9 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Mon, 12 Aug 2024 10:13:03 +0000 Subject: [PATCH 15/42] support pp training --- .../ColossalChat/coati/trainer/base.py | 3 + .../ColossalChat/coati/trainer/sft.py | 131 +++++++++++------- .../examples/training_scripts/train_sft.py | 4 +- 3 files changed, 84 insertions(+), 54 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/base.py b/applications/ColossalChat/coati/trainer/base.py index 63c903a51940..2e63fc5c8971 100755 --- a/applications/ColossalChat/coati/trainer/base.py +++ b/applications/ColossalChat/coati/trainer/base.py @@ -17,6 +17,7 @@ from torch.optim import Optimizer from colossalai.booster import Booster +from colossalai.booster import Plugin from .utils import is_rank_0 @@ -38,6 +39,7 @@ def __init__( max_epochs: int, model: nn.Module, optimizer: Optimizer, + plugin: Plugin, start_epoch: int = 0, ) -> None: super().__init__() @@ -45,6 +47,7 @@ def __init__( self.max_epochs = max_epochs self.model = model self.optimizer = optimizer + self.plugin = plugin self.start_epoch = start_epoch @abstractmethod diff --git a/applications/ColossalChat/coati/trainer/sft.py b/applications/ColossalChat/coati/trainer/sft.py index d37676ada3e0..ebdfd502491f 100755 --- a/applications/ColossalChat/coati/trainer/sft.py +++ b/applications/ColossalChat/coati/trainer/sft.py @@ -6,14 +6,16 @@ from typing import Optional import torch +import torch.distributed as dist from coati.trainer.utils import all_reduce_mean from coati.utils import AccumulativeMeanMeter, save_checkpoint from torch.optim import Optimizer from torch.optim.lr_scheduler import _LRScheduler from torch.utils.data import DataLoader -from tqdm import trange +from tqdm import tqdm, trange from colossalai.booster import Booster +from colossalai.booster.plugin import HybridParallelPlugin, Plugin from colossalai.cluster import DistCoordinator from .base import SLTrainer @@ -40,6 +42,7 @@ def __init__( optim: Optimizer, lr_scheduler: _LRScheduler, max_epochs: int = 2, + plugin: Plugin = None, accumulation_steps: int = 8, apply_loss_mask: bool = True, start_epoch=0, @@ -47,7 +50,7 @@ def __init__( save_dir: str = None, coordinator: Optional[DistCoordinator] = None, ) -> None: - super().__init__(booster, max_epochs, model, optim, 
start_epoch=start_epoch) + super().__init__(booster, max_epochs, model, optim, plugin, start_epoch=start_epoch) self.accumulation_steps = accumulation_steps self.scheduler = lr_scheduler @@ -94,60 +97,82 @@ def _before_fit( def _train(self, epoch: int): self.model.train() - step_bar = trange( - len(self.train_dataloader) // self.accumulation_steps, - desc=f"Epoch {epoch + 1}/{self.max_epochs}", - disable=not is_rank_0(), - ) - for i, batch in enumerate(self.train_dataloader): - batch = to_device(batch, torch.cuda.current_device()) - batch_size = batch["input_ids"].size(0) - outputs = self.model( - batch["input_ids"], - attention_mask=batch["attention_mask"], - labels=batch["labels"] if self.apply_loss_mask else batch["input_ids"], + if isinstance(self.plugin, HybridParallelPlugin) and self.plugin.pp_size > 1: + data_iter = iter(self.train_dataloader) + step_bar = tqdm( + range(len(self.train_dataloader)), + desc="Step", + disable=not (dist.get_rank() == dist.get_world_size() - 1), ) - loss = outputs.loss - - self.booster.backward(loss=loss, optimizer=self.optimizer) - - loss_mean = all_reduce_mean(tensor=loss) - self.accumulative_meter.add("loss", loss_mean.to(torch.float16).item()) - - # Gradient accumulation - if (i + 1) % self.accumulation_steps == 0: + for step in step_bar: + outputs = self.booster.execute_pipeline( + data_iter, + self.model, + criterion=lambda outputs, inputs: outputs[0], + optimizer=self.optimizer, + return_loss=True, + ) + loss = outputs["loss"] + if dist.get_rank() == dist.get_world_size() - 1: + step_bar.set_postfix({"train/loss": loss.item()}) + step_bar.update() self.optimizer.step() self.optimizer.zero_grad() - self.scheduler.step() - - step_bar.set_postfix({"train/loss": self.accumulative_meter.get("loss")}) - if self.writer: - self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), self.num_train_step) - self.writer.add_scalar("train/lr", self.scheduler.get_last_lr()[0], self.num_train_step) - self.num_train_step += 1 - self.accumulative_meter.reset() - step_bar.update() - - # Save checkpoint - if ( - self.save_dir is not None - and self.save_interval is not None - and (self.num_train_step + 1) % self.save_interval == 0 - ): - save_checkpoint( - save_dir=self.save_dir, - booster=self.booster, - model=self.model, - optimizer=self.optimizer, - lr_scheduler=self.scheduler, - epoch=epoch, - step=self.num_train_step + 1, - batch_size=batch_size, - coordinator=self.coordinator, - ) - self.coordinator.print_on_master( - f"Saved checkpoint at epoch {epoch} step {self.num_train_step} at folder {self.save_dir}" - ) + else: + step_bar = trange( + len(self.train_dataloader) // self.accumulation_steps, + desc=f"Epoch {epoch + 1}/{self.max_epochs}", + disable=not is_rank_0(), + ) + for i, batch in enumerate(self.train_dataloader): + batch = to_device(batch, torch.cuda.current_device()) + batch_size = batch["input_ids"].size(0) + outputs = self.model( + batch["input_ids"], + attention_mask=batch["attention_mask"], + labels=batch["labels"] if self.apply_loss_mask else batch["input_ids"], + ) + loss = outputs.loss + + self.booster.backward(loss=loss, optimizer=self.optimizer) + + loss_mean = all_reduce_mean(tensor=loss) + self.accumulative_meter.add("loss", loss_mean.to(torch.float16).item()) + + # Gradient accumulation + if (i + 1) % self.accumulation_steps == 0: + self.optimizer.step() + self.optimizer.zero_grad() + self.scheduler.step() + + step_bar.set_postfix({"train/loss": self.accumulative_meter.get("loss")}) + if self.writer: + 
self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), self.num_train_step) + self.writer.add_scalar("train/lr", self.scheduler.get_last_lr()[0], self.num_train_step) + self.num_train_step += 1 + self.accumulative_meter.reset() + step_bar.update() + + # Save checkpoint + if ( + self.save_dir is not None + and self.save_interval is not None + and (self.num_train_step + 1) % self.save_interval == 0 + ): + save_checkpoint( + save_dir=self.save_dir, + booster=self.booster, + model=self.model, + optimizer=self.optimizer, + lr_scheduler=self.scheduler, + epoch=epoch, + step=self.num_train_step + 1, + batch_size=batch_size, + coordinator=self.coordinator, + ) + self.coordinator.print_on_master( + f"Saved checkpoint at epoch {epoch} step {self.num_train_step} at folder {self.save_dir}" + ) step_bar.close() def _eval(self, epoch: int): diff --git a/applications/ColossalChat/examples/training_scripts/train_sft.py b/applications/ColossalChat/examples/training_scripts/train_sft.py index c4ef3b783d4d..62acad32f66a 100755 --- a/applications/ColossalChat/examples/training_scripts/train_sft.py +++ b/applications/ColossalChat/examples/training_scripts/train_sft.py @@ -114,7 +114,7 @@ def train(args): parallel_output=False, max_norm=args.grad_clip, precision=args.mixed_precision, - microbatch_size=args.batch_size, + microbatch_size=args.microbatch_size, ) else: raise ValueError(f"Unknown plugin {args.plugin}") @@ -269,6 +269,7 @@ def train(args): model=model, booster=booster, optim=optim, + plugin=plugin, lr_scheduler=lr_scheduler, max_epochs=args.max_epochs, accumulation_steps=args.accumulation_steps, @@ -344,6 +345,7 @@ def train(args): parser.add_argument("--use_wandb", default=False, action="store_true") parser.add_argument("--grad_checkpoint", default=False, action="store_true") parser.add_argument("--use_flash_attn", default=False, action="store_true") + parser.add_argument("--microbatch_size", type=int, default=1) args = parser.parse_args() if args.config_file is not None: os.makedirs(os.path.dirname(args.config_file), exist_ok=True) From 5a24b0dc31e2e64d5ebfd7db9edc2aa170be9ef7 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Mon, 12 Aug 2024 11:27:42 +0000 Subject: [PATCH 16/42] update rm --- applications/ColossalChat/coati/trainer/rm.py | 5 +++-- .../ColossalChat/examples/training_scripts/train_rm.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/rm.py b/applications/ColossalChat/coati/trainer/rm.py index b9e84ef557fa..849a90a27c16 100755 --- a/applications/ColossalChat/coati/trainer/rm.py +++ b/applications/ColossalChat/coati/trainer/rm.py @@ -15,7 +15,7 @@ from torch.utils.data import DataLoader from transformers import PreTrainedTokenizerBase -from colossalai.booster import Booster +from colossalai.booster import Booster, Plugin from colossalai.cluster import DistCoordinator from colossalai.utils import get_current_device @@ -48,6 +48,7 @@ def __init__( model: Any, booster: Booster, optimizer: Optimizer, + plugin: Plugin, lr_scheduler: _LRScheduler, tokenizer: PreTrainedTokenizerBase, loss_fn: Optional[Callable] = None, @@ -59,7 +60,7 @@ def __init__( save_dir: str = None, coordinator: DistCoordinator = None, ) -> None: - super().__init__(booster, max_epochs=max_epochs, model=model, optimizer=optimizer, start_epoch=start_epoch) + super().__init__(booster, max_epochs=max_epochs, model=model, optimizer=optimizer, plugin=plugin, start_epoch=start_epoch) self.actor_scheduler = lr_scheduler self.tokenizer = tokenizer 
self.loss_fn = loss_fn if loss_fn is not None else LogSigLoss(beta=beta) diff --git a/applications/ColossalChat/examples/training_scripts/train_rm.py b/applications/ColossalChat/examples/training_scripts/train_rm.py index 4c0a782b4766..5ea1a06acc36 100755 --- a/applications/ColossalChat/examples/training_scripts/train_rm.py +++ b/applications/ColossalChat/examples/training_scripts/train_rm.py @@ -262,6 +262,7 @@ def train(args): model, booster, optim, + plugin, lr_scheduler, tokenizer, loss_fn=loss_fn, From f965ac856622362731e4e9e5ab69512dc6d88ed1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Aug 2024 10:15:34 +0000 Subject: [PATCH 17/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- applications/ColossalChat/coati/trainer/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/base.py b/applications/ColossalChat/coati/trainer/base.py index 2e63fc5c8971..bef4ccc3e078 100755 --- a/applications/ColossalChat/coati/trainer/base.py +++ b/applications/ColossalChat/coati/trainer/base.py @@ -16,8 +16,7 @@ from coati.experience_maker import Experience from torch.optim import Optimizer -from colossalai.booster import Booster -from colossalai.booster import Plugin +from colossalai.booster import Booster, Plugin from .utils import is_rank_0 From ba80449d62819de52d27389d00ebc097c91ce3ab Mon Sep 17 00:00:00 2001 From: Tong Li Date: Mon, 12 Aug 2024 11:35:14 +0000 Subject: [PATCH 18/42] refactor --- applications/ColossalChat/coati/trainer/dpo.py | 5 +++-- applications/ColossalChat/coati/trainer/kto.py | 5 +++-- applications/ColossalChat/coati/trainer/orpo.py | 5 +++-- .../ColossalChat/examples/training_scripts/train_dpo.py | 1 + .../ColossalChat/examples/training_scripts/train_kto.py | 1 + .../ColossalChat/examples/training_scripts/train_orpo.py | 1 + 6 files changed, 12 insertions(+), 6 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/dpo.py b/applications/ColossalChat/coati/trainer/dpo.py index 24ddca6545c8..063ea233ee39 100755 --- a/applications/ColossalChat/coati/trainer/dpo.py +++ b/applications/ColossalChat/coati/trainer/dpo.py @@ -16,7 +16,7 @@ from tqdm import trange from transformers import PreTrainedTokenizerBase -from colossalai.booster import Booster +from colossalai.booster import Booster, Plugin from colossalai.cluster import DistCoordinator from colossalai.utils import get_current_device @@ -50,6 +50,7 @@ def __init__( ref_model: Any, booster: Booster, actor_optim: Optimizer, + plugin: Plugin, actor_lr_scheduler: _LRScheduler, tokenizer: PreTrainedTokenizerBase, max_epochs: int = 1, @@ -63,7 +64,7 @@ def __init__( save_dir: str = None, coordinator: DistCoordinator = None, ) -> None: - super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, start_epoch=start_epoch) + super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch) self.ref_model = ref_model self.actor_scheduler = actor_lr_scheduler self.tokenizer = tokenizer diff --git a/applications/ColossalChat/coati/trainer/kto.py b/applications/ColossalChat/coati/trainer/kto.py index 6462ba816686..dd7dabfe69d7 100755 --- a/applications/ColossalChat/coati/trainer/kto.py +++ b/applications/ColossalChat/coati/trainer/kto.py @@ -17,7 +17,7 @@ from tqdm import trange from transformers import PreTrainedTokenizerBase -from colossalai.booster import 
Booster +from colossalai.booster import Booster, Plugin from colossalai.cluster import DistCoordinator from colossalai.utils import get_current_device @@ -53,6 +53,7 @@ def __init__( ref_model: Any, booster: Booster, actor_optim: Optimizer, + plugin: Plugin, actor_lr_scheduler: _LRScheduler, tokenizer: PreTrainedTokenizerBase, max_epochs: int = 1, @@ -66,7 +67,7 @@ def __init__( save_dir: str = None, coordinator: DistCoordinator = None, ) -> None: - super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, start_epoch=start_epoch) + super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch) self.ref_model = ref_model self.actor_scheduler = actor_lr_scheduler self.tokenizer = tokenizer diff --git a/applications/ColossalChat/coati/trainer/orpo.py b/applications/ColossalChat/coati/trainer/orpo.py index c2f75771cdff..9a3adcd73150 100644 --- a/applications/ColossalChat/coati/trainer/orpo.py +++ b/applications/ColossalChat/coati/trainer/orpo.py @@ -16,7 +16,7 @@ from tqdm import trange from transformers import PreTrainedTokenizerBase -from colossalai.booster import Booster +from colossalai.booster import Booster, Plugin from colossalai.cluster import DistCoordinator from colossalai.utils import get_current_device @@ -48,6 +48,7 @@ def __init__( actor: Any, booster: Booster, actor_optim: Optimizer, + plugin: Plugin, actor_lr_scheduler: _LRScheduler, tokenizer: PreTrainedTokenizerBase, max_epochs: int = 1, @@ -59,7 +60,7 @@ def __init__( save_dir: str = None, coordinator: DistCoordinator = None, ) -> None: - super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, start_epoch=start_epoch) + super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch) self.actor_scheduler = actor_lr_scheduler self.tokenizer = tokenizer self.odds_ratio_loss_fn = OddsRatioLoss() diff --git a/applications/ColossalChat/examples/training_scripts/train_dpo.py b/applications/ColossalChat/examples/training_scripts/train_dpo.py index d88750aebc8f..3b324ee784e0 100755 --- a/applications/ColossalChat/examples/training_scripts/train_dpo.py +++ b/applications/ColossalChat/examples/training_scripts/train_dpo.py @@ -267,6 +267,7 @@ def train(args): ref_model=ref_model, booster=booster, actor_optim=optim, + plugin=plugin, actor_lr_scheduler=lr_scheduler, tokenizer=tokenizer, max_epochs=args.max_epochs, diff --git a/applications/ColossalChat/examples/training_scripts/train_kto.py b/applications/ColossalChat/examples/training_scripts/train_kto.py index 598fd8062fcf..931c1657710e 100755 --- a/applications/ColossalChat/examples/training_scripts/train_kto.py +++ b/applications/ColossalChat/examples/training_scripts/train_kto.py @@ -286,6 +286,7 @@ def train(args): ref_model=ref_model, booster=booster, actor_optim=optim, + plugin=plugin, actor_lr_scheduler=lr_scheduler, tokenizer=tokenizer, max_epochs=args.max_epochs, diff --git a/applications/ColossalChat/examples/training_scripts/train_orpo.py b/applications/ColossalChat/examples/training_scripts/train_orpo.py index 87860f7ea023..0f2fbfa2ba44 100755 --- a/applications/ColossalChat/examples/training_scripts/train_orpo.py +++ b/applications/ColossalChat/examples/training_scripts/train_orpo.py @@ -250,6 +250,7 @@ def train(args): actor=model, booster=booster, actor_optim=optim, + plugin=plugin, actor_lr_scheduler=lr_scheduler, tokenizer=tokenizer, max_epochs=args.max_epochs, From 
e6245485fc3086dc7b3ba2833e2ab385a718ef3c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Aug 2024 11:29:21 +0000 Subject: [PATCH 19/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- applications/ColossalChat/coati/trainer/rm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/rm.py b/applications/ColossalChat/coati/trainer/rm.py index 849a90a27c16..82e4625b9c8e 100755 --- a/applications/ColossalChat/coati/trainer/rm.py +++ b/applications/ColossalChat/coati/trainer/rm.py @@ -48,7 +48,7 @@ def __init__( model: Any, booster: Booster, optimizer: Optimizer, - plugin: Plugin, + plugin: Plugin, lr_scheduler: _LRScheduler, tokenizer: PreTrainedTokenizerBase, loss_fn: Optional[Callable] = None, @@ -60,7 +60,9 @@ def __init__( save_dir: str = None, coordinator: DistCoordinator = None, ) -> None: - super().__init__(booster, max_epochs=max_epochs, model=model, optimizer=optimizer, plugin=plugin, start_epoch=start_epoch) + super().__init__( + booster, max_epochs=max_epochs, model=model, optimizer=optimizer, plugin=plugin, start_epoch=start_epoch + ) self.actor_scheduler = lr_scheduler self.tokenizer = tokenizer self.loss_fn = loss_fn if loss_fn is not None else LogSigLoss(beta=beta) From 0ed8efcaa0ffff7fbb5e801f85b23017eeaf8aac Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Aug 2024 11:36:42 +0000 Subject: [PATCH 20/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- applications/ColossalChat/coati/trainer/dpo.py | 4 +++- applications/ColossalChat/coati/trainer/kto.py | 4 +++- applications/ColossalChat/coati/trainer/orpo.py | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/dpo.py b/applications/ColossalChat/coati/trainer/dpo.py index 063ea233ee39..faa7a90d92de 100755 --- a/applications/ColossalChat/coati/trainer/dpo.py +++ b/applications/ColossalChat/coati/trainer/dpo.py @@ -64,7 +64,9 @@ def __init__( save_dir: str = None, coordinator: DistCoordinator = None, ) -> None: - super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch) + super().__init__( + booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch + ) self.ref_model = ref_model self.actor_scheduler = actor_lr_scheduler self.tokenizer = tokenizer diff --git a/applications/ColossalChat/coati/trainer/kto.py b/applications/ColossalChat/coati/trainer/kto.py index dd7dabfe69d7..f0b23afb667f 100755 --- a/applications/ColossalChat/coati/trainer/kto.py +++ b/applications/ColossalChat/coati/trainer/kto.py @@ -67,7 +67,9 @@ def __init__( save_dir: str = None, coordinator: DistCoordinator = None, ) -> None: - super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch) + super().__init__( + booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch + ) self.ref_model = ref_model self.actor_scheduler = actor_lr_scheduler self.tokenizer = tokenizer diff --git a/applications/ColossalChat/coati/trainer/orpo.py b/applications/ColossalChat/coati/trainer/orpo.py index 9a3adcd73150..761fd305a6ff 100644 --- a/applications/ColossalChat/coati/trainer/orpo.py 
+++ b/applications/ColossalChat/coati/trainer/orpo.py @@ -60,7 +60,9 @@ def __init__( save_dir: str = None, coordinator: DistCoordinator = None, ) -> None: - super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch) + super().__init__( + booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, plugin=plugin, start_epoch=start_epoch + ) self.actor_scheduler = actor_lr_scheduler self.tokenizer = tokenizer self.odds_ratio_loss_fn = OddsRatioLoss() From 5e968b9e80923138f80af77cb20e063fe3dfd7c3 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 13 Aug 2024 02:45:53 +0000 Subject: [PATCH 21/42] update test case --- applications/ColossalChat/tests/test_train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index 69036de635c9..b31b0af197b2 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -30,7 +30,7 @@ MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models MODELS_DIR=$TEMP_DIR/models_config # Skip those tests due to CI tests timeout MODELS=('llama') -ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu') # pp is still buggy +ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu', 'tp_pp', 'pp') PLUGINS=('zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu') LORA_RANK=('0') # skip to reduce CI execution time, can pass all locally LORA_CONFIG_ENABLE="--lora_config $BASE_DIR/examples/training_scripts/lora_config.json" From 9f31a261d940bb0086a19aa8d1f170122ef15ec1 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 13 Aug 2024 02:47:52 +0000 Subject: [PATCH 22/42] fix --- applications/ColossalChat/tests/test_train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index b31b0af197b2..fd8a5960bc85 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -30,7 +30,7 @@ MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models MODELS_DIR=$TEMP_DIR/models_config # Skip those tests due to CI tests timeout MODELS=('llama') -ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu', 'tp_pp', 'pp') +ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu' 'tp_pp' 'pp') PLUGINS=('zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu') LORA_RANK=('0') # skip to reduce CI execution time, can pass all locally LORA_CONFIG_ENABLE="--lora_config $BASE_DIR/examples/training_scripts/lora_config.json" From fc5299c694e9bc6a3156dccf158a55a7adca6a0b Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 13 Aug 2024 04:02:21 +0000 Subject: [PATCH 23/42] change to 4 --- applications/ColossalChat/tests/test_train.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index fd8a5960bc85..8bc895c7fdfd 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -15,7 +15,7 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() { echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" } -set_n_least_used_CUDA_VISIBLE_DEVICES 2 +set_n_least_used_CUDA_VISIBLE_DEVICES 4 
set -xu @@ -175,7 +175,7 @@ for lora_rank in ${LORA_RANK[@]}; do for split in $(seq -f "%05g" 0 0); do dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_sft/arrow/part-$split") done - colossalai run --nproc_per_node 2 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \ + colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \ --pretrain $pretrain \ --tokenizer_dir $tokenizer_dir \ --dataset ${dataset[@]} \ From 024764884423dad36bf8ca4dede99090d613a028 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 13 Aug 2024 06:48:54 +0000 Subject: [PATCH 24/42] fix eval --- .../ColossalChat/coati/trainer/sft.py | 82 +++++++++++++------ 1 file changed, 59 insertions(+), 23 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/sft.py b/applications/ColossalChat/coati/trainer/sft.py index ebdfd502491f..6322cb8df029 100755 --- a/applications/ColossalChat/coati/trainer/sft.py +++ b/applications/ColossalChat/coati/trainer/sft.py @@ -182,27 +182,63 @@ def _eval(self, epoch: int): self.accumulative_meter.reset() self.model.eval() with torch.no_grad(): - step_bar = trange( - len(self.eval_dataloader), - desc=f"Epoch {epoch + 1}/{self.max_epochs}", - disable=not is_rank_0(), - ) - for batch in self.eval_dataloader: - batch = to_device(batch, torch.cuda.current_device()) - outputs = self.model( - batch["input_ids"], - attention_mask=batch["attention_mask"], - labels=batch["labels"] if self.apply_loss_mask else batch["input_ids"], + if isinstance(self.plugin, HybridParallelPlugin) and self.plugin.pp_size > 1: + data_iter = iter(self.eval_dataloader) + step_bar = tqdm( + range(len(self.eval_dataloader)), + desc="Step", + disable=not (dist.get_rank() == dist.get_world_size() - 1), ) - loss_mean = all_reduce_mean(tensor=outputs.loss) - self.accumulative_meter.add("loss", loss_mean.item(), count_update=batch["input_ids"].size(0)) - step_bar.update() - loss_mean = self.accumulative_meter.get("loss") - msg = "Evaluation Result:\n" - for tag in ["loss"]: - msg = msg + f"{tag}: {self.accumulative_meter.get(tag)}\n" - self.coordinator.print_on_master(msg) - os.makedirs(self.save_dir, exist_ok=True) - with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f: - f.write(msg) - step_bar.close() + for step in step_bar: + outputs = self.booster.execute_pipeline( + data_iter, + self.model, + criterion=lambda outputs, inputs: outputs[0], + optimizer=self.optimizer, + return_loss=True, + ) + loss = outputs["loss"] + if dist.get_rank() == dist.get_world_size() - 1: + step_bar.set_postfix({"eval/loss": loss.item()}) + self.accumulative_meter.add("loss", loss.item()) + step_bar.update() + + if dist.get_rank() == dist.get_world_size() - 1: + loss_mean = self.accumulative_meter.get("loss") + msg = "Evaluation Result:\n" + for tag in ["loss"]: + msg = msg + f"{tag}: {self.accumulative_meter.get(tag)}\n" + print(msg) + if self.save_dir is not None: + os.makedirs(self.save_dir, exist_ok=True) + with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f: + f.write(msg) + step_bar.close() + + else: + step_bar = trange( + len(self.eval_dataloader), + desc=f"Epoch {epoch + 1}/{self.max_epochs}", + disable=not is_rank_0(), + ) + for batch in self.eval_dataloader: + batch = to_device(batch, torch.cuda.current_device()) + outputs = self.model( + batch["input_ids"], + attention_mask=batch["attention_mask"], + labels=batch["labels"] if self.apply_loss_mask else batch["input_ids"], + ) + loss_mean = 
all_reduce_mean(tensor=outputs.loss) + self.accumulative_meter.add("loss", loss_mean.item(), count_update=batch["input_ids"].size(0)) + step_bar.update() + + loss_mean = self.accumulative_meter.get("loss") + msg = "Evaluation Result:\n" + for tag in ["loss"]: + msg = msg + f"{tag}: {self.accumulative_meter.get(tag)}\n" + self.coordinator.print_on_master(msg) + if self.save_dir is not None: + os.makedirs(self.save_dir, exist_ok=True) + with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f: + f.write(msg) + step_bar.close() From bf8e3a0e1554561de09aa0f6383e1bd89dbbf04a Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 13 Aug 2024 07:32:25 +0000 Subject: [PATCH 25/42] test --- applications/ColossalChat/tests/test_train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index 8bc895c7fdfd..8666b52a556b 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -30,7 +30,7 @@ MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models MODELS_DIR=$TEMP_DIR/models_config # Skip those tests due to CI tests timeout MODELS=('llama') -ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu' 'tp_pp' 'pp') +ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu') PLUGINS=('zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu') LORA_RANK=('0') # skip to reduce CI execution time, can pass all locally LORA_CONFIG_ENABLE="--lora_config $BASE_DIR/examples/training_scripts/lora_config.json" From dd05dd0d83462b237392ef18bb1873f14420b7cb Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 13 Aug 2024 07:53:10 +0000 Subject: [PATCH 26/42] add pp --- applications/ColossalChat/tests/test_train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index 8666b52a556b..7b3b4ab4ff61 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -30,7 +30,7 @@ MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models MODELS_DIR=$TEMP_DIR/models_config # Skip those tests due to CI tests timeout MODELS=('llama') -ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu') +ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu' 'pp') PLUGINS=('zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu') LORA_RANK=('0') # skip to reduce CI execution time, can pass all locally LORA_CONFIG_ENABLE="--lora_config $BASE_DIR/examples/training_scripts/lora_config.json" From a8840a090f40d7e9d5f1b8e13921976775605c90 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 13 Aug 2024 09:35:03 +0000 Subject: [PATCH 27/42] hotfix --- applications/ColossalChat/tests/test_train.sh | 2 +- colossalai/booster/plugin/hybrid_parallel_plugin.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index 7b3b4ab4ff61..3b06495cb46a 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -30,7 +30,7 @@ MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models MODELS_DIR=$TEMP_DIR/models_config # Skip those tests due to CI tests timeout MODELS=('llama') 
-ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu' 'pp') +ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu' 'pp' 'tp_pp') PLUGINS=('zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu') LORA_RANK=('0') # skip to reduce CI execution time, can pass all locally LORA_CONFIG_ENABLE="--lora_config $BASE_DIR/examples/training_scripts/lora_config.json" diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py index e5acdb05172a..faf1f0218b02 100644 --- a/colossalai/booster/plugin/hybrid_parallel_plugin.py +++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py @@ -1326,11 +1326,9 @@ def execute_pipeline( ) # run with gradients accumulation - if ( - model.require_grad_sync == False - or (isinstance(optimizer, HybridParallelZeroOptimizer) and optimizer.require_grad_sync == False) - or not torch.is_grad_enabled() - ): + if model.require_grad_sync == False or ( + isinstance(optimizer, HybridParallelZeroOptimizer) and optimizer.require_grad_sync == False + ) or not torch.is_grad_enabled(): return outputs # Synchronize the grads of shared parameters of the model. From 3629b36517eb637a410f98ab69a19768890ff5aa Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 13 Aug 2024 09:36:22 +0000 Subject: [PATCH 28/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- colossalai/booster/plugin/hybrid_parallel_plugin.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py index faf1f0218b02..e5acdb05172a 100644 --- a/colossalai/booster/plugin/hybrid_parallel_plugin.py +++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py @@ -1326,9 +1326,11 @@ def execute_pipeline( ) # run with gradients accumulation - if model.require_grad_sync == False or ( - isinstance(optimizer, HybridParallelZeroOptimizer) and optimizer.require_grad_sync == False - ) or not torch.is_grad_enabled(): + if ( + model.require_grad_sync == False + or (isinstance(optimizer, HybridParallelZeroOptimizer) and optimizer.require_grad_sync == False) + or not torch.is_grad_enabled() + ): return outputs # Synchronize the grads of shared parameters of the model. From 409f4b5ab39fa48f7494815de704bc171e37522a Mon Sep 17 00:00:00 2001 From: Tong Li Date: Wed, 14 Aug 2024 07:19:34 +0000 Subject: [PATCH 29/42] update --- colossalai/booster/plugin/hybrid_parallel_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py index e5acdb05172a..e359957f579d 100644 --- a/colossalai/booster/plugin/hybrid_parallel_plugin.py +++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py @@ -1332,7 +1332,7 @@ def execute_pipeline( or not torch.is_grad_enabled() ): return outputs - + print("Show torch status:", torch.is_grad_enabled()) # Synchronize the grads of shared parameters of the model. model.sync_shared_params() # Synchronize sequence parallelism gradients of the model. 
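Note on the hotfix above and the patch that follows: the guard in execute_pipeline returns the pipeline outputs early whenever gradient synchronization is disabled or autograd is globally off, and the SFT evaluation loop introduced earlier in this series calls execute_pipeline inside torch.no_grad(). The snippet below is a minimal, self-contained sketch of that interaction — it does not use the real ColossalAI classes, it omits the HybridParallelZeroOptimizer clause of the original condition, and should_skip_grad_sync is a hypothetical helper named purely for illustration.

import torch

# Minimal sketch (hypothetical helper, not ColossalAI code): mirrors the
# early-return condition of execute_pipeline shown in the hunk above, with the
# HybridParallelZeroOptimizer clause left out for brevity.
def should_skip_grad_sync(model_require_grad_sync: bool) -> bool:
    # Grad sync is skipped while gradients are still being accumulated
    # (require_grad_sync is False) or while autograd is disabled,
    # e.g. inside a torch.no_grad() evaluation pass.
    return (not model_require_grad_sync) or (not torch.is_grad_enabled())

# The evaluation loop wraps execute_pipeline in torch.no_grad(), so the early
# return always fires there; this is what the temporary debug print above was
# checking and what the next patch works around by skipping pp evaluation.
with torch.no_grad():
    assert should_skip_grad_sync(True)
assert not should_skip_grad_sync(True)  # with autograd on and sync enabled, grads do sync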
From e87cd8bcfb33f6a8274f4beb0ec002e555329989 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Wed, 14 Aug 2024 10:07:45 +0000 Subject: [PATCH 30/42] skip pp eval --- applications/ColossalChat/tests/test_train.sh | 74 +++++++++++++------ .../booster/plugin/hybrid_parallel_plugin.py | 2 +- 2 files changed, 52 insertions(+), 24 deletions(-) diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index 3b06495cb46a..2935a6369986 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -91,7 +91,7 @@ SKIPPED_TESTS=( llama-gemini_auto-20 # gemini_auto plugin doesn't support lora llama-gemini-20 # gemini doesn't support lora ) - +skip_eval=false GRAD_CKPTS=('--grad_checkpoint') for lora_rank in ${LORA_RANK[@]}; do for model in ${MODELS[@]}; do @@ -129,15 +129,18 @@ for lora_rank in ${LORA_RANK[@]}; do plugin='3d' fi if [[ $plugin == "tp_pp" ]]; then + echo "Here" tp='2' bs='8' pp='2' plugin='3d' + skip_eval=true fi if [[ $plugin == "pp" ]]; then bs='8' pp='2' plugin='3d' + skip_eval=true fi if [[ $plugin == "sp_split_gather" ]]; then enable_sequence_parallelism='--enable_sequence_parallelism' @@ -175,28 +178,53 @@ for lora_rank in ${LORA_RANK[@]}; do for split in $(seq -f "%05g" 0 0); do dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_sft/arrow/part-$split") done - colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \ - --pretrain $pretrain \ - --tokenizer_dir $tokenizer_dir \ - --dataset ${dataset[@]} \ - --eval_dataset ${dataset[@]} \ - --save_path $MODEL_SAVE_PATH \ - --config_file $MODELS_DIR/config.jsonl \ - $lora_config \ - --plugin $plugin \ - --batch_size $bs \ - --max_epochs 1 \ - --accumulation_steps $grad_accu \ - --tp $tp \ - --pp $pp \ - --zero_stage $zero_stage \ - --sp $sp \ - --sp_mode $sp_mode \ - $enable_sequence_parallelism \ - --lr 2e-5 \ - $grad_ckpt \ - --max_len 400 \ - --use_flash_attn + + if [[ $skip_eval ]]; then + colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \ + --pretrain $pretrain \ + --tokenizer_dir $tokenizer_dir \ + --dataset ${dataset[@]} \ + --save_path $MODEL_SAVE_PATH \ + --config_file $MODELS_DIR/config.jsonl \ + $lora_config \ + --plugin $plugin \ + --batch_size $bs \ + --max_epochs 1 \ + --accumulation_steps $grad_accu \ + --tp $tp \ + --pp $pp \ + --zero_stage $zero_stage \ + --sp $sp \ + --sp_mode $sp_mode \ + $enable_sequence_parallelism \ + --lr 2e-5 \ + $grad_ckpt \ + --max_len 400 \ + --use_flash_attn + else + colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \ + --pretrain $pretrain \ + --tokenizer_dir $tokenizer_dir \ + --dataset ${dataset[@]} \ + --eval_dataset ${dataset[@]} \ + --save_path $MODEL_SAVE_PATH \ + --config_file $MODELS_DIR/config.jsonl \ + $lora_config \ + --plugin $plugin \ + --batch_size $bs \ + --max_epochs 1 \ + --accumulation_steps $grad_accu \ + --tp $tp \ + --pp $pp \ + --zero_stage $zero_stage \ + --sp $sp \ + --sp_mode $sp_mode \ + $enable_sequence_parallelism \ + --lr 2e-5 \ + $grad_ckpt \ + --max_len 400 \ + --use_flash_attn + fi passed=$? 
if [ $passed -eq 0 ]; then rm -rf ${MODEL_SAVE_PATH:?}/* diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py index e359957f579d..e5acdb05172a 100644 --- a/colossalai/booster/plugin/hybrid_parallel_plugin.py +++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py @@ -1332,7 +1332,7 @@ def execute_pipeline( or not torch.is_grad_enabled() ): return outputs - print("Show torch status:", torch.is_grad_enabled()) + # Synchronize the grads of shared parameters of the model. model.sync_shared_params() # Synchronize sequence parallelism gradients of the model. From 4191f21f70cb23ead05075a0f24321cdc89bfeaa Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 15 Aug 2024 03:46:43 +0000 Subject: [PATCH 31/42] update all reduce --- applications/ColossalChat/coati/trainer/sft.py | 3 ++- applications/ColossalChat/coati/trainer/utils.py | 12 +++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/sft.py b/applications/ColossalChat/coati/trainer/sft.py index 6322cb8df029..fb2f9a76536b 100755 --- a/applications/ColossalChat/coati/trainer/sft.py +++ b/applications/ColossalChat/coati/trainer/sft.py @@ -114,7 +114,8 @@ def _train(self, epoch: int): ) loss = outputs["loss"] if dist.get_rank() == dist.get_world_size() - 1: - step_bar.set_postfix({"train/loss": loss.item()}) + global_loss = all_reduce_mean(loss, self.booster) + step_bar.set_postfix({"train/loss": global_loss.item()}) step_bar.update() self.optimizer.step() self.optimizer.zero_grad() diff --git a/applications/ColossalChat/coati/trainer/utils.py b/applications/ColossalChat/coati/trainer/utils.py index 3c836b4b4db1..c15c291b4a05 100755 --- a/applications/ColossalChat/coati/trainer/utils.py +++ b/applications/ColossalChat/coati/trainer/utils.py @@ -9,6 +9,8 @@ from torch.utils._pytree import tree_map from torch.utils.data import DataLoader +from colossalai.booster import Booster + class CycledDataLoader: """ @@ -85,7 +87,7 @@ def _to(t: Any): return tree_map(_to, x) -def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor: +def all_reduce_mean(tensor: torch.Tensor, booster: Booster) -> torch.Tensor: """ Perform all-reduce operation on the given tensor and compute the mean across all processes. @@ -95,8 +97,12 @@ def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor: Returns: torch.Tensor: The reduced tensor with mean computed across all processes. 
""" - dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM) - tensor.div_(dist.get_world_size()) + if booster is not None: + dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM, group=booster.plugin.dp_group) + tensor.div_(booster.plugin.dp_size) + else: + dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM) + tensor.div_(dist.get_world_size()) return tensor From 4516a4ed6aabf6cef4cd568605f7e77144570394 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 15 Aug 2024 03:47:54 +0000 Subject: [PATCH 32/42] update sft --- applications/ColossalChat/coati/trainer/sft.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/sft.py b/applications/ColossalChat/coati/trainer/sft.py index fb2f9a76536b..298fb30eec3b 100755 --- a/applications/ColossalChat/coati/trainer/sft.py +++ b/applications/ColossalChat/coati/trainer/sft.py @@ -200,8 +200,9 @@ def _eval(self, epoch: int): ) loss = outputs["loss"] if dist.get_rank() == dist.get_world_size() - 1: - step_bar.set_postfix({"eval/loss": loss.item()}) - self.accumulative_meter.add("loss", loss.item()) + global_loss = all_reduce_mean(loss, self.booster) + step_bar.set_postfix({"eval/loss": global_loss.item()}) + self.accumulative_meter.add("loss", global_loss.item()) step_bar.update() if dist.get_rank() == dist.get_world_size() - 1: From 10b72a32b1f7c57eea961c3629d8736424c9a863 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 15 Aug 2024 05:52:50 +0000 Subject: [PATCH 33/42] update ignore --- applications/ColossalChat/.gitignore | 6 ++++++ applications/ColossalChat/coati/trainer/sft.py | 9 +++------ applications/ColossalChat/coati/trainer/utils.py | 10 +++------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/applications/ColossalChat/.gitignore b/applications/ColossalChat/.gitignore index 7b361d38e6d0..5a4bb905f4ea 100755 --- a/applications/ColossalChat/.gitignore +++ b/applications/ColossalChat/.gitignore @@ -161,3 +161,9 @@ applications/ColossalChat/sft_data applications/ColossalChat/prompt_data applications/ColossalChat/preference_data applications/ColossalChat/temp + +# Testing data +/kto_data/ +/preference_data/ +/prompt_data/ +/sft_data/ diff --git a/applications/ColossalChat/coati/trainer/sft.py b/applications/ColossalChat/coati/trainer/sft.py index 298fb30eec3b..33b241c054c2 100755 --- a/applications/ColossalChat/coati/trainer/sft.py +++ b/applications/ColossalChat/coati/trainer/sft.py @@ -114,9 +114,7 @@ def _train(self, epoch: int): ) loss = outputs["loss"] if dist.get_rank() == dist.get_world_size() - 1: - global_loss = all_reduce_mean(loss, self.booster) - step_bar.set_postfix({"train/loss": global_loss.item()}) - step_bar.update() + step_bar.set_postfix({"train/loss": loss.item()}) self.optimizer.step() self.optimizer.zero_grad() else: @@ -200,9 +198,8 @@ def _eval(self, epoch: int): ) loss = outputs["loss"] if dist.get_rank() == dist.get_world_size() - 1: - global_loss = all_reduce_mean(loss, self.booster) - step_bar.set_postfix({"eval/loss": global_loss.item()}) - self.accumulative_meter.add("loss", global_loss.item()) + step_bar.set_postfix({"eval/loss": loss.item()}) + self.accumulative_meter.add("loss", loss.item()) step_bar.update() if dist.get_rank() == dist.get_world_size() - 1: diff --git a/applications/ColossalChat/coati/trainer/utils.py b/applications/ColossalChat/coati/trainer/utils.py index c15c291b4a05..e87993c384cf 100755 --- a/applications/ColossalChat/coati/trainer/utils.py +++ b/applications/ColossalChat/coati/trainer/utils.py @@ -87,7 +87,7 @@ def 
_to(t: Any): return tree_map(_to, x) -def all_reduce_mean(tensor: torch.Tensor, booster: Booster) -> torch.Tensor: +def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor: """ Perform all-reduce operation on the given tensor and compute the mean across all processes. @@ -97,12 +97,8 @@ def all_reduce_mean(tensor: torch.Tensor, booster: Booster) -> torch.Tensor: Returns: torch.Tensor: The reduced tensor with mean computed across all processes. """ - if booster is not None: - dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM, group=booster.plugin.dp_group) - tensor.div_(booster.plugin.dp_size) - else: - dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM) - tensor.div_(dist.get_world_size()) + dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM) + tensor.div_(dist.get_world_size()) return tensor From b0c89bf29822940e80b08eca9056e3522adf03c7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 15 Aug 2024 05:53:49 +0000 Subject: [PATCH 34/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- applications/ColossalChat/coati/trainer/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/utils.py b/applications/ColossalChat/coati/trainer/utils.py index e87993c384cf..3c836b4b4db1 100755 --- a/applications/ColossalChat/coati/trainer/utils.py +++ b/applications/ColossalChat/coati/trainer/utils.py @@ -9,8 +9,6 @@ from torch.utils._pytree import tree_map from torch.utils.data import DataLoader -from colossalai.booster import Booster - class CycledDataLoader: """ From 42fcc16772c2c269bac2f119763632d6b6324f78 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Fri, 16 Aug 2024 09:40:34 +0000 Subject: [PATCH 35/42] update no cache --- .github/workflows/run_chatgpt_examples.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml index d0b5c2164119..b7522ffbdf74 100644 --- a/.github/workflows/run_chatgpt_examples.yml +++ b/.github/workflows/run_chatgpt_examples.yml @@ -31,18 +31,18 @@ jobs: - name: Install Colossal-AI run: | - BUILD_EXT=1 pip install -v -e . + BUILD_EXT=1 pip install --no-cache-dir -v -e . - name: Install ChatGPT run: | cd applications/ColossalChat - pip install -v . + pip install --no-cache-dir -v . 
export BUILD_EXT=1 - pip install -r examples/requirements.txt + pip install --no-cache-dir -r examples/requirements.txt - name: Install Transformers run: | - pip install transformers==4.36.2 + pip install --no-cache-dir transformers==4.36.2 - name: Execute Examples run: | From 3ab2f6fa329b6d12959fb3c668d278b4b225c5f0 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Mon, 19 Aug 2024 07:53:01 +0000 Subject: [PATCH 36/42] add eval --- applications/ColossalChat/tests/test_train.sh | 74 ++++++------------- 1 file changed, 24 insertions(+), 50 deletions(-) diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index 2935a6369986..ee916ead290e 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -91,7 +91,7 @@ SKIPPED_TESTS=( llama-gemini_auto-20 # gemini_auto plugin doesn't support lora llama-gemini-20 # gemini doesn't support lora ) -skip_eval=false + GRAD_CKPTS=('--grad_checkpoint') for lora_rank in ${LORA_RANK[@]}; do for model in ${MODELS[@]}; do @@ -134,13 +134,11 @@ for lora_rank in ${LORA_RANK[@]}; do bs='8' pp='2' plugin='3d' - skip_eval=true fi if [[ $plugin == "pp" ]]; then bs='8' pp='2' plugin='3d' - skip_eval=true fi if [[ $plugin == "sp_split_gather" ]]; then enable_sequence_parallelism='--enable_sequence_parallelism' @@ -178,53 +176,29 @@ for lora_rank in ${LORA_RANK[@]}; do for split in $(seq -f "%05g" 0 0); do dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_sft/arrow/part-$split") done - - if [[ $skip_eval ]]; then - colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \ - --pretrain $pretrain \ - --tokenizer_dir $tokenizer_dir \ - --dataset ${dataset[@]} \ - --save_path $MODEL_SAVE_PATH \ - --config_file $MODELS_DIR/config.jsonl \ - $lora_config \ - --plugin $plugin \ - --batch_size $bs \ - --max_epochs 1 \ - --accumulation_steps $grad_accu \ - --tp $tp \ - --pp $pp \ - --zero_stage $zero_stage \ - --sp $sp \ - --sp_mode $sp_mode \ - $enable_sequence_parallelism \ - --lr 2e-5 \ - $grad_ckpt \ - --max_len 400 \ - --use_flash_attn - else - colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \ - --pretrain $pretrain \ - --tokenizer_dir $tokenizer_dir \ - --dataset ${dataset[@]} \ - --eval_dataset ${dataset[@]} \ - --save_path $MODEL_SAVE_PATH \ - --config_file $MODELS_DIR/config.jsonl \ - $lora_config \ - --plugin $plugin \ - --batch_size $bs \ - --max_epochs 1 \ - --accumulation_steps $grad_accu \ - --tp $tp \ - --pp $pp \ - --zero_stage $zero_stage \ - --sp $sp \ - --sp_mode $sp_mode \ - $enable_sequence_parallelism \ - --lr 2e-5 \ - $grad_ckpt \ - --max_len 400 \ - --use_flash_attn - fi + colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \ + --pretrain $pretrain \ + --tokenizer_dir $tokenizer_dir \ + --dataset ${dataset[@]} \ + --eval_dataset ${dataset[@]} \ + --save_path $MODEL_SAVE_PATH \ + --config_file $MODELS_DIR/config.jsonl \ + $lora_config \ + --plugin $plugin \ + --batch_size $bs \ + --max_epochs 1 \ + --accumulation_steps $grad_accu \ + --tp $tp \ + --pp $pp \ + --zero_stage $zero_stage \ + --sp $sp \ + --sp_mode $sp_mode \ + $enable_sequence_parallelism \ + --lr 2e-5 \ + $grad_ckpt \ + --max_len 400 \ + --use_flash_attn + # fi passed=$? 
if [ $passed -eq 0 ]; then rm -rf ${MODEL_SAVE_PATH:?}/* From 6b8f0ba552f3195db4108cf7711b06cebbd3ae64 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Mon, 19 Aug 2024 07:54:28 +0000 Subject: [PATCH 37/42] remove fi --- applications/ColossalChat/tests/test_train.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index ee916ead290e..0960d9fab599 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -198,7 +198,6 @@ for lora_rank in ${LORA_RANK[@]}; do $grad_ckpt \ --max_len 400 \ --use_flash_attn - # fi passed=$? if [ $passed -eq 0 ]; then rm -rf ${MODEL_SAVE_PATH:?}/* From e458fd0ebde9ac5c0468da6e2d13e367eebf55e5 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Mon, 19 Aug 2024 07:56:14 +0000 Subject: [PATCH 38/42] remove debug --- applications/ColossalChat/tests/test_train.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index 0960d9fab599..3b06495cb46a 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -129,7 +129,6 @@ for lora_rank in ${LORA_RANK[@]}; do plugin='3d' fi if [[ $plugin == "tp_pp" ]]; then - echo "Here" tp='2' bs='8' pp='2' From 4f148bacfd697a061b7ad555fe703e283e39d1e1 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Mon, 19 Aug 2024 08:05:51 +0000 Subject: [PATCH 39/42] remove parentheses to avoid warning --- applications/ColossalChat/tests/test_lora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/tests/test_lora.py b/applications/ColossalChat/tests/test_lora.py index 7787592105b6..8edffa64bdf8 100755 --- a/applications/ColossalChat/tests/test_lora.py +++ b/applications/ColossalChat/tests/test_lora.py @@ -61,7 +61,7 @@ def test_overfit(): _, predicted = torch.max(outputs.data, 1) total = labels.size(0) correct = (predicted == Y).sum().item() - assert (correct / total > 0.95, "The model has not overfitted to the synthesized dataset") + assert (correct / total > 0.95) assert (weight_to_compare - model.fc1.weight).sum() < 0.01 From b1431d7aa27ba5100075ca7d2fa56728ffeeb45a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 19 Aug 2024 08:07:58 +0000 Subject: [PATCH 40/42] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- applications/ColossalChat/tests/test_lora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/tests/test_lora.py b/applications/ColossalChat/tests/test_lora.py index 8edffa64bdf8..a6365051758f 100755 --- a/applications/ColossalChat/tests/test_lora.py +++ b/applications/ColossalChat/tests/test_lora.py @@ -61,7 +61,7 @@ def test_overfit(): _, predicted = torch.max(outputs.data, 1) total = labels.size(0) correct = (predicted == Y).sum().item() - assert (correct / total > 0.95) + assert correct / total > 0.95 assert (weight_to_compare - model.fc1.weight).sum() < 0.01 From 038f1e0950af5dc65d5fdbe39d95154058a043aa Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 20 Aug 2024 05:46:24 +0000 Subject: [PATCH 41/42] Revert "add eval" This reverts commit 3ab2f6fa329b6d12959fb3c668d278b4b225c5f0. 
--- applications/ColossalChat/tests/test_lora.py | 2 +- applications/ColossalChat/tests/test_train.sh | 74 +++++++++++++------ 2 files changed, 52 insertions(+), 24 deletions(-) diff --git a/applications/ColossalChat/tests/test_lora.py b/applications/ColossalChat/tests/test_lora.py index 8edffa64bdf8..7787592105b6 100755 --- a/applications/ColossalChat/tests/test_lora.py +++ b/applications/ColossalChat/tests/test_lora.py @@ -61,7 +61,7 @@ def test_overfit(): _, predicted = torch.max(outputs.data, 1) total = labels.size(0) correct = (predicted == Y).sum().item() - assert (correct / total > 0.95) + assert (correct / total > 0.95, "The model has not overfitted to the synthesized dataset") assert (weight_to_compare - model.fc1.weight).sum() < 0.01 diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index 3b06495cb46a..2935a6369986 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -91,7 +91,7 @@ SKIPPED_TESTS=( llama-gemini_auto-20 # gemini_auto plugin doesn't support lora llama-gemini-20 # gemini doesn't support lora ) - +skip_eval=false GRAD_CKPTS=('--grad_checkpoint') for lora_rank in ${LORA_RANK[@]}; do for model in ${MODELS[@]}; do @@ -129,15 +129,18 @@ for lora_rank in ${LORA_RANK[@]}; do plugin='3d' fi if [[ $plugin == "tp_pp" ]]; then + echo "Here" tp='2' bs='8' pp='2' plugin='3d' + skip_eval=true fi if [[ $plugin == "pp" ]]; then bs='8' pp='2' plugin='3d' + skip_eval=true fi if [[ $plugin == "sp_split_gather" ]]; then enable_sequence_parallelism='--enable_sequence_parallelism' @@ -175,28 +178,53 @@ for lora_rank in ${LORA_RANK[@]}; do for split in $(seq -f "%05g" 0 0); do dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_sft/arrow/part-$split") done - colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \ - --pretrain $pretrain \ - --tokenizer_dir $tokenizer_dir \ - --dataset ${dataset[@]} \ - --eval_dataset ${dataset[@]} \ - --save_path $MODEL_SAVE_PATH \ - --config_file $MODELS_DIR/config.jsonl \ - $lora_config \ - --plugin $plugin \ - --batch_size $bs \ - --max_epochs 1 \ - --accumulation_steps $grad_accu \ - --tp $tp \ - --pp $pp \ - --zero_stage $zero_stage \ - --sp $sp \ - --sp_mode $sp_mode \ - $enable_sequence_parallelism \ - --lr 2e-5 \ - $grad_ckpt \ - --max_len 400 \ - --use_flash_attn + + if [[ $skip_eval ]]; then + colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \ + --pretrain $pretrain \ + --tokenizer_dir $tokenizer_dir \ + --dataset ${dataset[@]} \ + --save_path $MODEL_SAVE_PATH \ + --config_file $MODELS_DIR/config.jsonl \ + $lora_config \ + --plugin $plugin \ + --batch_size $bs \ + --max_epochs 1 \ + --accumulation_steps $grad_accu \ + --tp $tp \ + --pp $pp \ + --zero_stage $zero_stage \ + --sp $sp \ + --sp_mode $sp_mode \ + $enable_sequence_parallelism \ + --lr 2e-5 \ + $grad_ckpt \ + --max_len 400 \ + --use_flash_attn + else + colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \ + --pretrain $pretrain \ + --tokenizer_dir $tokenizer_dir \ + --dataset ${dataset[@]} \ + --eval_dataset ${dataset[@]} \ + --save_path $MODEL_SAVE_PATH \ + --config_file $MODELS_DIR/config.jsonl \ + $lora_config \ + --plugin $plugin \ + --batch_size $bs \ + --max_epochs 1 \ + --accumulation_steps $grad_accu \ + --tp $tp \ + --pp $pp \ + --zero_stage $zero_stage \ + --sp $sp \ + --sp_mode $sp_mode \ + $enable_sequence_parallelism \ + --lr 
2e-5 \ + $grad_ckpt \ + --max_len 400 \ + --use_flash_attn + fi passed=$? if [ $passed -eq 0 ]; then rm -rf ${MODEL_SAVE_PATH:?}/* From 9af4b69d23a62cb8efea48e11e2b72dc0f818366 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 20 Aug 2024 09:36:57 +0000 Subject: [PATCH 42/42] add all reduce --- applications/ColossalChat/coati/trainer/sft.py | 17 +++++++++++------ .../ColossalChat/coati/trainer/utils.py | 13 ++++++++++--- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/applications/ColossalChat/coati/trainer/sft.py b/applications/ColossalChat/coati/trainer/sft.py index 33b241c054c2..3aedcf7a99af 100755 --- a/applications/ColossalChat/coati/trainer/sft.py +++ b/applications/ColossalChat/coati/trainer/sft.py @@ -113,8 +113,12 @@ def _train(self, epoch: int): return_loss=True, ) loss = outputs["loss"] - if dist.get_rank() == dist.get_world_size() - 1: - step_bar.set_postfix({"train/loss": loss.item()}) + + if self.booster.plugin.stage_manager.is_last_stage(): + global_loss = all_reduce_mean(loss, self.plugin) + if dist.get_rank() == dist.get_world_size() - 1: + step_bar.set_postfix({"train/loss": global_loss.item()}) + self.optimizer.step() self.optimizer.zero_grad() else: @@ -197,10 +201,11 @@ def _eval(self, epoch: int): return_loss=True, ) loss = outputs["loss"] - if dist.get_rank() == dist.get_world_size() - 1: - step_bar.set_postfix({"eval/loss": loss.item()}) - self.accumulative_meter.add("loss", loss.item()) - step_bar.update() + if self.booster.plugin.stage_manager.is_last_stage(): + global_loss = all_reduce_mean(loss, self.plugin) + if dist.get_rank() == dist.get_world_size() - 1: + step_bar.set_postfix({"eval/loss": global_loss.item()}) + self.accumulative_meter.add("loss", global_loss.item()) if dist.get_rank() == dist.get_world_size() - 1: loss_mean = self.accumulative_meter.get("loss") diff --git a/applications/ColossalChat/coati/trainer/utils.py b/applications/ColossalChat/coati/trainer/utils.py index 3c836b4b4db1..217a87cf0419 100755 --- a/applications/ColossalChat/coati/trainer/utils.py +++ b/applications/ColossalChat/coati/trainer/utils.py @@ -9,6 +9,8 @@ from torch.utils._pytree import tree_map from torch.utils.data import DataLoader +from colossalai.booster import Plugin + class CycledDataLoader: """ @@ -85,7 +87,7 @@ def _to(t: Any): return tree_map(_to, x) -def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor: +def all_reduce_mean(tensor: torch.Tensor, plugin: Plugin = None) -> torch.Tensor: """ Perform all-reduce operation on the given tensor and compute the mean across all processes. @@ -95,8 +97,13 @@ def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor: Returns: torch.Tensor: The reduced tensor with mean computed across all processes. """ - dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM) - tensor.div_(dist.get_world_size()) + # All reduce mean across DP group + if plugin is not None: + dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM, group=plugin.dp_group) + tensor.div_(plugin.dp_size) + else: + dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM) + tensor.div_(dist.get_world_size()) return tensor
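To close the series, a brief sketch of how the final patch's two pieces fit together when pipeline parallelism is active. This is an illustrative fragment, not the exact trainer code: it assumes torch.distributed has already been initialized, that plugin exposes the dp_group and dp_size attributes the patch relies on, and that only the last pipeline stage holds a valid loss, which is why the reduction is guarded by is_last_stage(). reduce_loss corresponds to the plugin-aware all_reduce_mean added above; the other names stand in for the trainer's real objects.

import torch
import torch.distributed as dist

# Illustrative sketch: when a plugin is given, the loss is averaged across the
# data-parallel group only, so ranks that merely hold pipeline or tensor-parallel
# shards of the same replica are not double counted; otherwise it falls back to
# a world-wide average, matching the pre-existing pure data-parallel behaviour.
def reduce_loss(loss: torch.Tensor, plugin=None) -> torch.Tensor:
    if plugin is not None:
        dist.all_reduce(loss, op=dist.ReduceOp.SUM, group=plugin.dp_group)
        loss.div_(plugin.dp_size)
    else:
        dist.all_reduce(loss, op=dist.ReduceOp.SUM)
        loss.div_(dist.get_world_size())
    return loss

def log_pipeline_loss(outputs: dict, booster, plugin, step_bar) -> None:
    # Only the last pipeline stage receives a loss from execute_pipeline, so the
    # reduction and the progress-bar update are guarded accordingly, as in the
    # sft.py hunks of the last patch.
    loss = outputs["loss"]
    if booster.plugin.stage_manager.is_last_stage():
        global_loss = reduce_loss(loss, plugin)
        if dist.get_rank() == dist.get_world_size() - 1:
            step_bar.set_postfix({"train/loss": global_loss.item()})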