From 6ccf527f2561b7a5cd8eac43909326f23bd81586 Mon Sep 17 00:00:00 2001 From: CZYCW Date: Mon, 18 Sep 2023 16:17:37 +0800 Subject: [PATCH 1/6] add tensorboard support for coati --- applications/Chat/coati/trainer/sft.py | 14 ++++++++++++-- applications/Chat/examples/train_sft.py | 4 +++- applications/Chat/requirements.txt | 1 + 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/applications/Chat/coati/trainer/sft.py b/applications/Chat/coati/trainer/sft.py index e4d0a970740d..98ea5e78d702 100644 --- a/applications/Chat/coati/trainer/sft.py +++ b/applications/Chat/coati/trainer/sft.py @@ -8,7 +8,8 @@ from torch.optim import Optimizer from torch.optim.lr_scheduler import _LRScheduler from torch.utils.data import DataLoader - +from torch.utils.tensorboard import SummaryWriter +import torch.distributed as dist from colossalai.logging import DistributedLogger from .base import SLTrainer @@ -37,6 +38,7 @@ def __init__( lr_scheduler: _LRScheduler, max_epochs: int = 2, accumulation_steps: int = 8, + tensorboard_dir: str = None, ) -> None: if accumulation_steps > 1: assert not isinstance(strategy, GeminiStrategy), \ @@ -72,6 +74,10 @@ def _train(self, epoch: int): self.strategy.optimizer_step(self.optimizer) self.optimizer.zero_grad() self.scheduler.step() + if is_rank_0() and self.tensorboard_writer: + self.tensorboard_writer.add_scalar('loss', self.total_loss / self.accumulation_steps) + self.tensorboard_writer.add_scalar('lr', self.scheduler.get_last_lr()[0]) + self.tensorboard_writer.flush() if is_rank_0() and self.use_wandb: wandb.log({ "loss": self.total_loss / self.accumulation_steps, @@ -105,7 +111,8 @@ def _before_fit(self, train_dataloader: DataLoader, eval_dataloader: Optional[DataLoader] = None, logger: Optional[DistributedLogger] = None, - use_wandb: bool = False): + use_wandb: bool = False, + tensorboard_dir: str = None): """ Args: train_dataloader: the dataloader to use for training @@ -116,9 +123,12 @@ def _before_fit(self, self.logger = logger 
self.use_wandb = use_wandb + self.tensorboard_writer = None if use_wandb: wandb.init(project="Coati", name=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) wandb.watch(self.model) + if tensorboard_dir: + self.tensorboard_writer = SummaryWriter(tensorboard_dir) if tensorboard_dir and dist.get_rank() == 0 else None self.total_loss = 0 self.no_epoch_bar = True diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py index f068ea2bf5de..f2d9d8d47cec 100644 --- a/applications/Chat/examples/train_sft.py +++ b/applications/Chat/examples/train_sft.py @@ -175,7 +175,8 @@ def train(args): trainer.fit(train_dataloader=train_dataloader, eval_dataloader=eval_dataloader, logger=logger, - use_wandb=args.use_wandb) + use_wandb=args.use_wandb, + tensorboard_dir=args.tensorboard_dir) # save model checkpoint after fitting on only rank0 strategy.save_pretrained(model, path=args.save_path, only_rank0=True, tokenizer=tokenizer) @@ -206,6 +207,7 @@ def train(args): parser.add_argument('--lr', type=float, default=5e-6) parser.add_argument('--accumulation_steps', type=int, default=8) parser.add_argument('--use_wandb', default=False, action='store_true') + parser.add_argument('--tensorboard_dir', default="", action='store_true') parser.add_argument('--grad_checkpoint', default=False, action='store_true') args = parser.parse_args() train(args) diff --git a/applications/Chat/requirements.txt b/applications/Chat/requirements.txt index e5f5ca0932a8..afb0dddb644f 100644 --- a/applications/Chat/requirements.txt +++ b/applications/Chat/requirements.txt @@ -11,3 +11,4 @@ sse_starlette wandb sentencepiece gpustat +tensorboard From 92f9404dbbf2e077b14b4533eec6201b443c1119 Mon Sep 17 00:00:00 2001 From: CZYCW Date: Mon, 18 Sep 2023 16:25:17 +0800 Subject: [PATCH 2/6] remove unused condition --- applications/Chat/coati/trainer/sft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/Chat/coati/trainer/sft.py 
b/applications/Chat/coati/trainer/sft.py index 98ea5e78d702..ebed4bbc5945 100644 --- a/applications/Chat/coati/trainer/sft.py +++ b/applications/Chat/coati/trainer/sft.py @@ -128,7 +128,7 @@ def _before_fit(self, wandb.init(project="Coati", name=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) wandb.watch(self.model) if tensorboard_dir: - self.tensorboard_writer = SummaryWriter(tensorboard_dir) if tensorboard_dir and dist.get_rank() == 0 else None + self.tensorboard_writer = SummaryWriter(log_dir=tensorboard_dir) if dist.get_rank() == 0 else None self.total_loss = 0 self.no_epoch_bar = True From ae9671cb7c536952ee7ae11733f771f5ee6cc1d9 Mon Sep 17 00:00:00 2001 From: CZYCW Date: Mon, 18 Sep 2023 17:56:59 +0800 Subject: [PATCH 3/6] fix arg parse bug --- applications/Chat/examples/train_sft.py | 2 +- applications/Chat/examples/train_sft.sh | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/applications/Chat/examples/train_sft.py b/applications/Chat/examples/train_sft.py index f2d9d8d47cec..118c34461b31 100644 --- a/applications/Chat/examples/train_sft.py +++ b/applications/Chat/examples/train_sft.py @@ -207,7 +207,7 @@ def train(args): parser.add_argument('--lr', type=float, default=5e-6) parser.add_argument('--accumulation_steps', type=int, default=8) parser.add_argument('--use_wandb', default=False, action='store_true') - parser.add_argument('--tensorboard_dir', default="", action='store_true') + parser.add_argument('--tensorboard_dir', type=str, default="") parser.add_argument('--grad_checkpoint', default=False, action='store_true') args = parser.parse_args() train(args) diff --git a/applications/Chat/examples/train_sft.sh b/applications/Chat/examples/train_sft.sh index 1a5cd069011d..f64d4cbac223 100755 --- a/applications/Chat/examples/train_sft.sh +++ b/applications/Chat/examples/train_sft.sh @@ -16,12 +16,13 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() { set_n_least_used_CUDA_VISIBLE_DEVICES 4 torchrun --standalone --nproc_per_node=4 
train_sft.py \ - --pretrain "/path/to/LLaMa-7B/" \ - --model 'llama' \ + --pretrain 'bigscience/bloom-560m' \ + --model 'bloom' \ --strategy colossalai_zero2 \ --log_interval 10 \ - --save_path /path/to/Coati-7B \ - --dataset /path/to/data.json \ + --tensorboard_dir "/root/test-coati/ColossalAI/applications/Chat/examples/tensorboard" \ + --save_path "/root/test-coati/ColossalAI/applications/Chat/examples/output" \ + --dataset "/root/test-coati/ColossalAI/applications/Chat/examples/data.json" \ --batch_size 4 \ --accumulation_steps 8 \ --lr 2e-5 \ From 9b8480d5ea71a1b95bf6a3b6cb2659453be33452 Mon Sep 17 00:00:00 2001 From: CZYCW Date: Mon, 18 Sep 2023 18:57:53 +0800 Subject: [PATCH 4/6] fix inconsistent get rank function --- applications/Chat/coati/trainer/sft.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/applications/Chat/coati/trainer/sft.py b/applications/Chat/coati/trainer/sft.py index ebed4bbc5945..12d4a7fda5ad 100644 --- a/applications/Chat/coati/trainer/sft.py +++ b/applications/Chat/coati/trainer/sft.py @@ -9,7 +9,6 @@ from torch.optim.lr_scheduler import _LRScheduler from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter -import torch.distributed as dist from colossalai.logging import DistributedLogger from .base import SLTrainer @@ -128,7 +127,7 @@ def _before_fit(self, wandb.init(project="Coati", name=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) wandb.watch(self.model) if tensorboard_dir: - self.tensorboard_writer = SummaryWriter(log_dir=tensorboard_dir) if dist.get_rank() == 0 else None + self.tensorboard_writer = SummaryWriter(log_dir=tensorboard_dir) if is_rank_0() else None self.total_loss = 0 self.no_epoch_bar = True From 955b1c98c8207902b34b7ce407b4215985681ce4 Mon Sep 17 00:00:00 2001 From: CZYCW Date: Mon, 18 Sep 2023 18:59:42 +0800 Subject: [PATCH 5/6] revert personal setup --- applications/Chat/examples/train_sft.sh | 11 +++++------ 1 file changed, 5 insertions(+), 6
deletions(-) diff --git a/applications/Chat/examples/train_sft.sh b/applications/Chat/examples/train_sft.sh index f64d4cbac223..191ce41a9a3c 100755 --- a/applications/Chat/examples/train_sft.sh +++ b/applications/Chat/examples/train_sft.sh @@ -16,15 +16,14 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() { set_n_least_used_CUDA_VISIBLE_DEVICES 4 torchrun --standalone --nproc_per_node=4 train_sft.py \ - --pretrain 'bigscience/bloom-560m' \ - --model 'bloom' \ + --pretrain "/path/to/LLaMa-7B/" \ + --model 'llama' \ --strategy colossalai_zero2 \ --log_interval 10 \ - --tensorboard_dir "/root/test-coati/ColossalAI/applications/Chat/examples/tensorboard" \ - --save_path "/root/test-coati/ColossalAI/applications/Chat/examples/output" \ - --dataset "/root/test-coati/ColossalAI/applications/Chat/examples/data.json" \ + --save_path /path/to/Coati-7B \ + --dataset /path/to/data.json \ --batch_size 4 \ --accumulation_steps 8 \ --lr 2e-5 \ --max_datasets_size 512 \ - --max_epochs 1 + --max_epochs 1 \ No newline at end of file From 5504b2fefbf1375c9e9e768b5a953619440cfa31 Mon Sep 17 00:00:00 2001 From: CZYCW Date: Mon, 18 Sep 2023 19:00:26 +0800 Subject: [PATCH 6/6] revert personal setup --- applications/Chat/examples/train_sft.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/Chat/examples/train_sft.sh b/applications/Chat/examples/train_sft.sh index 191ce41a9a3c..1a5cd069011d 100755 --- a/applications/Chat/examples/train_sft.sh +++ b/applications/Chat/examples/train_sft.sh @@ -26,4 +26,4 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \ --accumulation_steps 8 \ --lr 2e-5 \ --max_datasets_size 512 \ - --max_epochs 1 \ No newline at end of file + --max_epochs 1