diff --git a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
index fef5b0d16d60..37d727851f32 100644
--- a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
+++ b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
@@ -13,6 +13,7 @@
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
 
+from colossalai.cluster import DistCoordinator
 from colossalai.interface import OptimizerWrapper
 
 from .general_checkpoint_io import GeneralCheckpointIO
@@ -71,6 +72,7 @@ def __init__(self,
         self.verbose = verbose
         self.working_to_master_map = None
         self.master_to_working_map = None
+        self.coordinator = DistCoordinator()
 
     @staticmethod
     def _model_sharder(model: nn.Module,
@@ -655,7 +657,7 @@ def gather_from_sharded_optimizer_state(state: OrderedDict, param: torch.Tensor,
                 dist.all_gather(gather_tensor, v, group=tp_group)
                 v = torch.cat(gather_tensor, dim=partition_dim)
 
-            state_[k] = v.detach().clone().cpu()
+            state_[k] = v.detach().clone().cpu()
 
     return state_
diff --git a/examples/language/bert/finetune.py b/examples/language/bert/finetune.py
index 2e8780806f19..fb6e4332c2f9 100644
--- a/examples/language/bert/finetune.py
+++ b/examples/language/bert/finetune.py
@@ -129,14 +129,13 @@ def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, _criterion:
     use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
     is_pp_last_stage = use_pipeline and booster.plugin.stage_manager.is_last_stage()
+    print_flag = (not use_pipeline and coordinator.is_master()) or (use_pipeline and is_pp_last_stage)
     total_step = len(train_dataloader)
 
     model.train()
     optimizer.zero_grad()
     train_dataloader_iter = iter(train_dataloader)
-    with tqdm(range(total_step),
-              desc=f'Epoch [{epoch + 1}/{NUM_EPOCHS}]',
-              disable=not (coordinator.is_master() or is_pp_last_stage)) as pbar:
+    with tqdm(range(total_step), desc=f'Epoch [{epoch + 1}/{NUM_EPOCHS}]', disable=not print_flag) as pbar:
         # Forward pass
         for _ in pbar:
             if use_pipeline:
@@ -192,13 +191,13 @@ def main():
         model_name = "albert-xxlarge-v2"
     else:
         raise RuntimeError
+
     # ==============================
     # Launch Distributed Environment
     # ==============================
     colossalai.launch_from_torch(config={}, seed=42)
     coordinator = DistCoordinator()
 
-    # local_batch_size = BATCH_SIZE // coordinator.world_size
     lr = LEARNING_RATE * coordinator.world_size
 
     # ==============================
diff --git a/examples/language/llama2/README.md b/examples/language/llama2/README.md
index 483eae88ae32..2f2fefd41389 100644
--- a/examples/language/llama2/README.md
+++ b/examples/language/llama2/README.md
@@ -6,7 +6,7 @@
 
 - 70 billion parameter LLaMA2 model training accelerated by 195%
-[[code]](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama)
+[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/llama2)
 [[blog]](https://www.hpc-ai.tech/blog/70b-llama2-training)
 
 ### LLaMA1
@@ -92,7 +92,7 @@ Make sure master node can access all nodes (including itself) by ssh without pas
 Here is details about CLI arguments:
 
 - Model configuration: `-c`, `--config`. `7b`, `13b`, `30b` and `65b` are supported for LLaMA-1, `7b`, `13b`, and `70b` are supported for LLaMA-2.
-- Booster plugin: `-p`, `--plugin`. `gemini`, `gemini_auto`, `zero2` and `zero2_cpu` are supported. For more details, please refer to [Booster plugins](https://colossalai.org/docs/basics/booster_plugins).
+- Booster plugin: `-p`, `--plugin`. `gemini`, `gemini_auto`, `zero2`, `hybrid_parallel` and `zero2_cpu` are supported. For more details, please refer to [Booster plugins](https://colossalai.org/docs/basics/booster_plugins).
 - Dataset path: `-d`, `--dataset`. The default dataset is `togethercomputer/RedPajama-Data-1T-Sample`. It support any dataset from `datasets` with the same data format as RedPajama.
 - Number of epochs: `-e`, `--num_epochs`. The default value is 1.
 - Local batch size: `-b`, `--batch_size`. Batch size per GPU. The default value is 2.
@@ -192,3 +192,40 @@ If you run the above command successfully, you will get the following results:
   year={2023}
 }
 ```
+
+
+# Fine-tune Llama2
+
+We also provide an example of fine-tuning LLaMA-2 in `finetune.py`.
+
+Make sure the master node can access all nodes (including itself) by ssh without a password.
+
+Here are the details of the CLI arguments:
+
+- Pretrained checkpoint path: `--model_path`. The path to your model checkpoint; it can be a local directory or a Hugging Face tag.
+- Booster plugin: `-p`, `--plugin`. `gemini`, `gemini_auto`, `zero2`, `hybrid_parallel` and `zero2_cpu` are supported. For more details, please refer to [Booster plugins](https://colossalai.org/docs/basics/booster_plugins).
+- Dataset path: `-d`, `--dataset`. The default dataset is `yizhongw/self_instruct`. It supports any dataset from `datasets` with the same data format as `yizhongw/self_instruct`.
+- Task name: `--task_name`. The task to fine-tune on; it also determines which subset of the dataset is loaded. The default value is `super_natural_instructions`.
+- Number of epochs: `-e`, `--num_epochs`. The default value is 1.
+- Local batch size: `-b`, `--batch_size`. Batch size per GPU. The default value is 2.
+- Learning rate: `--lr`. The default value is 3e-4.
+- Weight decay: `-w`, `--weight_decay`. The default value is 0.1.
+- Gradient checkpointing: `-g`, `--grad_checkpoint`. The default value is `False`. This saves memory at the cost of speed. Enable this option when training with a large batch size.
+- Max length: `-l`, `--max_length`. The default value is 4096.
+- Mixed precision: `-x`, `--mixed_precision`. The default value is "fp16". "fp16" and "bf16" are supported.
+- Save interval: `-i`, `--save_interval`. The interval (in steps) between checkpoint saves. The default value is 1000.
+- Checkpoint directory: `-o`, `--save_dir`. The directory path to save checkpoints. The default value is `checkpoint`.
+- Checkpoint to load: `-f`, `--load`. The checkpoint path to load. The default value is `None`.
+- Gradient clipping: `--grad_clip`. The default value is 1.0.
+- Tensorboard log directory: `-t`, `--tensorboard_dir`. The directory path to save tensorboard logs. The default value is `tb_logs`.
+- Flash attention: `-a`, `--flash_attention`. To use flash attention, you must install `flash-attn`. The default value is `False`. It accelerates training while saving memory, so we recommend always enabling it.
+
+
+```shell
+torchrun --standalone --nproc_per_node 8 finetune.py \
+    --plugin "hybrid_parallel" \
+    --dataset "yizhongw/self_instruct" \
+    --model_path "/path/llama" \
+    --task_name "super_natural_instructions" \
+    --save_dir "/path/output"
+```
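+
+To resume an interrupted run, point `-f`/`--load` at a checkpoint directory written during training (the paths below are placeholders, following the `epoch{E}-step{S}` layout used by `--save_interval`):
+
+```shell
+torchrun --standalone --nproc_per_node 8 finetune.py \
+    --plugin "hybrid_parallel" \
+    --dataset "yizhongw/self_instruct" \
+    --model_path "/path/llama" \
+    --task_name "super_natural_instructions" \
+    --save_dir "/path/output" \
+    --load "/path/output/epoch0-step1000"
+```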
diff --git a/examples/language/llama2/finetune.py b/examples/language/llama2/finetune.py
new file mode 100644
index 000000000000..0efbf193c9a9
--- /dev/null
+++ b/examples/language/llama2/finetune.py
@@ -0,0 +1,295 @@
+import argparse
+import math
+import os
+import resource
+from contextlib import nullcontext
+from functools import partial
+from typing import Optional, Tuple
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from attn import SUPPORT_XFORMERS, replace_xformers
+from data_utils import load_json, prepare_dataloader, save_json
+from datasets import load_dataset
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
+from torch.utils.tensorboard import SummaryWriter
+from tqdm import tqdm
+from transformers.models.llama.configuration_llama import LlamaConfig
+from transformers.models.llama.modeling_llama import LlamaForCausalLM
+from transformers.models.llama.tokenization_llama import LlamaTokenizer
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
+
+
+def get_model_numel(model: nn.Module) -> int:
+    return sum(p.numel() for p in model.parameters())
+
+
+def format_numel_str(numel: int) -> str:
+    B = 1024**3
+    M = 1024**2
+    K = 1024
+    if numel >= B:
+        return f'{numel / B:.2f} B'
+    elif numel >= M:
+        return f'{numel / M:.2f} M'
+    elif numel >= K:
+        return f'{numel / K:.2f} K'
+    else:
+        return f'{numel}'
+
+
+def tokenize_batch_for_finetune(batch, tokenizer: Optional[LlamaTokenizer] = None, max_length: int = 2048):
+    texts = [sample['prompt'] + sample['completion'] for sample in batch]
+    data = tokenizer(texts, return_tensors="pt", padding='max_length', truncation=True, max_length=max_length)
+    data = {k: v.cuda() for k, v in data.items()}
+    data['labels'] = data['input_ids'].clone()
+    return data
+
+
+def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
+    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
+    tensor.div_(dist.get_world_size())
+    return tensor
+
+
+def save(booster: Booster, model: nn.Module, optimizer: Optimizer, lr_scheduler: _LRScheduler, epoch: int, step: int,
+         batch_size: int, coordinator: DistCoordinator, save_dir: str):
+    save_dir = os.path.join(save_dir, f'epoch{epoch}-step{step}')
+    os.makedirs(os.path.join(save_dir, 'model'), exist_ok=True)
+
+    booster.save_model(model, os.path.join(save_dir, 'model'), shard=True)
+    booster.save_optimizer(optimizer, os.path.join(save_dir, 'optimizer'), shard=True)
+    booster.save_lr_scheduler(lr_scheduler, os.path.join(save_dir, 'lr_scheduler'))
+    running_states = {
+        'epoch': epoch,
+        'step': step,
+        'sample_start_index': step * batch_size,
+    }
+    if coordinator.is_master():
+        save_json(running_states, os.path.join(save_dir, 'running_states.json'))
+
+
+def load(booster: Booster, model: nn.Module, optimizer: Optimizer, lr_scheduler: _LRScheduler,
+         load_dir: str) -> Tuple[int, int, int]:
+    booster.load_model(model, os.path.join(load_dir, 'model'))
+    booster.load_optimizer(optimizer, os.path.join(load_dir, 'optimizer'))
+    booster.load_lr_scheduler(lr_scheduler, os.path.join(load_dir, 'lr_scheduler'))
+    running_states = load_json(os.path.join(load_dir, 'running_states.json'))
+    return running_states['epoch'], running_states['step'], running_states['sample_start_index']
+
+
+def _criterion(outputs, inputs):
+    return outputs.loss
+
+
+def main():
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model_path', type=str, help="pretrained checkpoint path, used with mode==finetune")
+    parser.add_argument('-p',
+                        '--plugin',
+                        choices=['gemini', 'gemini_auto', 'zero2', 'zero2_cpu', 'hybrid_parallel'],
+                        default='gemini',
+                        help='Choose which plugin to use')
+    parser.add_argument('-d', '--dataset', type=str, default='yizhongw/self_instruct', help='Data set path')
+    parser.add_argument('--task_name', type=str, default="super_natural_instructions", help='task to run')
+    parser.add_argument('-e', '--num_epochs', type=int, default=1, help='Number of epochs')
+    parser.add_argument('-b', '--batch_size', type=int, default=2, help='Local batch size')
+    parser.add_argument('--lr', type=float, default=3e-4, help='Learning rate')
+    parser.add_argument('-w', '--weight_decay', type=float, default=0.1, help='Weight decay')
+    parser.add_argument('-g', '--grad_checkpoint', action='store_true', help='Use gradient checkpointing')
+    parser.add_argument('-l', '--max_length', type=int, default=4096, help='Max sequence length')
+    parser.add_argument('-x', '--mixed_precision', default='fp16', choices=['fp16', 'bf16'], help='Mixed precision')
+    parser.add_argument('-i', '--save_interval', type=int, default=1000, help='Save interval')
+    parser.add_argument('-o', '--save_dir', type=str, default='checkpoint', help='Checkpoint directory')
+    parser.add_argument('-f', '--load', type=str, default=None, help='Load checkpoint')
+    parser.add_argument('--grad_clip', type=float, default=1.0, help='Gradient clipping')
+    parser.add_argument('-t', '--tensorboard_dir', type=str, default='tb_logs', help='Tensorboard directory')
+    parser.add_argument('-a', '--flash_attention', action='store_true', help='Use Flash Attention')
+    args = parser.parse_args()
+
+    # ==============================
+    # Initialize Distributed Training
+    # ==============================
+    colossalai.launch_from_torch({})
+    coordinator = DistCoordinator()
+
+    # ==============================
+    # Initialize Booster
+    # ==============================
+    if args.plugin == 'gemini':
+        plugin = GeminiPlugin(precision=args.mixed_precision, initial_scale=2**16, max_norm=args.grad_clip)
+    elif args.plugin == 'gemini_auto':
+        plugin = GeminiPlugin(precision=args.mixed_precision,
+                              placement_policy='auto',
+                              initial_scale=2**16,
+                              max_norm=args.grad_clip)
+    elif args.plugin == 'zero2':
+        plugin = LowLevelZeroPlugin(stage=2,
+                                    precision=args.mixed_precision,
+                                    initial_scale=2**16,
+                                    max_norm=args.grad_clip)
+    elif args.plugin == 'zero2_cpu':
+        plugin = LowLevelZeroPlugin(stage=2,
+                                    precision=args.mixed_precision,
+                                    initial_scale=2**16,
+                                    cpu_offload=True,
+                                    max_norm=args.grad_clip)
+    elif args.plugin == 'hybrid_parallel':
+        # modify the parameters accordingly; the default configuration is for llama2-7b
+        plugin = HybridParallelPlugin(tp_size=4,
+                                      pp_size=2,
+                                      num_microbatches=None,
+                                      microbatch_size=1,
+                                      enable_jit_fused=False,
+                                      zero_stage=0,
+                                      precision='fp32',
+                                      initial_scale=1)
+    else:
+        raise ValueError(f'Unknown plugin {args.plugin}')
+
+    booster = Booster(plugin=plugin)
+
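+    # with pipeline parallelism the loss is only produced on the last pipeline stage,
+    # so progress-bar and tensorboard logging happen there; otherwise they happen on the global master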
+    use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
+    is_pp_last_stage = use_pipeline and booster.plugin.stage_manager.is_last_stage()
+    print_flag = (not use_pipeline and coordinator.is_master()) or (use_pipeline and is_pp_last_stage)
+
+    # ==============================
+    # Initialize Tensorboard
+    # ==============================
+    if print_flag:
+        os.makedirs(args.tensorboard_dir, exist_ok=True)
+        writer = SummaryWriter(args.tensorboard_dir)
+
+    # ==============================
+    # Initialize Model, Optimizer and LR Scheduler
+    # ==============================
+
+    config = LlamaConfig.from_pretrained(args.model_path)
+    # use lazy init when using GeminiPlugin
+    init_ctx = LazyInitContext(
+        default_device=get_current_device()) if isinstance(plugin, GeminiPlugin) else nullcontext()
+
+    with init_ctx:
+        model = LlamaForCausalLM(config)
+
+    # ==============================
+    # Initialize Tokenizer, Dataset and Dataloader
+    # ==============================
+    tokenizer = LlamaTokenizer.from_pretrained('hf-internal-testing/llama-tokenizer')
+    # follows fast chat: https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py#L257
+    tokenizer.pad_token = tokenizer.unk_token
+
+    dataset = load_dataset(args.dataset, args.task_name)
+    train_ds = dataset['train']
+    dataloader = prepare_dataloader(train_ds,
+                                    batch_size=args.batch_size,
+                                    shuffle=True,
+                                    drop_last=True,
+                                    collate_fn=partial(tokenize_batch_for_finetune,
+                                                       tokenizer=tokenizer,
+                                                       max_length=args.max_length))
+
+    if args.grad_checkpoint:
+        model.gradient_checkpointing_enable()
+    if args.flash_attention:
+        assert SUPPORT_XFORMERS, 'Flash attention requires xformers to be installed'
+        replace_xformers(model)
+
+    model_numel = get_model_numel(model)
+    coordinator.print_on_master(f'Model params: {format_numel_str(model_numel)}')
+
+    optimizer = HybridAdam(model.parameters(), lr=args.lr, betas=(0.9, 0.95), weight_decay=args.weight_decay)
+    total_step = args.num_epochs * len(dataloader)
+    lr_scheduler = CosineAnnealingWarmupLR(optimizer,
+                                           total_steps=total_step,
+                                           warmup_steps=math.ceil(total_step * 0.03),
+                                           eta_min=0.1 * args.lr)
+    default_dtype = torch.float16 if args.mixed_precision == 'fp16' else torch.bfloat16
+    torch.set_default_dtype(default_dtype)
+    model, optimizer, _, dataloader, lr_scheduler = booster.boost(model,
+                                                                  optimizer,
+                                                                  dataloader=dataloader,
+                                                                  lr_scheduler=lr_scheduler)
+    torch.set_default_dtype(torch.float)
+
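+    # load the pretrained HuggingFace weights into the boosted model; the plugin's checkpoint IO
+    # handles any sharding or placement required by the chosen parallel strategy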
+    booster.load_model(model, args.model_path)
+
+    coordinator.print_on_master(f'Booster init max CUDA memory: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB')
+    coordinator.print_on_master(
+        f'Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024:.2f} MB')
+
+    # load checkpoint if specified
+    start_epoch = 0
+    start_step = 0
+    sampler_start_idx = 0
+    if args.load is not None:
+        coordinator.print_on_master('Loading checkpoint')
+        start_epoch, start_step, sampler_start_idx = load(booster, model, optimizer, lr_scheduler, args.load)
+        coordinator.print_on_master(f'Loaded checkpoint {args.load} at epoch {start_epoch} step {start_step}')
+
+    num_steps_per_epoch = len(dataloader)
+
+    # if resume training, set the sampler start index to the correct value
+    dataloader.sampler.set_start_index(sampler_start_idx)
+    for epoch in range(start_epoch, args.num_epochs):
+        dataloader.sampler.set_epoch(epoch)
+        step_nums = num_steps_per_epoch - start_step
+        dataloader_iter = iter(dataloader)
+
+        with tqdm(range(step_nums),
+                  desc=f'Epoch {epoch}',
+                  disable=not print_flag,
+                  total=num_steps_per_epoch,
+                  initial=start_step) as pbar:
+            for step in pbar:
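+                # in pipeline mode the schedule pulls micro-batches from dataloader_iter and runs
+                # forward/backward internally, returning the loss only on the last stage;
+                # otherwise a plain forward/backward is run on the next batch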
+                if use_pipeline:
+                    outputs = booster.execute_pipeline(dataloader_iter,
+                                                       model,
+                                                       _criterion,
+                                                       optimizer,
+                                                       return_loss=True,
+                                                       return_outputs=True)
+                    loss = outputs["loss"]
+                else:
+                    batch = next(dataloader_iter)
+                    outputs = model(**batch)
+                    loss = outputs[0]
+                    booster.backward(loss, optimizer)
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+                if not use_pipeline:
+                    all_reduce_mean(loss)
+                if print_flag:
+                    pbar.set_postfix({'loss': loss.item()})
+                    writer.add_scalar('loss', loss.item(), epoch * num_steps_per_epoch + step)
+
+                if args.save_interval > 0 and (step + 1) % args.save_interval == 0:
+                    coordinator.print_on_master(f'Saving checkpoint')
+                    save(booster, model, optimizer, lr_scheduler, epoch, step + 1, args.batch_size, coordinator,
+                         args.save_dir)
+                    coordinator.print_on_master(f'Saved checkpoint at epoch {epoch} step {step + 1}')
+        # later epochs are not resumed mid-epoch, so reset the sampler start index and start step
+        dataloader.sampler.set_start_index(0)
+        start_step = 0
+
+    coordinator.print_on_master(f'Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/language/llama2/pretrain.py b/examples/language/llama2/pretrain.py
index b72a3019692e..0eeac4035401 100644
--- a/examples/language/llama2/pretrain.py
+++ b/examples/language/llama2/pretrain.py
@@ -21,7 +21,7 @@
 
 import colossalai
 from colossalai.booster import Booster
-from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
@@ -65,9 +65,10 @@ def format_numel_str(numel: int) -> str:
         return f'{numel}'
 
 
-def tokenize_batch(batch, tokenizer: Optional[LlamaTokenizer] = None, max_length: int = 2048):
+def tokenize_batch_for_pretrain(batch, tokenizer: Optional[LlamaTokenizer] = None, max_length: int = 2048):
     texts = [sample['text'] for sample in batch]
     data = tokenizer(texts, return_tensors="pt", padding='max_length', truncation=True, max_length=max_length)
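+    # move the batch to GPU inside the collate_fn: the pipeline schedule consumes the dataloader
+    # iterator directly, so the training loop no longer transfers tensors to the device itself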
+    data = {k: v.cuda() for k, v in data.items()}
     data['labels'] = data['input_ids'].clone()
     return data
 
@@ -104,6 +105,10 @@ def load(booster: Booster, model: nn.Module, optimizer: Optimizer, lr_scheduler:
     return running_states['epoch'], running_states['step'], running_states['sample_start_index']
 
 
+def _criterion(outputs, inputs):
+    return outputs.loss
+
+
 def main():
     # ==============================
     # Parse Arguments
@@ -112,7 +117,7 @@ def main():
     parser.add_argument('-c', '--config', type=str, default='7b', help='Model configuration')
     parser.add_argument('-p',
                         '--plugin',
-                        choices=['gemini', 'gemini_auto', 'zero2', 'zero2_cpu'],
+                        choices=['gemini', 'gemini_auto', 'zero2', 'zero2_cpu', 'hybrid_parallel'],
                         default='gemini',
                         help='Choose which plugin to use')
     parser.add_argument('-d',
@@ -142,13 +147,6 @@ def main():
     colossalai.launch_from_torch({})
     coordinator = DistCoordinator()
 
-    # ==============================
-    # Initialize Tensorboard
-    # ==============================
-    if coordinator.is_master():
-        os.makedirs(args.tensorboard_dir, exist_ok=True)
-        writer = SummaryWriter(args.tensorboard_dir)
-
     # ==============================
     # Initialize Booster
     # ==============================
@@ -170,11 +168,32 @@ def main():
                                     initial_scale=2**16,
                                     cpu_offload=True,
                                     max_norm=args.grad_clip)
+    elif args.plugin == 'hybrid_parallel':
+        # modify the parameters accordingly; the default configuration is for llama2-7b
+        plugin = HybridParallelPlugin(tp_size=4,
+                                      pp_size=2,
+                                      num_microbatches=None,
+                                      microbatch_size=1,
+                                      enable_jit_fused=False,
+                                      zero_stage=0,
+                                      precision='fp32',
+                                      initial_scale=1)
     else:
         raise ValueError(f'Unknown plugin {args.plugin}')
 
     booster = Booster(plugin=plugin)
 
+    use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
+    is_pp_last_stage = use_pipeline and booster.plugin.stage_manager.is_last_stage()
+    print_flag = (not use_pipeline and coordinator.is_master()) or (use_pipeline and is_pp_last_stage)
+
+    # ==============================
+    # Initialize Tensorboard
+    # ==============================
+    if print_flag:
+        os.makedirs(args.tensorboard_dir, exist_ok=True)
+        writer = SummaryWriter(args.tensorboard_dir)
+
     # ==============================
     # Initialize Tokenizer, Dataset and Dataloader
     # ==============================
@@ -188,12 +207,15 @@ def main():
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     drop_last=True,
-                                    collate_fn=partial(tokenize_batch, tokenizer=tokenizer, max_length=args.max_length))
+                                    collate_fn=partial(tokenize_batch_for_pretrain,
+                                                       tokenizer=tokenizer,
+                                                       max_length=args.max_length))
 
     # ==============================
     # Initialize Model, Optimizer and LR Scheduler
     # ==============================
     config = MODEL_CONFIGS[args.config]
+    # use lazy init when using GeminiPlugin
     init_ctx = LazyInitContext(
         default_device=get_current_device()) if isinstance(plugin, GeminiPlugin) else nullcontext()
 
@@ -236,27 +258,42 @@ def main():
         coordinator.print_on_master(f'Loaded checkpoint {args.load} at epoch {start_epoch} step {start_step}')
 
     num_steps_per_epoch = len(dataloader)
+
+    # if resume training, set the sampler start index to the correct value
     dataloader.sampler.set_start_index(sampler_start_idx)
     for epoch in range(start_epoch, args.num_epochs):
         dataloader.sampler.set_epoch(epoch)
-        with tqdm(enumerate(dataloader),
+        step_nums = num_steps_per_epoch - start_step
+        dataloader_iter = iter(dataloader)
+
+        with tqdm(range(step_nums),
                   desc=f'Epoch {epoch}',
-                  disable=not coordinator.is_master(),
+                  disable=not print_flag,
                   total=num_steps_per_epoch,
                   initial=start_step) as pbar:
-            for step, batch in pbar:
-                batch = {k: v.cuda() for k, v in batch.items()}
-                outputs = model(**batch)
-                loss = outputs[0]
-                booster.backward(loss, optimizer)
+            for step in pbar:
+                if use_pipeline:
+                    outputs = booster.execute_pipeline(dataloader_iter,
+                                                       model,
+                                                       _criterion,
+                                                       optimizer,
+                                                       return_loss=True,
+                                                       return_outputs=True)
+                    loss = outputs["loss"]
+                else:
+                    batch = next(dataloader_iter)
+                    outputs = model(**batch)
+                    loss = outputs[0]
+                    booster.backward(loss, optimizer)
+
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
 
-                all_reduce_mean(loss)
-                pbar.set_postfix({'loss': loss.item()})
-                if coordinator.is_master():
+                if not use_pipeline:
+                    all_reduce_mean(loss)
+                if print_flag:
+                    pbar.set_postfix({'loss': loss.item()})
                     writer.add_scalar('loss', loss.item(), epoch * num_steps_per_epoch + step)
 
                 if args.save_interval > 0 and (step + 1) % args.save_interval == 0:
diff --git a/examples/language/llama2/requirements.txt b/examples/language/llama2/requirements.txt
index 3ddf21ffe534..6b475682dad0 100644
--- a/examples/language/llama2/requirements.txt
+++ b/examples/language/llama2/requirements.txt
@@ -1,4 +1,4 @@
-colossalai>=0.3.0
+colossalai>=0.3.2
 datasets
 numpy
 torch>=1.12.0,<=2.0.0
diff --git a/examples/language/opt/README.md b/examples/language/opt/README.md
index 37e1ff4d9008..af1e794374ed 100644
--- a/examples/language/opt/README.md
+++ b/examples/language/opt/README.md
@@ -23,9 +23,9 @@ The following example of [Colossal-AI](https://github.com/hpcaitech/ColossalAI)
 ## Our Modifications
 
 We are using the pre-training weights of the OPT model provided by Hugging Face Hub on the raw WikiText-2 (no tokens were replaced before
-the tokenization). 
+the tokenization).
 
-We adapt the OPT training code to ColossalAI by leveraging [Boosting API](https://colossalai.org/docs/basics/booster_api) loaded with a chosen plugin, where each plugin corresponds to a specific kind of training strategy. This example supports plugins including TorchDDPPlugin, LowLevelZeroPlugin, and GeminiPlugin.
+We adapt the OPT training code to ColossalAI by leveraging [Boosting API](https://colossalai.org/docs/basics/booster_api) loaded with a chosen plugin, where each plugin corresponds to a specific kind of training strategy. This example supports plugins including TorchDDPPlugin, LowLevelZeroPlugin, HybridParallelPlugin and GeminiPlugin.
 
 ## Run Demo
 
@@ -48,6 +48,3 @@ You can run benchmark for OPT model by running the following script:
 bash run_benchmark.sh
 ```
 The script will test performance (throughput & peak memory usage) for each combination of hyperparameters. You can also play with this script to configure your set of hyperparameters for testing.
-
-
-
diff --git a/examples/language/opt/requirements.txt b/examples/language/opt/requirements.txt
index 4422216e6a1c..45bfbc37195f 100644
--- a/examples/language/opt/requirements.txt
+++ b/examples/language/opt/requirements.txt
@@ -1,4 +1,4 @@
-colossalai >= 0.1.12
+colossalai >= 0.3.2
 torch >= 1.8.1
 datasets >= 1.8.0
-transformers >= 4.20.0
\ No newline at end of file
+transformers >= 4.30.2