From 955091e848190c030e027248c89a1ac6cc0f5614 Mon Sep 17 00:00:00 2001 From: ver217 Date: Thu, 24 Aug 2023 11:56:51 +0800 Subject: [PATCH 01/12] [example] transfer llama-1 example --- examples/language/llama/README.md | 174 ++++++++++- examples/language/llama/attn.py | 80 +++++ examples/language/llama/benchmark.py | 198 +++++++++++++ .../3d/batch2_seq2048_flash_attn.sh | 19 ++ .../3d/batch2_seq2048_flash_attn_offload.sh | 19 ++ .../benchmark_65B/3d/hostfile_example.txt | 4 + .../gemini_auto/batch12_seq2048_flash_attn.sh | 18 ++ .../gemini_auto/batch2_seq2048_flash_attn.sh | 18 ++ .../gemini_auto/batch4_seq1024.sh | 18 ++ .../gemini_auto/hostfile_example.txt | 4 + .../gemini_cuda/batch16_seq512.sh | 18 ++ .../gemini_cuda/hostfile_example.txt | 4 + .../benchmark_7B/gemini_auto/batch8_seq512.sh | 16 + .../gemini_auto/hostfile_example.txt | 4 + .../gemini_cuda/batch16_seq512.sh | 18 ++ .../gemini_cuda/hostfile_example.txt | 4 + examples/language/llama/data_utils.py | 119 ++++++++ examples/language/llama/model_utils.py | 32 ++ .../language/llama/performance_evaluator.py | 102 +++++++ examples/language/llama/pretrain.py | 273 ++++++++++++++++++ examples/language/llama/requirements.txt | 6 + 21 files changed, 1146 insertions(+), 2 deletions(-) create mode 100644 examples/language/llama/attn.py create mode 100644 examples/language/llama/benchmark.py create mode 100644 examples/language/llama/benchmark_65B/3d/batch2_seq2048_flash_attn.sh create mode 100644 examples/language/llama/benchmark_65B/3d/batch2_seq2048_flash_attn_offload.sh create mode 100644 examples/language/llama/benchmark_65B/3d/hostfile_example.txt create mode 100644 examples/language/llama/benchmark_65B/gemini_auto/batch12_seq2048_flash_attn.sh create mode 100644 examples/language/llama/benchmark_65B/gemini_auto/batch2_seq2048_flash_attn.sh create mode 100644 examples/language/llama/benchmark_65B/gemini_auto/batch4_seq1024.sh create mode 100644 
examples/language/llama/benchmark_65B/gemini_auto/hostfile_example.txt create mode 100644 examples/language/llama/benchmark_65B/gemini_cuda/batch16_seq512.sh create mode 100644 examples/language/llama/benchmark_65B/gemini_cuda/hostfile_example.txt create mode 100644 examples/language/llama/benchmark_7B/gemini_auto/batch8_seq512.sh create mode 100644 examples/language/llama/benchmark_7B/gemini_auto/hostfile_example.txt create mode 100644 examples/language/llama/benchmark_7B/gemini_cuda/batch16_seq512.sh create mode 100644 examples/language/llama/benchmark_7B/gemini_cuda/hostfile_example.txt create mode 100644 examples/language/llama/data_utils.py create mode 100644 examples/language/llama/model_utils.py create mode 100644 examples/language/llama/performance_evaluator.py create mode 100644 examples/language/llama/pretrain.py create mode 100644 examples/language/llama/requirements.txt diff --git a/examples/language/llama/README.md b/examples/language/llama/README.md index 871804f2ca86..04645ebecfd8 100644 --- a/examples/language/llama/README.md +++ b/examples/language/llama/README.md @@ -1,11 +1,181 @@ # Pretraining LLaMA: best practices for building LLaMA-like base models

- +

- 65-billion-parameter large model pretraining accelerated by 38% [[code]](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama) [[blog]](https://www.hpc-ai.tech/blog/large-model-pretraining) -> Since the main branch is being updated, in order to maintain the stability of the code, this example is temporarily kept as an [independent branch](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama). + +## Dataset + +Different from the original LLaMA, we use the [RedPajama](https://www.together.xyz/blog/redpajama) dataset, which is a reproduction of the LLaMA training dataset containing over 1.2 trillion tokens. The full dataset is ~5TB unzipped on disk and ~3TB to download compressed. + +A smaller, more consumable random sample can be downloaded through [Hugging Face](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T). If you just want to try out the pretraining script, you can use a 1B-token sample subset of RedPajama, which is available at [Hugging Face](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample). + +RedPajama-Data-1T consists of seven data slices: + +| | RedPajama | LLaMA | +|---------------|--------------|---------------| +| CommonCrawl | 878 billion | 852 billion | +| C4 | 175 billion | 190 billion | +| Github | 59 billion | 100 billion | +| Books | 26 billion | 25 billion | +| ArXiv | 28 billion | 33 billion | +| Wikipedia | 24 billion | 25 billion | +| StackExchange | 20 billion | 27 billion | +| Total | 1.2 trillion | 1.25 trillion | + +## Training + +We follow the hyperparameter settings from the original LLaMA paper. We use AdamW with $\beta_1=0.9$ and $\beta_2=0.95$. We use a cosine learning rate schedule, such that the final learning rate is equal to 10% of the maximal learning rate. We use a weight decay of 0.1 and gradient clipping of 1.0. We use 2,000 warmup steps.
+ +| params | learning rate | batch size | +|--------|---------------|------------| +| 6.7B | 3.0e-4 | 4M | +| 13.0B | 3.0e-4 | 4M | +| 32.5B | 1.5e-4 | 4M | +| 65.2B | 1.5e-4 | 4M | + +## Usage + +### 1. Installation + +You should install ColossalAI of this branch from source. + +```bash +git clone -b example/llama https://github.com/hpcaitech/ColossalAI.git +``` + +At the root directory of ColossalAI, run + +```bash +CUDA_EXT=1 pip install . +``` + +Then install other dependencies. + +```bash +pip install -r requirements.txt +``` + +If you want to use flash attention, which can accelerate training while saving memory, you should install: +```bash +pip install xformers +``` + +Additionally, we recommend using torch 1.13.1. We've tested our code on torch 1.13.1 and found it's compatible with our code and xformers. + +### 2. Download the dataset + +The dataset can be automatically downloaded by using `huggingface/datasets`. You can specify the dataset path by `-d` or `--dataset`. The default dataset is `togethercomputer/RedPajama-Data-1T-Sample`. + +### 3. Command line arguments + +You can use `colossalai run` to launch multi-node training: +```bash +colossalai run --nproc_per_node YOUR_GPU_PER_NODE --hostfile YOUR_HOST_FILE \ +--master_addr YOUR_MASTER_ADDR pretrain.py --OTHER_CONFIGURATIONS +``` + +Here is a sample hostfile: + +```text +hostname1 +hostname2 +hostname3 +hostname4 +``` + +Make sure the master node can access all nodes (including itself) via SSH without a password. + +Here are the details of the CLI arguments: + +- Model configuration: `-c`, `--config`. `7b`, `13b`, `30b` and `65b` are supported. +- Booster plugin: `-p`, `--plugin`. `gemini`, `gemini_cpu`, `zero2` and `zero2_cpu` are supported. For more details, please refer to [Booster plugins](https://colossalai.org/docs/basics/booster_plugins). +- Dataset path: `-d`, `--dataset`. The default dataset is `togethercomputer/RedPajama-Data-1T-Sample`.
It supports any dataset from `datasets` with the same data format as RedPajama. +- Number of epochs: `-e`, `--num_epochs`. The default value is 1. +- Local batch size: `-b`, `--batch_size`. Batch size per GPU. The default value is 2. +- Learning rate: `--lr`. The default value is 3e-4. +- Weight decay: `-w`, `--weight_decay`. The default value is 0.1. +- Warmup steps: `-s`, `--warmup_steps`. The default value is 2000. +- Gradient checkpointing: `-g`, `--gradient_checkpoint`. The default value is `False`. This saves memory at the cost of speed. You'd better enable this option when training with a large batch size. +- Max length: `-l`, `--max_length`. The default value is 2048. +- Mixed precision: `-x`, `--mixed_precision`. The default value is "fp16". "fp16" and "bf16" are supported. +- Save interval: `-i`, `--save_interval`. The interval (steps) of saving checkpoints. The default value is 1000. +- Checkpoint directory: `-o`, `--save_dir`. The directory path to save checkpoints. The default value is `checkpoint`. +- Checkpoint to load: `-f`, `--load`. The checkpoint path to load. The default value is `None`. +- Gradient clipping: `--gradient_clipping`. The default value is 1.0. +- Tensorboard log directory: `-t`, `--tensorboard_dir`. The directory path to save tensorboard logs. The default value is `tb_logs`. +- Flash attention: `-a`, `--flash_attention`. If you want to use flash attention, you must install [xformers](https://github.com/facebookresearch/xformers) first. The default value is `False`. This is helpful to accelerate training while saving memory. We recommend you always use flash attention. + + +### 4. Shell Script Examples + +For your convenience, we provide some shell scripts to run benchmarks with various gemini configurations. +You can find them in the `benchmark_65B` and `benchmark_7B` directories.
The main command should be in the format of: +```bash +colossalai run --nproc_per_node YOUR_GPU_PER_NODE --hostfile YOUR_HOST_FILE \ +--master_addr YOUR_MASTER_ADDR benchmark.py --OTHER_CONFIGURATIONS +``` +Here we will show an example of how to run training +llama pretraining with `gemini(gemini_auto plugin), batch_size=12, sequence_length=2048, gradient_checkpoint=True`. + +#### a. Running environment +This experiment was performed on 4 computing nodes with 32 A800 GPUs in total. The nodes are +connected with RDMA and GPUs within one node are fully connected with NVLink. + +#### b. Running command +```bash +cd examples/language/llama/benchmark_65B/gemini_auto/ +# First, modify hostfile_example.txt with your real host ip or host name. +# Second, replace the hostfile path and the master address in the shell. +# Third, add the system environment variables and load the running Python environment to the shell +# if needed. +bash batch12_seq2048_flash_attn.sh +``` +#### c. Results +If you run the above command successfully, you will get the following results: +`max memory usage: 58500.20 MB, throughput: 5.29 samples/s, TFLOPS/GPU: 176.84`. 
+ + +## Reference +``` +@article{bian2021colossal, + title={Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training}, + author={Bian, Zhengda and Liu, Hongxin and Wang, Boxiang and Huang, Haichen and Li, Yongbin and Wang, Chuanrui and Cui, Fan and You, Yang}, + journal={arXiv preprint arXiv:2110.14883}, + year={2021} +} +``` + +```bibtex +@software{openlm2023openllama, + author = {Geng, Xinyang and Liu, Hao}, + title = {OpenLLaMA: An Open Reproduction of LLaMA}, + month = May, + year = 2023, + url = {https://github.com/openlm-research/open_llama} +} +``` + +```bibtex +@software{together2023redpajama, + author = {Together Computer}, + title = {RedPajama-Data: An Open Source Recipe to Reproduce LLaMA training dataset}, + month = April, + year = 2023, + url = {https://github.com/togethercomputer/RedPajama-Data} +} +``` + +```bibtex +@article{touvron2023llama, + title={Llama: Open and efficient foundation language models}, + author={Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and others}, + journal={arXiv preprint arXiv:2302.13971}, + year={2023} +} +``` diff --git a/examples/language/llama/attn.py b/examples/language/llama/attn.py new file mode 100644 index 000000000000..d0f442e3b436 --- /dev/null +++ b/examples/language/llama/attn.py @@ -0,0 +1,80 @@ +import math +from types import MethodType +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +from transformers.models.llama.modeling_llama import LlamaAttention, apply_rotary_pos_emb + +SUPPORT_XFORMERS = False +SUPPORT_FLASH2 = False +try: + import xformers.ops as xops + SUPPORT_XFORMERS = True +except ImportError: + pass + +try: + from flash_attn import flash_attn_func, flash_attn_qkvpacked_func + SUPPORT_FLASH2 = True +except ImportError: + pass + +SUPPORT_FLASH = SUPPORT_XFORMERS or SUPPORT_FLASH2 + + +def 
llama_flash_attention( + self: LlamaAttention, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + # q, k, v is [B, H, S, K] and xformers need [B, S, H, K]. 
returns [B, S, H, K] + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + if SUPPORT_FLASH2: + attn_output = flash_attn_func(query_states, key_states, value_states, causal=True) + else: + attn_output = xops.memory_efficient_attention(query_states, + key_states, + value_states, + attn_bias=xops.LowerTriangularMask()) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +def replace_xformers(model: nn.Module): + for module in model.modules(): + if isinstance(module, LlamaAttention): + module.forward = MethodType(llama_flash_attention, module) diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py new file mode 100644 index 000000000000..fc49c3bdc801 --- /dev/null +++ b/examples/language/llama/benchmark.py @@ -0,0 +1,198 @@ +import argparse +import resource +from contextlib import nullcontext + +import torch +from attn import SUPPORT_FLASH, replace_xformers +from data_utils import RandomDataset +from model_utils import format_numel_str, get_model_numel +from performance_evaluator import PerformanceEvaluator +from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision +from tqdm import tqdm +from transformers.models.llama.configuration_llama import LlamaConfig +from transformers.models.llama.modeling_llama import LlamaForCausalLM + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, ThreeDimParallelPlugin, TorchFSDPPlugin +from colossalai.cluster import DistCoordinator +from colossalai.lazy import LazyInitContext +from colossalai.nn.optimizer import HybridAdam +from colossalai.utils import get_current_device +from colossalai.zero.gemini.placement_policy import AutoPlacementPolicy, ConstPlacementPolicy 
+ +# ============================== +# Constants +# ============================== + +MODEL_CONFIGS = { + '7b': LlamaConfig(), + '13b': LlamaConfig(hidden_size=5120, intermediate_size=13760, num_hidden_layers=40, num_attention_heads=40), + '30b': LlamaConfig(hidden_size=6656, intermediate_size=17888, num_hidden_layers=60, num_attention_heads=52), + '65b': LlamaConfig(hidden_size=8192, intermediate_size=22016, num_hidden_layers=80, num_attention_heads=64), +} + + +def main(): + # ============================== + # Parse Arguments + # ============================== + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, default='7b', help='Model configuration') + parser.add_argument('-p', + '--plugin', + choices=['gemini', 'gemini_cuda', 'gemini_cpu', 'fsdp', 'fsdp_cpu', '3d', '3d_cpu'], + default='gemini', + help='Choose which plugin to use') + parser.add_argument('-b', '--batch_size', type=int, default=2, help='Batch size') + parser.add_argument('-s', '--num_steps', type=int, default=5, help='Number of steps to run') + parser.add_argument('-i', '--ignore_steps', type=int, default=2, help='Number of steps to ignore') + parser.add_argument('-g', '--grad_checkpoint', action='store_true', help='Use gradient checkpointing') + parser.add_argument('-l', '--max_length', type=int, default=2048, help='Max sequence length') + parser.add_argument('-w', '--warmup_ratio', type=float, default=0.8, help='warm up ratio for auto placement policy') + parser.add_argument('-m', '--memory_limit', type=int, help='Gemini memory limit in mb') + parser.add_argument('-x', '--xformers', action='store_true', help='Use xformers') + parser.add_argument('--tp', type=int, default=1, help='Tensor parallel size') + parser.add_argument('--pp', type=int, default=1, help='Pipeline parallel size') + parser.add_argument('--edp', type=int, default=1, help='Extra data parallel size') + parser.add_argument('--mbs', type=int, default=1) + parser.add_argument('--zero', type=int, 
default=0) + args = parser.parse_args() + + colossalai.launch_from_torch({}) + coordinator = DistCoordinator() + + def empty_init(): + pass + + # ============================== + # Initialize Booster + # ============================== + use_empty_init = True + if args.plugin == 'gemini': + AutoPlacementPolicy.set_warmup_non_model_data_ratio(args.warmup_ratio) + plugin = GeminiPlugin(placement_policy='auto', precision='bf16', extra_dp_size=args.edp) + elif args.plugin == 'gemini_cuda': + plugin = GeminiPlugin(placement_policy='cuda', precision='bf16', extra_dp_size=args.edp) + elif args.plugin == 'gemini_cpu': + plugin = GeminiPlugin(placement_policy='cpu', precision='bf16', extra_dp_size=args.edp) + elif args.plugin == 'const': + ConstPlacementPolicy.set_const_memory_boundary(args.memory_limit) + plugin = GeminiPlugin(placement_policy='const', precision='bf16') + elif args.plugin == 'fsdp': + if use_empty_init: + plugin = TorchFSDPPlugin( + mixed_precision=MixedPrecision(param_dtype=torch.float16, + reduce_dtype=torch.float16, + buffer_dtype=torch.float16), + param_init_fn=empty_init(), + ) + else: + plugin = TorchFSDPPlugin(mixed_precision=MixedPrecision( + param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16)) + elif args.plugin == 'fsdp_cpu': + if use_empty_init: + plugin = TorchFSDPPlugin( + mixed_precision=MixedPrecision(param_dtype=torch.float16, + reduce_dtype=torch.float16, + buffer_dtype=torch.float16), + cpu_offload=CPUOffload(offload_params=True), + param_init_fn=empty_init(), + ) + else: + plugin = TorchFSDPPlugin(mixed_precision=MixedPrecision(param_dtype=torch.float16, + reduce_dtype=torch.float16, + buffer_dtype=torch.float16), + cpu_offload=CPUOffload(offload_params=True)) + elif args.plugin == '3d': + plugin = ThreeDimParallelPlugin(tp_size=args.tp, + pp_size=args.pp, + zero_stage=args.zero, + enable_fused_normalization=True, + num_microbatches=args.mbs, + precision='bf16') + elif args.plugin == '3d_cpu': + plugin = 
ThreeDimParallelPlugin(tp_size=args.tp, + pp_size=args.pp, + zero_stage=args.zero, + cpu_offload=True, + enable_fused_normalization=True, + num_microbatches=args.mbs, + initial_scale=2**8, + precision='bf16') + else: + raise ValueError(f'Unknown plugin {args.plugin}') + + booster = Booster(plugin=plugin) + + # ============================== + # Initialize Dataset and Dataloader + # ============================== + dp_size = plugin.dp_size if isinstance(plugin, ThreeDimParallelPlugin) else coordinator.world_size + + config = MODEL_CONFIGS[args.config] + dataset = RandomDataset(num_samples=args.batch_size * args.num_steps * dp_size, + max_length=args.max_length, + vocab_size=config.vocab_size) + dataloader = plugin.prepare_dataloader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True) + + # ============================== + # Initialize Model and Optimizer + # ============================== + init_ctx = LazyInitContext( + default_device=get_current_device()) if isinstance(plugin, + (GeminiPlugin, ThreeDimParallelPlugin)) else nullcontext() + + with init_ctx: + model = LlamaForCausalLM(config) + + if args.grad_checkpoint: + model.gradient_checkpointing_enable() + + if args.xformers: + assert SUPPORT_FLASH, 'Use flash attention while xfomers is not installed' + replace_xformers(model) + + model_numel = get_model_numel(model) + coordinator.print_on_master(f'Model params: {format_numel_str(model_numel)}') + performance_evaluator = PerformanceEvaluator(model_numel, + args.grad_checkpoint, + args.ignore_steps, + dp_world_size=dp_size) + + optimizer = HybridAdam(model.parameters()) + torch.set_default_dtype(torch.bfloat16) + model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader) + torch.set_default_dtype(torch.float) + coordinator.print_on_master(f'Booster init max CUDA memory: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB') + coordinator.print_on_master( + f'Booster init max CPU memory: 
{resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024:.2f} MB') + + if isinstance(plugin, ThreeDimParallelPlugin) and args.pp > 1: + data_iter = iter(dataloader) + for step in tqdm(range(len(dataloader)), desc='Step', disable=not coordinator.is_master()): + performance_evaluator.on_step_start(step) + booster.execute_pipeline(data_iter, + model, + criterion=lambda outputs, inputs: outputs[0], + optimizer=optimizer, + return_loss=False) + optimizer.step() + optimizer.zero_grad() + performance_evaluator.on_step_end(input_ids=torch.empty(args.batch_size, args.max_length)) + else: + for step, batch in enumerate(tqdm(dataloader, desc='Step', disable=not coordinator.is_master())): + performance_evaluator.on_step_start(step) + outputs = model(**batch) + loss = outputs[0] + booster.backward(loss, optimizer) + optimizer.step() + optimizer.zero_grad() + performance_evaluator.on_step_end(**batch) + + performance_evaluator.on_fit_end() + coordinator.print_on_master(f'Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB') + + +if __name__ == '__main__': + main() diff --git a/examples/language/llama/benchmark_65B/3d/batch2_seq2048_flash_attn.sh b/examples/language/llama/benchmark_65B/3d/batch2_seq2048_flash_attn.sh new file mode 100644 index 000000000000..80356a55709d --- /dev/null +++ b/examples/language/llama/benchmark_65B/3d/batch2_seq2048_flash_attn.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +################ +#Load your environments and modules here +################ + + +cd ../.. 
+ +# NCCL IB environment variables +export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 +export NCCL_IB_DISABLE=0 +export NCCL_SOCKET_IFNAME=eth0 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_TIMEOUT=23 +export NCCL_IB_RETRY_CNT=7 + +# 4-tp + 4-pp + 2-zero1, num microbatches=8 +colossalai run --nproc_per_node 8 --hostfile YOUR_HOST_FILE --master_addr YOUR_MASTER_ADDR benchmark.py -c '65b' --plugin "3d" -l 2048 -g -b 16 -x --tp 4 --pp 4 --zero 1 --mbs 8 diff --git a/examples/language/llama/benchmark_65B/3d/batch2_seq2048_flash_attn_offload.sh b/examples/language/llama/benchmark_65B/3d/batch2_seq2048_flash_attn_offload.sh new file mode 100644 index 000000000000..6a23b117fa76 --- /dev/null +++ b/examples/language/llama/benchmark_65B/3d/batch2_seq2048_flash_attn_offload.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +################ +#Load your environments and modules here +################ + + +cd ../.. + +# NCCL IB environment variables +export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 +export NCCL_IB_DISABLE=0 +export NCCL_SOCKET_IFNAME=eth0 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_TIMEOUT=23 +export NCCL_IB_RETRY_CNT=7 + +# 4-tp + 4-pp + 2-zero1, num microbatches=8, cpu offload is enabled +colossalai run --nproc_per_node 8 --hostfile YOUR_HOST_FILE --master_addr YOUR_MASTER_ADDR benchmark.py -c '65b' --plugin "3d_cpu" -l 2048 -g -b 16 -x --tp 4 --pp 4 --zero 1 --mbs 8 diff --git a/examples/language/llama/benchmark_65B/3d/hostfile_example.txt b/examples/language/llama/benchmark_65B/3d/hostfile_example.txt new file mode 100644 index 000000000000..4150e1be488e --- /dev/null +++ b/examples/language/llama/benchmark_65B/3d/hostfile_example.txt @@ -0,0 +1,4 @@ +host1 +host2 +host3 +host4 diff --git a/examples/language/llama/benchmark_65B/gemini_auto/batch12_seq2048_flash_attn.sh b/examples/language/llama/benchmark_65B/gemini_auto/batch12_seq2048_flash_attn.sh new file mode 100644 index 000000000000..22d757fe2196 --- /dev/null +++ 
b/examples/language/llama/benchmark_65B/gemini_auto/batch12_seq2048_flash_attn.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +################ +#Load your environments and modules here +################ + + +cd ../.. + +# NCCL IB environment variables +export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 +export NCCL_IB_DISABLE=0 +export NCCL_SOCKET_IFNAME=eth0 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_TIMEOUT=23 +export NCCL_IB_RETRY_CNT=7 + +colossalai run --nproc_per_node 8 --hostfile YOUR_HOST_FILE --master_addr YOUR_MASTER_ADDR benchmark.py -c '65b' --plugin "gemini" -l 2048 -g -b 12 -x diff --git a/examples/language/llama/benchmark_65B/gemini_auto/batch2_seq2048_flash_attn.sh b/examples/language/llama/benchmark_65B/gemini_auto/batch2_seq2048_flash_attn.sh new file mode 100644 index 000000000000..4c1f4125c8e9 --- /dev/null +++ b/examples/language/llama/benchmark_65B/gemini_auto/batch2_seq2048_flash_attn.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +################ +#Load your environments and modules here +################ + + +cd ../.. + +# NCCL IB environment variables +export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 +export NCCL_IB_DISABLE=0 +export NCCL_SOCKET_IFNAME=eth0 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_TIMEOUT=23 +export NCCL_IB_RETRY_CNT=7 + +colossalai run --nproc_per_node 8 --hostfile YOUR_HOST_FILE --master_addr YOUR_MASTER_ADDR benchmark.py -c '65b' --plugin "gemini" -l 2048 -g -b 2 -x diff --git a/examples/language/llama/benchmark_65B/gemini_auto/batch4_seq1024.sh b/examples/language/llama/benchmark_65B/gemini_auto/batch4_seq1024.sh new file mode 100644 index 000000000000..05b1ed1913f2 --- /dev/null +++ b/examples/language/llama/benchmark_65B/gemini_auto/batch4_seq1024.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +################ +#Load your environments and modules here +################ + + +cd ../.. 
+ +# NCCL IB environment variables +export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 +export NCCL_IB_DISABLE=0 +export NCCL_SOCKET_IFNAME=eth0 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_TIMEOUT=23 +export NCCL_IB_RETRY_CNT=7 + +colossalai run --nproc_per_node 8 --hostfile YOUR_HOST_FILE --master_addr YOUR_MASTER_ADDR benchmark.py -c '65b' --plugin "gemini" -l 1024 -g -b 4 diff --git a/examples/language/llama/benchmark_65B/gemini_auto/hostfile_example.txt b/examples/language/llama/benchmark_65B/gemini_auto/hostfile_example.txt new file mode 100644 index 000000000000..4150e1be488e --- /dev/null +++ b/examples/language/llama/benchmark_65B/gemini_auto/hostfile_example.txt @@ -0,0 +1,4 @@ +host1 +host2 +host3 +host4 diff --git a/examples/language/llama/benchmark_65B/gemini_cuda/batch16_seq512.sh b/examples/language/llama/benchmark_65B/gemini_cuda/batch16_seq512.sh new file mode 100644 index 000000000000..b46b8a5a2023 --- /dev/null +++ b/examples/language/llama/benchmark_65B/gemini_cuda/batch16_seq512.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +################ +#Load your environments and modules here +################ + + +cd ../.. 
+ +# NCCL IB environment variables +export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 +export NCCL_IB_DISABLE=0 +export NCCL_SOCKET_IFNAME=eth0 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_TIMEOUT=23 +export NCCL_IB_RETRY_CNT=7 + +colossalai run --nproc_per_node 8 --hostfile YOUR_HOST_FILE --master_addr YOUR_MASTER_ADDR benchmark.py -c '65b' --plugin "gemini_cuda" -l 512 -g -b 16 diff --git a/examples/language/llama/benchmark_65B/gemini_cuda/hostfile_example.txt b/examples/language/llama/benchmark_65B/gemini_cuda/hostfile_example.txt new file mode 100644 index 000000000000..4150e1be488e --- /dev/null +++ b/examples/language/llama/benchmark_65B/gemini_cuda/hostfile_example.txt @@ -0,0 +1,4 @@ +host1 +host2 +host3 +host4 diff --git a/examples/language/llama/benchmark_7B/gemini_auto/batch8_seq512.sh b/examples/language/llama/benchmark_7B/gemini_auto/batch8_seq512.sh new file mode 100644 index 000000000000..d9413ef43e28 --- /dev/null +++ b/examples/language/llama/benchmark_7B/gemini_auto/batch8_seq512.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +################ +#Load your environments and modules here +################ + +cd ../.. 
+ +export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 +export NCCL_IB_DISABLE=0 +export NCCL_SOCKET_IFNAME=eth0 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_TIMEOUT=23 +export NCCL_IB_RETRY_CNT=7 + +colossalai run --nproc_per_node 8 --hostfile YOUR_HOST_FILE --master_addr YOUR_MASTER_ADDR benchmark.py --plugin "gemini" -l 512 -g -b 8 diff --git a/examples/language/llama/benchmark_7B/gemini_auto/hostfile_example.txt b/examples/language/llama/benchmark_7B/gemini_auto/hostfile_example.txt new file mode 100644 index 000000000000..4150e1be488e --- /dev/null +++ b/examples/language/llama/benchmark_7B/gemini_auto/hostfile_example.txt @@ -0,0 +1,4 @@ +host1 +host2 +host3 +host4 diff --git a/examples/language/llama/benchmark_7B/gemini_cuda/batch16_seq512.sh b/examples/language/llama/benchmark_7B/gemini_cuda/batch16_seq512.sh new file mode 100644 index 000000000000..0d3628f6750a --- /dev/null +++ b/examples/language/llama/benchmark_7B/gemini_cuda/batch16_seq512.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +################ +#Load your environments and modules here +################ + + +cd ../.. 
+ +# NCCL IB environment variables +export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 +export NCCL_IB_DISABLE=0 +export NCCL_SOCKET_IFNAME=eth0 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_TIMEOUT=23 +export NCCL_IB_RETRY_CNT=7 + +colossalai run --nproc_per_node 8 --hostfile YOUR_HOST_FILE --master_addr YOUR_MASTER_ADDR benchmark.py -c '7b' --plugin "gemini_cuda" -l 512 -g -b 16 diff --git a/examples/language/llama/benchmark_7B/gemini_cuda/hostfile_example.txt b/examples/language/llama/benchmark_7B/gemini_cuda/hostfile_example.txt new file mode 100644 index 000000000000..4150e1be488e --- /dev/null +++ b/examples/language/llama/benchmark_7B/gemini_cuda/hostfile_example.txt @@ -0,0 +1,4 @@ +host1 +host2 +host3 +host4 diff --git a/examples/language/llama/data_utils.py b/examples/language/llama/data_utils.py new file mode 100644 index 000000000000..25d0e1bd9f46 --- /dev/null +++ b/examples/language/llama/data_utils.py @@ -0,0 +1,119 @@ +import json +import random +from typing import Iterator, Optional + +import numpy as np +import torch +from torch.distributed import ProcessGroup +from torch.distributed.distributed_c10d import _get_default_group +from torch.utils.data import DataLoader, Dataset, DistributedSampler + +from colossalai.utils import get_current_device + + +class StatefulDistributedSampler(DistributedSampler): + + def __init__(self, + dataset: Dataset, + num_replicas: Optional[int] = None, + rank: Optional[int] = None, + shuffle: bool = True, + seed: int = 0, + drop_last: bool = False) -> None: + super().__init__(dataset, num_replicas, rank, shuffle, seed, drop_last) + self.start_index: int = 0 + + def __iter__(self) -> Iterator: + iterator = super().__iter__() + indices = list(iterator) + indices = indices[self.start_index:] + return iter(indices) + + def __len__(self) -> int: + return self.num_samples - self.start_index + + def set_start_index(self, start_index: int) -> None: + self.start_index = start_index + + +def prepare_dataloader(dataset, + 
batch_size, + shuffle=False, + seed=1024, + drop_last=False, + pin_memory=False, + num_workers=0, + process_group: Optional[ProcessGroup] = None, + **kwargs): + r""" + Prepare a dataloader for distributed training. The dataloader will be wrapped by + `torch.utils.data.DataLoader` and `StatefulDistributedSampler`. + + Args: + dataset (`torch.utils.data.Dataset`): The dataset to be loaded. + batch_size (int): Batch size passed to the ``DataLoader`` of each process. + shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False. + seed (int, optional): Random worker seed for sampling, defaults to 1024. + process_group (ProcessGroup, optional): Process group whose size and rank configure the sampler. Defaults to the default process group. + drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size + is not divisible by the batch size. If False and the size of dataset is not divisible by + the batch size, then the last batch will be smaller, defaults to False. + pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False. + num_workers (int, optional): Number of worker subprocesses for this dataloader. Defaults to 0. + kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in + `DataLoader <https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader>`_. + + Returns: + :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing. 
+ """ + _kwargs = kwargs.copy() + process_group = process_group or _get_default_group() + sampler = StatefulDistributedSampler(dataset, + num_replicas=process_group.size(), + rank=process_group.rank(), + shuffle=shuffle) + + # Deterministic dataloader + def seed_worker(worker_id): + worker_seed = seed + np.random.seed(worker_seed) + torch.manual_seed(worker_seed) + random.seed(worker_seed) + + return DataLoader(dataset, + batch_size=batch_size, + sampler=sampler, + worker_init_fn=seed_worker, + drop_last=drop_last, + pin_memory=pin_memory, + num_workers=num_workers, + **_kwargs) + + +def load_json(file_path: str): + with open(file_path, 'r') as f: + return json.load(f) + + +def save_json(data, file_path: str): + with open(file_path, 'w') as f: + json.dump(data, f, indent=4) + + +class RandomDataset(Dataset): + + def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000): + self.num_samples = num_samples + self.max_length = max_length + self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device()) + self.attention_mask = torch.ones_like(self.input_ids) + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + return { + 'input_ids': self.input_ids[idx], + 'attention_mask': self.attention_mask[idx], + 'labels': self.input_ids[idx] + } diff --git a/examples/language/llama/model_utils.py b/examples/language/llama/model_utils.py new file mode 100644 index 000000000000..431ff5cfb446 --- /dev/null +++ b/examples/language/llama/model_utils.py @@ -0,0 +1,32 @@ +from contextlib import contextmanager + +import torch +import torch.nn as nn + + +@contextmanager +def low_precision_init(target_dtype: torch.dtype = torch.float16): + dtype = torch.get_default_dtype() + try: + torch.set_default_dtype(target_dtype) + yield + finally: + torch.set_default_dtype(dtype) + + +def get_model_numel(model: nn.Module) -> int: + return sum(p.numel() for p in model.parameters()) + + +def 
format_numel_str(numel: int) -> str: + B = 1024**3 + M = 1024**2 + K = 1024 + if numel >= B: + return f'{numel / B:.2f} B' + elif numel >= M: + return f'{numel / M:.2f} M' + elif numel >= K: + return f'{numel / K:.2f} K' + else: + return f'{numel}' diff --git a/examples/language/llama/performance_evaluator.py b/examples/language/llama/performance_evaluator.py new file mode 100644 index 000000000000..711b99c54360 --- /dev/null +++ b/examples/language/llama/performance_evaluator.py @@ -0,0 +1,102 @@ +from time import time +from typing import Optional + +import torch +import torch.distributed as dist +from torch import Tensor + +from colossalai.cluster import DistCoordinator + + +def divide(x: float, y: float) -> float: + if y == 0: + return float('inf') + elif y == float('inf'): + return float('nan') + return x / y + + +@torch.no_grad() +def all_reduce_mean(x: float, world_size: int) -> float: + if world_size == 1: + return x + tensor = torch.tensor([x], device=torch.cuda.current_device()) + dist.all_reduce(tensor) + tensor = tensor / world_size + return tensor.item() + + +class Timer: + + def __init__(self) -> None: + self.start_time: Optional[float] = None + self.duration: float = 0. + + def start(self) -> None: + self.start_time = time() + + def end(self) -> None: + assert self.start_time is not None + self.duration += time() - self.start_time + self.start_time = None + + def reset(self) -> None: + self.duration = 0. + + +class PerformanceEvaluator: + """ + Callback for evaluating the performance of the model. + Args: + model_numel: The number of parameters of the model. + enable_grad_checkpoint: Whether gradient checkpointing is enabled. When True, the FLOP + estimate uses a multiplier of 4 instead of 3. + ignore_steps: The number of initial steps to ignore when calculating the performance. + dp_world_size: The data parallel world size. Defaults to the world size reported by + the coordinator. 
+ """ + + def __init__(self, + model_numel: int, + enable_grad_checkpoint: bool = False, + ignore_steps: int = 0, + dp_world_size: Optional[int] = None) -> None: + self.model_numel = model_numel + self.enable_grad_checkpoint = enable_grad_checkpoint + self.ignore_steps = ignore_steps + + self.coordinator = DistCoordinator() + self.dp_world_size = dp_world_size or self.coordinator.world_size + self.disable: bool = False + self.timer = Timer() + self.num_samples: int = 0 + self.flop: int = 0 + + def on_step_start(self, step: int) -> None: + self.disable = self.ignore_steps > 0 and step < self.ignore_steps + if self.disable: + return + torch.cuda.synchronize() + self.timer.start() + + def on_step_end(self, input_ids: Tensor, **kwargs) -> None: + if self.disable: + return + torch.cuda.synchronize() + self.timer.end() + + batch_size, seq_len = input_ids.shape + + self.num_samples += batch_size + self.flop += batch_size * seq_len * self.model_numel * 2 * (3 + int(self.enable_grad_checkpoint)) + + def on_fit_end(self) -> None: + avg_duration = all_reduce_mean(self.timer.duration, self.coordinator.world_size) + avg_throughput = self.num_samples * self.dp_world_size / (avg_duration + 1e-12) + mp_world_size = self.coordinator.world_size // self.dp_world_size + avg_tflops_per_gpu = self.flop / 1e12 / (avg_duration + 1e-12) / mp_world_size + self.coordinator.print_on_master( + f'num_samples: {self.num_samples}, dp_world_size: {self.dp_world_size}, flop: {self.flop}, avg_duration: {avg_duration}, ' + f'avg_throughput: {avg_throughput}') + self.coordinator.print_on_master( + f'Throughput: {avg_throughput:.2f} samples/sec, TFLOPS per GPU: {avg_tflops_per_gpu:.2f}') diff --git a/examples/language/llama/pretrain.py b/examples/language/llama/pretrain.py new file mode 100644 index 000000000000..ce902b23bb50 --- /dev/null +++ b/examples/language/llama/pretrain.py @@ -0,0 +1,273 @@ +import argparse +import os +import resource +from contextlib import nullcontext +from functools import 
partial +from typing import Optional, Tuple + +import torch +import torch.distributed as dist +import torch.nn as nn +from attn import SUPPORT_XFORMERS, replace_xformers +from data_utils import load_json, prepare_dataloader, save_json +from datasets import load_dataset +from torch.optim import Optimizer +from torch.optim.lr_scheduler import _LRScheduler +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm +from transformers.models.llama.configuration_llama import LlamaConfig +from transformers.models.llama.modeling_llama import LlamaForCausalLM +from transformers.models.llama.tokenization_llama import LlamaTokenizer + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin +from colossalai.cluster import DistCoordinator +from colossalai.lazy import LazyInitContext +from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR +from colossalai.nn.optimizer import HybridAdam +from colossalai.utils import get_current_device + +MODEL_CONFIGS = { + '7b': LlamaConfig(), + '13b': LlamaConfig(hidden_size=5120, intermediate_size=13760, num_hidden_layers=40, num_attention_heads=40), + '30b': LlamaConfig(hidden_size=6656, intermediate_size=17888, num_hidden_layers=60, num_attention_heads=52), + '65b': LlamaConfig(hidden_size=8192, intermediate_size=22016, num_hidden_layers=80, num_attention_heads=64), +} + + +def get_model_numel(model: nn.Module) -> int: + return sum(p.numel() for p in model.parameters()) + + +def format_numel_str(numel: int) -> str: + B = 1024**3 + M = 1024**2 + K = 1024 + if numel >= B: + return f'{numel / B:.2f} B' + elif numel >= M: + return f'{numel / M:.2f} M' + elif numel >= K: + return f'{numel / K:.2f} K' + else: + return f'{numel}' + + +def tokenize_batch(batch, tokenizer: Optional[LlamaTokenizer] = None, max_length: int = 2048): + texts = [sample['text'] for sample in batch] + data = tokenizer(texts, return_tensors="pt", padding='max_length', 
truncation=True, max_length=max_length) + data['labels'] = data['input_ids'].clone() + return data + + +def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor: + dist.all_reduce(tensor, op=dist.ReduceOp.SUM) + tensor.div_(dist.get_world_size()) + return tensor + + +def save(booster: Booster, model: nn.Module, optimizer: Optimizer, lr_scheduler: _LRScheduler, epoch: int, step: int, + batch_size: int, coordinator: DistCoordinator, save_dir: str): + save_dir = os.path.join(save_dir, f'epoch{epoch}-step{step}') + os.makedirs(os.path.join(save_dir, 'model'), exist_ok=True) + + booster.save_model(model, os.path.join(save_dir, 'model'), shard=True) + # TODO: sharded optimizer is not supported yet + booster.save_optimizer(optimizer, os.path.join(save_dir, 'optimizer'), shard=False) + booster.save_lr_scheduler(lr_scheduler, os.path.join(save_dir, 'lr_scheduler')) + running_states = { + 'epoch': epoch, + 'step': step, + 'sample_start_index': step * batch_size, + } + if coordinator.is_master(): + save_json(running_states, os.path.join(save_dir, 'running_states.json')) + + +def load(booster: Booster, model: nn.Module, optimizer: Optimizer, lr_scheduler: _LRScheduler, + load_dir: str) -> Tuple[int, int, int]: + booster.load_model(model, os.path.join(load_dir, 'model')) + booster.load_optimizer(optimizer, os.path.join(load_dir, 'optimizer')) + booster.load_lr_scheduler(lr_scheduler, os.path.join(load_dir, 'lr_scheduler')) + running_states = load_json(os.path.join(load_dir, 'running_states.json')) + return running_states['epoch'], running_states['step'], running_states['sample_start_index'] + + +def main(): + # ============================== + # Parse Arguments + # ============================== + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, default='7b', help='Model configuration') + parser.add_argument('-p', + '--plugin', + choices=['gemini', 'gemini_cuda', 'gemini_cpu', 'zero2', 'zero2_cpu'], + default='gemini', + help='Choose which 
plugin to use') + parser.add_argument('-d', + '--dataset', + type=str, + default='togethercomputer/RedPajama-Data-1T-Sample', + help='Data set path') + parser.add_argument('-e', '--num_epochs', type=int, default=1, help='Number of epochs') + parser.add_argument('-b', '--batch_size', type=int, default=2, help='Local batch size') + parser.add_argument('--lr', type=float, default=3e-4, help='Learning rate') + parser.add_argument('-w', '--weigth_decay', type=float, default=0.1, help='Weight decay') + parser.add_argument('-s', '--warmup_steps', type=int, default=2000, help='Warmup steps') + parser.add_argument('-g', '--grad_checkpoint', action='store_true', help='Use gradient checkpointing') + parser.add_argument('-l', '--max_length', type=int, default=2048, help='Max sequence length') + parser.add_argument('-x', '--mixed_precision', default='fp16', choices=['fp16', 'bf16'], help='Mixed precision') + parser.add_argument('-i', '--save_interval', type=int, default=1000, help='Save interval') + parser.add_argument('-o', '--save_dir', type=str, default='checkpoint', help='Checkpoint directory') + parser.add_argument('-f', '--load', type=str, default=None, help='Load checkpoint') + parser.add_argument('--grad_clip', type=float, default=1.0, help='Gradient clipping') + parser.add_argument('-t', '--tensorboard_dir', type=str, default='tb_logs', help='Tensorboard directory') + parser.add_argument('-a', '--flash_attention', action='store_true', help='Use Flash Attention') + args = parser.parse_args() + + # ============================== + # Initialize Distributed Training + # ============================== + colossalai.launch_from_torch({}) + coordinator = DistCoordinator() + + # ============================== + # Initialize Tensorboard + # ============================== + if coordinator.is_master(): + os.makedirs(args.tensorboard_dir, exist_ok=True) + writer = SummaryWriter(args.tensorboard_dir) + + # ============================== + # Initialize Booster + # 
============================== + if args.plugin == 'gemini': + plugin = GeminiPlugin(precision=args.mixed_precision, + placement_policy='auto', + initial_scale=2**16, + max_norm=args.grad_clip) + elif args.plugin == 'gemini_cuda': + plugin = GeminiPlugin(precision=args.mixed_precision, + placement_policy='cuda', + initial_scale=2**16, + max_norm=args.grad_clip) + elif args.plugin == 'gemini_cpu': + plugin = GeminiPlugin(precision=args.mixed_precision, + placement_policy='cpu', + initial_scale=2**16, + max_norm=args.grad_clip) + elif args.plugin == 'zero2': + plugin = LowLevelZeroPlugin(stage=2, + precision=args.mixed_precision, + initial_scale=2**16, + max_norm=args.grad_clip) + elif args.plugin == 'zero2_cpu': + plugin = LowLevelZeroPlugin(stage=2, + precision=args.mixed_precision, + initial_scale=2**16, + cpu_offload=True, + max_norm=args.grad_clip) + else: + raise ValueError(f'Unknown plugin {args.plugin}') + + booster = Booster(plugin=plugin) + + # ============================== + # Initialize Tokenizer, Dataset and Dataloader + # ============================== + tokenizer = LlamaTokenizer.from_pretrained('hf-internal-testing/llama-tokenizer') + # follows fast chat: https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py#L257 + tokenizer.pad_token = tokenizer.unk_token + + dataset = load_dataset(args.dataset) + train_ds = dataset['train'] + dataloader = prepare_dataloader(train_ds, + batch_size=args.batch_size, + shuffle=True, + drop_last=True, + collate_fn=partial(tokenize_batch, tokenizer=tokenizer, max_length=args.max_length)) + + # ============================== + # Initialize Model, Optimizer and LR Scheduler + # ============================== + config = MODEL_CONFIGS[args.config] + init_ctx = LazyInitContext( + default_device=get_current_device()) if isinstance(plugin, GeminiPlugin) else nullcontext() + + with init_ctx: + model = LlamaForCausalLM(config) + + if args.grad_checkpoint: + model.gradient_checkpointing_enable() + if 
args.flash_attention: + assert SUPPORT_XFORMERS, 'Use flash attention while xfomers is not installed' + replace_xformers(model) + + model_numel = get_model_numel(model) + coordinator.print_on_master(f'Model params: {format_numel_str(model_numel)}') + + optimizer = HybridAdam(model.parameters(), lr=args.lr, betas=(0.9, 0.95), weight_decay=args.weigth_decay) + lr_scheduler = CosineAnnealingWarmupLR(optimizer, + total_steps=args.num_epochs * len(dataloader), + warmup_steps=args.warmup_steps, + eta_min=0.1 * args.lr) + default_dtype = torch.float16 if args.mixed_precision == 'fp16' else torch.bfloat16 + torch.set_default_dtype(default_dtype) + model, optimizer, _, dataloader, lr_scheduler = booster.boost(model, + optimizer, + dataloader=dataloader, + lr_scheduler=lr_scheduler) + torch.set_default_dtype(torch.float) + + coordinator.print_on_master(f'Booster init max CUDA memory: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB') + coordinator.print_on_master( + f'Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024:.2f} MB') + + # load checkpoint if specified + start_epoch = 0 + start_step = 0 + sampler_start_idx = 0 + if args.load is not None: + coordinator.print_on_master('Loading checkpoint') + start_epoch, start_step, sampler_start_idx = load(booster, model, optimizer, lr_scheduler, args.load) + coordinator.print_on_master(f'Loaded checkpoint {args.load} at epoch {start_epoch} step {start_step}') + + num_steps_per_epoch = len(dataloader) + # if resume training, set the sampler start index to the correct value + dataloader.sampler.set_start_index(sampler_start_idx) + for epoch in range(start_epoch, args.num_epochs): + dataloader.sampler.set_epoch(epoch) + with tqdm(enumerate(dataloader), + desc=f'Epoch {epoch}', + disable=not coordinator.is_master(), + total=num_steps_per_epoch, + initial=start_step) as pbar: + for step, batch in pbar: + batch = {k: v.cuda() for k, v in batch.items()} + outputs = model(**batch) + loss = outputs[0] + 
booster.backward(loss, optimizer) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + all_reduce_mean(loss) + pbar.set_postfix({'loss': loss.item()}) + if coordinator.is_master(): + writer.add_scalar('loss', loss.item(), epoch * num_steps_per_epoch + step) + + if args.save_interval > 0 and (step + 1) % args.save_interval == 0: + coordinator.print_on_master(f'Saving checkpoint') + save(booster, model, optimizer, lr_scheduler, epoch, step + 1, args.batch_size, coordinator, + args.save_dir) + coordinator.print_on_master(f'Saved checkpoint at epoch {epoch} step {step + 1}') + # only the resumed epoch starts mid-way; later epochs start from the beginning, so reset the sampler start index and start step + dataloader.sampler.set_start_index(0) + start_step = 0 + + coordinator.print_on_master(f'Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB') + + +if __name__ == '__main__': + main() diff --git a/examples/language/llama/requirements.txt b/examples/language/llama/requirements.txt new file mode 100644 index 000000000000..30dd31681747 --- /dev/null +++ b/examples/language/llama/requirements.txt @@ -0,0 +1,6 @@ +colossalai>=0.3.0 +datasets +numpy +torch +tqdm +transformers From 45b9ad4775724397166d76c7e9a4448957cb97aa Mon Sep 17 00:00:00 2001 From: ver217 Date: Thu, 24 Aug 2023 12:00:10 +0800 Subject: [PATCH 02/12] [example] fit llama-2 --- examples/language/llama/attn.py | 15 +++++++++------ examples/language/llama/benchmark.py | 19 +++++++++++++++---- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/examples/language/llama/attn.py b/examples/language/llama/attn.py index d0f442e3b436..15f76647c87b 100644 --- a/examples/language/llama/attn.py +++ b/examples/language/llama/attn.py @@ -1,10 +1,9 @@ -import math from types import MethodType -from typing import Optional, Tuple, Union +from typing import Optional, Tuple import torch import torch.nn as nn -from transformers.models.llama.modeling_llama import LlamaAttention, apply_rotary_pos_emb +from 
transformers.models.llama.modeling_llama import LlamaAttention, apply_rotary_pos_emb, repeat_kv SUPPORT_XFORMERS = False SUPPORT_FLASH2 = False @@ -15,7 +14,7 @@ pass try: - from flash_attn import flash_attn_func, flash_attn_qkvpacked_func + from flash_attn import flash_attn_func SUPPORT_FLASH2 = True except ImportError: pass @@ -35,8 +34,8 @@ def llama_flash_attention( bsz, q_len, _ = hidden_states.size() query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) kv_seq_len = key_states.shape[-2] if past_key_value is not None: @@ -52,6 +51,10 @@ def llama_flash_attention( past_key_value = (key_states, value_states) if use_cache else None + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + # q, k, v is [B, H, S, K] and xformers need [B, S, H, K]. 
returns [B, S, H, K] query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py index fc49c3bdc801..0d52a1d9201c 100644 --- a/examples/language/llama/benchmark.py +++ b/examples/language/llama/benchmark.py @@ -26,10 +26,21 @@ # ============================== MODEL_CONFIGS = { - '7b': LlamaConfig(), - '13b': LlamaConfig(hidden_size=5120, intermediate_size=13760, num_hidden_layers=40, num_attention_heads=40), - '30b': LlamaConfig(hidden_size=6656, intermediate_size=17888, num_hidden_layers=60, num_attention_heads=52), - '65b': LlamaConfig(hidden_size=8192, intermediate_size=22016, num_hidden_layers=80, num_attention_heads=64), + '7b': + LlamaConfig(max_position_embeddings=4096), + '13b': + LlamaConfig(hidden_size=5120, + intermediate_size=13824, + num_hidden_layers=40, + num_attention_heads=40, + max_position_embeddings=4096), + '70b': + LlamaConfig(hidden_size=8192, + intermediate_size=28672, + num_hidden_layers=80, + num_attention_heads=64, + max_position_embeddings=4096, + num_key_value_heads=8), } From 515bf740a91d49a2940839fdc35c8acd4ffaf354 Mon Sep 17 00:00:00 2001 From: ver217 Date: Thu, 24 Aug 2023 12:01:52 +0800 Subject: [PATCH 03/12] [example] refactor scripts folder --- .../{ => scripts}/benchmark_65B/3d/batch2_seq2048_flash_attn.sh | 2 +- .../benchmark_65B/3d/batch2_seq2048_flash_attn_offload.sh | 2 +- .../llama/{ => scripts}/benchmark_65B/3d/hostfile_example.txt | 0 .../benchmark_65B/gemini_auto/batch12_seq2048_flash_attn.sh | 2 +- .../benchmark_65B/gemini_auto/batch2_seq2048_flash_attn.sh | 2 +- .../{ => scripts}/benchmark_65B/gemini_auto/batch4_seq1024.sh | 2 +- .../benchmark_65B/gemini_auto/hostfile_example.txt | 0 .../{ => scripts}/benchmark_65B/gemini_cuda/batch16_seq512.sh | 2 +- .../benchmark_65B/gemini_cuda/hostfile_example.txt | 0 .../{ => scripts}/benchmark_7B/gemini_auto/batch8_seq512.sh | 2 +- .../{ => 
scripts}/benchmark_7B/gemini_auto/hostfile_example.txt | 0 .../{ => scripts}/benchmark_7B/gemini_cuda/batch16_seq512.sh | 2 +- .../{ => scripts}/benchmark_7B/gemini_cuda/hostfile_example.txt | 0 13 files changed, 8 insertions(+), 8 deletions(-) rename examples/language/llama/{ => scripts}/benchmark_65B/3d/batch2_seq2048_flash_attn.sh (97%) rename examples/language/llama/{ => scripts}/benchmark_65B/3d/batch2_seq2048_flash_attn_offload.sh (97%) rename examples/language/llama/{ => scripts}/benchmark_65B/3d/hostfile_example.txt (100%) rename examples/language/llama/{ => scripts}/benchmark_65B/gemini_auto/batch12_seq2048_flash_attn.sh (97%) rename examples/language/llama/{ => scripts}/benchmark_65B/gemini_auto/batch2_seq2048_flash_attn.sh (97%) rename examples/language/llama/{ => scripts}/benchmark_65B/gemini_auto/batch4_seq1024.sh (97%) rename examples/language/llama/{ => scripts}/benchmark_65B/gemini_auto/hostfile_example.txt (100%) rename examples/language/llama/{ => scripts}/benchmark_65B/gemini_cuda/batch16_seq512.sh (97%) rename examples/language/llama/{ => scripts}/benchmark_65B/gemini_cuda/hostfile_example.txt (100%) rename examples/language/llama/{ => scripts}/benchmark_7B/gemini_auto/batch8_seq512.sh (97%) rename examples/language/llama/{ => scripts}/benchmark_7B/gemini_auto/hostfile_example.txt (100%) rename examples/language/llama/{ => scripts}/benchmark_7B/gemini_cuda/batch16_seq512.sh (97%) rename examples/language/llama/{ => scripts}/benchmark_7B/gemini_cuda/hostfile_example.txt (100%) diff --git a/examples/language/llama/benchmark_65B/3d/batch2_seq2048_flash_attn.sh b/examples/language/llama/scripts/benchmark_65B/3d/batch2_seq2048_flash_attn.sh similarity index 97% rename from examples/language/llama/benchmark_65B/3d/batch2_seq2048_flash_attn.sh rename to examples/language/llama/scripts/benchmark_65B/3d/batch2_seq2048_flash_attn.sh index 80356a55709d..fd530b9e5655 100644 --- a/examples/language/llama/benchmark_65B/3d/batch2_seq2048_flash_attn.sh +++ 
b/examples/language/llama/scripts/benchmark_65B/3d/batch2_seq2048_flash_attn.sh @@ -5,7 +5,7 @@ ################ -cd ../.. +cd ../../.. # NCCL IB environment variables export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 diff --git a/examples/language/llama/benchmark_65B/3d/batch2_seq2048_flash_attn_offload.sh b/examples/language/llama/scripts/benchmark_65B/3d/batch2_seq2048_flash_attn_offload.sh similarity index 97% rename from examples/language/llama/benchmark_65B/3d/batch2_seq2048_flash_attn_offload.sh rename to examples/language/llama/scripts/benchmark_65B/3d/batch2_seq2048_flash_attn_offload.sh index 6a23b117fa76..80b29bd8f46b 100644 --- a/examples/language/llama/benchmark_65B/3d/batch2_seq2048_flash_attn_offload.sh +++ b/examples/language/llama/scripts/benchmark_65B/3d/batch2_seq2048_flash_attn_offload.sh @@ -5,7 +5,7 @@ ################ -cd ../.. +cd ../../.. # NCCL IB environment variables export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 diff --git a/examples/language/llama/benchmark_65B/3d/hostfile_example.txt b/examples/language/llama/scripts/benchmark_65B/3d/hostfile_example.txt similarity index 100% rename from examples/language/llama/benchmark_65B/3d/hostfile_example.txt rename to examples/language/llama/scripts/benchmark_65B/3d/hostfile_example.txt diff --git a/examples/language/llama/benchmark_65B/gemini_auto/batch12_seq2048_flash_attn.sh b/examples/language/llama/scripts/benchmark_65B/gemini_auto/batch12_seq2048_flash_attn.sh similarity index 97% rename from examples/language/llama/benchmark_65B/gemini_auto/batch12_seq2048_flash_attn.sh rename to examples/language/llama/scripts/benchmark_65B/gemini_auto/batch12_seq2048_flash_attn.sh index 22d757fe2196..0a3944280c41 100644 --- a/examples/language/llama/benchmark_65B/gemini_auto/batch12_seq2048_flash_attn.sh +++ b/examples/language/llama/scripts/benchmark_65B/gemini_auto/batch12_seq2048_flash_attn.sh @@ -5,7 +5,7 @@ ################ -cd ../.. +cd ../../.. 
# NCCL IB environment variables export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 diff --git a/examples/language/llama/benchmark_65B/gemini_auto/batch2_seq2048_flash_attn.sh b/examples/language/llama/scripts/benchmark_65B/gemini_auto/batch2_seq2048_flash_attn.sh similarity index 97% rename from examples/language/llama/benchmark_65B/gemini_auto/batch2_seq2048_flash_attn.sh rename to examples/language/llama/scripts/benchmark_65B/gemini_auto/batch2_seq2048_flash_attn.sh index 4c1f4125c8e9..385589ddc488 100644 --- a/examples/language/llama/benchmark_65B/gemini_auto/batch2_seq2048_flash_attn.sh +++ b/examples/language/llama/scripts/benchmark_65B/gemini_auto/batch2_seq2048_flash_attn.sh @@ -5,7 +5,7 @@ ################ -cd ../.. +cd ../../.. # NCCL IB environment variables export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 diff --git a/examples/language/llama/benchmark_65B/gemini_auto/batch4_seq1024.sh b/examples/language/llama/scripts/benchmark_65B/gemini_auto/batch4_seq1024.sh similarity index 97% rename from examples/language/llama/benchmark_65B/gemini_auto/batch4_seq1024.sh rename to examples/language/llama/scripts/benchmark_65B/gemini_auto/batch4_seq1024.sh index 05b1ed1913f2..b377db01ba88 100644 --- a/examples/language/llama/benchmark_65B/gemini_auto/batch4_seq1024.sh +++ b/examples/language/llama/scripts/benchmark_65B/gemini_auto/batch4_seq1024.sh @@ -5,7 +5,7 @@ ################ -cd ../.. +cd ../../.. 
# NCCL IB environment variables export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 diff --git a/examples/language/llama/benchmark_65B/gemini_auto/hostfile_example.txt b/examples/language/llama/scripts/benchmark_65B/gemini_auto/hostfile_example.txt similarity index 100% rename from examples/language/llama/benchmark_65B/gemini_auto/hostfile_example.txt rename to examples/language/llama/scripts/benchmark_65B/gemini_auto/hostfile_example.txt diff --git a/examples/language/llama/benchmark_65B/gemini_cuda/batch16_seq512.sh b/examples/language/llama/scripts/benchmark_65B/gemini_cuda/batch16_seq512.sh similarity index 97% rename from examples/language/llama/benchmark_65B/gemini_cuda/batch16_seq512.sh rename to examples/language/llama/scripts/benchmark_65B/gemini_cuda/batch16_seq512.sh index b46b8a5a2023..ce15ca09f73a 100644 --- a/examples/language/llama/benchmark_65B/gemini_cuda/batch16_seq512.sh +++ b/examples/language/llama/scripts/benchmark_65B/gemini_cuda/batch16_seq512.sh @@ -5,7 +5,7 @@ ################ -cd ../.. +cd ../../.. 
# NCCL IB environment variables export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 diff --git a/examples/language/llama/benchmark_65B/gemini_cuda/hostfile_example.txt b/examples/language/llama/scripts/benchmark_65B/gemini_cuda/hostfile_example.txt similarity index 100% rename from examples/language/llama/benchmark_65B/gemini_cuda/hostfile_example.txt rename to examples/language/llama/scripts/benchmark_65B/gemini_cuda/hostfile_example.txt diff --git a/examples/language/llama/benchmark_7B/gemini_auto/batch8_seq512.sh b/examples/language/llama/scripts/benchmark_7B/gemini_auto/batch8_seq512.sh similarity index 97% rename from examples/language/llama/benchmark_7B/gemini_auto/batch8_seq512.sh rename to examples/language/llama/scripts/benchmark_7B/gemini_auto/batch8_seq512.sh index d9413ef43e28..8d78a659f1df 100644 --- a/examples/language/llama/benchmark_7B/gemini_auto/batch8_seq512.sh +++ b/examples/language/llama/scripts/benchmark_7B/gemini_auto/batch8_seq512.sh @@ -4,7 +4,7 @@ #Load your environments and modules here ################ -cd ../.. +cd ../../.. 
export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 export NCCL_IB_DISABLE=0 diff --git a/examples/language/llama/benchmark_7B/gemini_auto/hostfile_example.txt b/examples/language/llama/scripts/benchmark_7B/gemini_auto/hostfile_example.txt similarity index 100% rename from examples/language/llama/benchmark_7B/gemini_auto/hostfile_example.txt rename to examples/language/llama/scripts/benchmark_7B/gemini_auto/hostfile_example.txt diff --git a/examples/language/llama/benchmark_7B/gemini_cuda/batch16_seq512.sh b/examples/language/llama/scripts/benchmark_7B/gemini_cuda/batch16_seq512.sh similarity index 97% rename from examples/language/llama/benchmark_7B/gemini_cuda/batch16_seq512.sh rename to examples/language/llama/scripts/benchmark_7B/gemini_cuda/batch16_seq512.sh index 0d3628f6750a..04e857bbe28c 100644 --- a/examples/language/llama/benchmark_7B/gemini_cuda/batch16_seq512.sh +++ b/examples/language/llama/scripts/benchmark_7B/gemini_cuda/batch16_seq512.sh @@ -5,7 +5,7 @@ ################ -cd ../.. +cd ../../.. 
# NCCL IB environment variables export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 diff --git a/examples/language/llama/benchmark_7B/gemini_cuda/hostfile_example.txt b/examples/language/llama/scripts/benchmark_7B/gemini_cuda/hostfile_example.txt similarity index 100% rename from examples/language/llama/benchmark_7B/gemini_cuda/hostfile_example.txt rename to examples/language/llama/scripts/benchmark_7B/gemini_cuda/hostfile_example.txt From a088b6ddff49a30f77d8daf7f044cf08de654c52 Mon Sep 17 00:00:00 2001 From: ver217 Date: Thu, 24 Aug 2023 14:03:48 +0800 Subject: [PATCH 04/12] [example] fit new gemini plugin --- .../kernel/cuda_native/mha/mem_eff_attn.py | 15 ++--- examples/language/llama/benchmark.py | 66 ++++++++++--------- examples/language/llama/pretrain.py | 33 +++++----- 3 files changed, 59 insertions(+), 55 deletions(-) diff --git a/colossalai/kernel/cuda_native/mha/mem_eff_attn.py b/colossalai/kernel/cuda_native/mha/mem_eff_attn.py index e83beb8b2429..8a898080877c 100644 --- a/colossalai/kernel/cuda_native/mha/mem_eff_attn.py +++ b/colossalai/kernel/cuda_native/mha/mem_eff_attn.py @@ -2,7 +2,13 @@ HAS_MEM_EFF_ATTN = False try: - from xformers.ops.fmha import memory_efficient_attention + from xformers.ops.fmha import MemoryEfficientAttentionCutlassOp, memory_efficient_attention + from xformers.ops.fmha.attn_bias import ( + BlockDiagonalCausalMask, + BlockDiagonalMask, + LowerTriangularMask, + LowerTriangularMaskWithTensorBias, + ) HAS_MEM_EFF_ATTN = True except ImportError: warnings.warn('please install xformers from https://github.com/facebookresearch/xformers') @@ -16,13 +22,6 @@ from typing import Optional import torch - from xformers.ops.fmha import MemoryEfficientAttentionCutlassOp - from xformers.ops.fmha.attn_bias import ( - BlockDiagonalCausalMask, - BlockDiagonalMask, - LowerTriangularMask, - LowerTriangularMaskWithTensorBias, - ) from .utils import SeqLenInfo diff --git a/examples/language/llama/benchmark.py 
b/examples/language/llama/benchmark.py index 0d52a1d9201c..1b947cef9080 100644 --- a/examples/language/llama/benchmark.py +++ b/examples/language/llama/benchmark.py @@ -14,12 +14,11 @@ import colossalai from colossalai.booster import Booster -from colossalai.booster.plugin import GeminiPlugin, ThreeDimParallelPlugin, TorchFSDPPlugin +from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, TorchFSDPPlugin from colossalai.cluster import DistCoordinator from colossalai.lazy import LazyInitContext from colossalai.nn.optimizer import HybridAdam from colossalai.utils import get_current_device -from colossalai.zero.gemini.placement_policy import AutoPlacementPolicy, ConstPlacementPolicy # ============================== # Constants @@ -52,20 +51,26 @@ def main(): parser.add_argument('-c', '--config', type=str, default='7b', help='Model configuration') parser.add_argument('-p', '--plugin', - choices=['gemini', 'gemini_cuda', 'gemini_cpu', 'fsdp', 'fsdp_cpu', '3d', '3d_cpu'], + choices=['gemini', 'gemini_auto', 'fsdp', 'fsdp_cpu', '3d', '3d_cpu'], default='gemini', help='Choose which plugin to use') parser.add_argument('-b', '--batch_size', type=int, default=2, help='Batch size') parser.add_argument('-s', '--num_steps', type=int, default=5, help='Number of steps to run') parser.add_argument('-i', '--ignore_steps', type=int, default=2, help='Number of steps to ignore') parser.add_argument('-g', '--grad_checkpoint', action='store_true', help='Use gradient checkpointing') - parser.add_argument('-l', '--max_length', type=int, default=2048, help='Max sequence length') - parser.add_argument('-w', '--warmup_ratio', type=float, default=0.8, help='warm up ratio for auto placement policy') + parser.add_argument('-l', '--max_length', type=int, default=4096, help='Max sequence length') + parser.add_argument('-w', + '--warmup_ratio', + type=float, + default=0.8, + help='warm up ratio of non-model data. 
Only for gemini-auto') parser.add_argument('-m', '--memory_limit', type=int, help='Gemini memory limit in mb') parser.add_argument('-x', '--xformers', action='store_true', help='Use xformers') + parser.add_argument('--shard_param_frac', type=float, default=1.0, help='Shard param fraction. Only for gemini') + parser.add_argument('--offload_optim_frac', type=float, default=0.0, help='Offload optim fraction. Only for gemini') + parser.add_argument('--offload_param_frac', type=float, default=0.0, help='Offload param fraction. Only for gemini') parser.add_argument('--tp', type=int, default=1, help='Tensor parallel size') parser.add_argument('--pp', type=int, default=1, help='Pipeline parallel size') - parser.add_argument('--edp', type=int, default=1, help='Extra data parallel size') parser.add_argument('--mbs', type=int, default=1) parser.add_argument('--zero', type=int, default=0) args = parser.parse_args() @@ -81,15 +86,12 @@ def empty_init(): # ============================== use_empty_init = True if args.plugin == 'gemini': - AutoPlacementPolicy.set_warmup_non_model_data_ratio(args.warmup_ratio) - plugin = GeminiPlugin(placement_policy='auto', precision='bf16', extra_dp_size=args.edp) - elif args.plugin == 'gemini_cuda': - plugin = GeminiPlugin(placement_policy='cuda', precision='bf16', extra_dp_size=args.edp) - elif args.plugin == 'gemini_cpu': - plugin = GeminiPlugin(placement_policy='cpu', precision='bf16', extra_dp_size=args.edp) - elif args.plugin == 'const': - ConstPlacementPolicy.set_const_memory_boundary(args.memory_limit) - plugin = GeminiPlugin(placement_policy='const', precision='bf16') + plugin = GeminiPlugin(precision='bf16', + shard_param_frac=args.shard_param_frac, + offload_optim_frac=args.offload_optim_frac, + offload_param_frac=args.offload_param_frac) + elif args.plugin == 'gemini_auto': + plugin = GeminiPlugin(placement_policy='auto', precision='bf16', warmup_non_model_data_ratio=args.warmup_ratio) elif args.plugin == 'fsdp': if use_empty_init: 
plugin = TorchFSDPPlugin( @@ -116,21 +118,21 @@ def empty_init(): buffer_dtype=torch.float16), cpu_offload=CPUOffload(offload_params=True)) elif args.plugin == '3d': - plugin = ThreeDimParallelPlugin(tp_size=args.tp, - pp_size=args.pp, - zero_stage=args.zero, - enable_fused_normalization=True, - num_microbatches=args.mbs, - precision='bf16') + plugin = HybridParallelPlugin(tp_size=args.tp, + pp_size=args.pp, + zero_stage=args.zero, + enable_fused_normalization=True, + num_microbatches=args.mbs, + precision='bf16') elif args.plugin == '3d_cpu': - plugin = ThreeDimParallelPlugin(tp_size=args.tp, - pp_size=args.pp, - zero_stage=args.zero, - cpu_offload=True, - enable_fused_normalization=True, - num_microbatches=args.mbs, - initial_scale=2**8, - precision='bf16') + plugin = HybridParallelPlugin(tp_size=args.tp, + pp_size=args.pp, + zero_stage=args.zero, + cpu_offload=True, + enable_fused_normalization=True, + num_microbatches=args.mbs, + initial_scale=2**8, + precision='bf16') else: raise ValueError(f'Unknown plugin {args.plugin}') @@ -139,7 +141,7 @@ def empty_init(): # ============================== # Initialize Dataset and Dataloader # ============================== - dp_size = plugin.dp_size if isinstance(plugin, ThreeDimParallelPlugin) else coordinator.world_size + dp_size = plugin.dp_size if isinstance(plugin, HybridParallelPlugin) else coordinator.world_size config = MODEL_CONFIGS[args.config] dataset = RandomDataset(num_samples=args.batch_size * args.num_steps * dp_size, @@ -152,7 +154,7 @@ def empty_init(): # ============================== init_ctx = LazyInitContext( default_device=get_current_device()) if isinstance(plugin, - (GeminiPlugin, ThreeDimParallelPlugin)) else nullcontext() + (GeminiPlugin, HybridParallelPlugin)) else nullcontext() with init_ctx: model = LlamaForCausalLM(config) @@ -179,7 +181,7 @@ def empty_init(): coordinator.print_on_master( f'Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024:.2f} MB') - if 
isinstance(plugin, ThreeDimParallelPlugin) and args.pp > 1: + if isinstance(plugin, HybridParallelPlugin) and args.pp > 1: data_iter = iter(dataloader) for step in tqdm(range(len(dataloader)), desc='Step', disable=not coordinator.is_master()): performance_evaluator.on_step_start(step) diff --git a/examples/language/llama/pretrain.py b/examples/language/llama/pretrain.py index ce902b23bb50..da883d9853cf 100644 --- a/examples/language/llama/pretrain.py +++ b/examples/language/llama/pretrain.py @@ -29,10 +29,21 @@ from colossalai.utils import get_current_device MODEL_CONFIGS = { - '7b': LlamaConfig(), - '13b': LlamaConfig(hidden_size=5120, intermediate_size=13760, num_hidden_layers=40, num_attention_heads=40), - '30b': LlamaConfig(hidden_size=6656, intermediate_size=17888, num_hidden_layers=60, num_attention_heads=52), - '65b': LlamaConfig(hidden_size=8192, intermediate_size=22016, num_hidden_layers=80, num_attention_heads=64), + '7b': + LlamaConfig(max_position_embeddings=4096), + '13b': + LlamaConfig(hidden_size=5120, + intermediate_size=13824, + num_hidden_layers=40, + num_attention_heads=40, + max_position_embeddings=4096), + '70b': + LlamaConfig(hidden_size=8192, + intermediate_size=28672, + num_hidden_layers=80, + num_attention_heads=64, + max_position_embeddings=4096, + num_key_value_heads=8), } @@ -102,7 +113,7 @@ def main(): parser.add_argument('-c', '--config', type=str, default='7b', help='Model configuration') parser.add_argument('-p', '--plugin', - choices=['gemini', 'gemini_cuda', 'gemini_cpu', 'zero2', 'zero2_cpu'], + choices=['gemini', 'gemini_auto', 'zero2', 'zero2_cpu'], default='gemini', help='Choose which plugin to use') parser.add_argument('-d', @@ -143,20 +154,12 @@ def main(): # Initialize Booster # ============================== if args.plugin == 'gemini': + plugin = GeminiPlugin(precision=args.mixed_precision, initial_scale=2**16, max_norm=args.grad_clip) + elif args.plugin == 'gemini_auto': plugin = 
GeminiPlugin(precision=args.mixed_precision, placement_policy='auto', initial_scale=2**16, max_norm=args.grad_clip) - elif args.plugin == 'gemini_cuda': - plugin = GeminiPlugin(precision=args.mixed_precision, - placement_policy='cuda', - initial_scale=2**16, - max_norm=args.grad_clip) - elif args.plugin == 'gemini_cpu': - plugin = GeminiPlugin(precision=args.mixed_precision, - placement_policy='cpu', - initial_scale=2**16, - max_norm=args.grad_clip) elif args.plugin == 'zero2': plugin = LowLevelZeroPlugin(stage=2, precision=args.mixed_precision, From ed205fda23600e23668e639931ddf6f025d11111 Mon Sep 17 00:00:00 2001 From: ver217 Date: Thu, 24 Aug 2023 15:15:17 +0800 Subject: [PATCH 05/12] [cli] fix multinode runner --- colossalai/cli/launcher/run.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/colossalai/cli/launcher/run.py b/colossalai/cli/launcher/run.py index 5e74c2c4f5b8..d2d02811ac9d 100644 --- a/colossalai/cli/launcher/run.py +++ b/colossalai/cli/launcher/run.py @@ -265,6 +265,10 @@ def launch_multi_processes(args: Config) -> None: # establish remote connection runner.connect(host_info_list=active_device_pool, workdir=curr_path, env=env) + # overwrite master addr when num_nodes > 1 and not specified + if len(active_device_pool) > 1 and args.master_addr == "127.0.0.1": + args.master_addr = active_device_pool.hostinfo_list[0].hostname + # execute distributed launching command for node_id, hostinfo in enumerate(active_device_pool): cmd = get_launch_command(master_addr=args.master_addr, From c02c8f8bc625a12ddd1448cbf71a5c6adb80853c Mon Sep 17 00:00:00 2001 From: ver217 Date: Thu, 24 Aug 2023 17:26:03 +0800 Subject: [PATCH 06/12] [example] fit gemini optim checkpoint --- examples/language/llama/pretrain.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/language/llama/pretrain.py b/examples/language/llama/pretrain.py index da883d9853cf..0299f23f7ecf 100644 --- a/examples/language/llama/pretrain.py +++ 
b/examples/language/llama/pretrain.py @@ -84,8 +84,7 @@ def save(booster: Booster, model: nn.Module, optimizer: Optimizer, lr_scheduler: os.makedirs(os.path.join(save_dir, 'model'), exist_ok=True) booster.save_model(model, os.path.join(save_dir, 'model'), shard=True) - # TODO: sharded optimizer is not supported yet - booster.save_optimizer(optimizer, os.path.join(save_dir, 'optimizer'), shard=False) + booster.save_optimizer(optimizer, os.path.join(save_dir, 'optimizer'), shard=True) booster.save_lr_scheduler(lr_scheduler, os.path.join(save_dir, 'lr_scheduler')) running_states = { 'epoch': epoch, From 8e93a20d34e88d850939433f1bab2391c4f51eda Mon Sep 17 00:00:00 2001 From: ver217 Date: Thu, 24 Aug 2023 17:42:57 +0800 Subject: [PATCH 07/12] [example] refactor scripts --- .../3d/batch2_seq2048_flash_attn_offload.sh | 19 ------------------- .../benchmark_65B/3d/hostfile_example.txt | 4 ---- .../gemini_auto/batch12_seq2048_flash_attn.sh | 18 ------------------ .../gemini_auto/hostfile_example.txt | 4 ---- .../gemini_cuda/hostfile_example.txt | 4 ---- .../3d.sh} | 11 ++++++++--- .../gemini.sh} | 6 ++++-- .../gemini_auto.sh} | 6 ++++-- .../llama/scripts/benchmark_70B/hosts.txt | 4 ++++ .../gemini.sh} | 6 ++++-- .../batch16_seq512.sh => gemini_auto.sh} | 6 ++++-- .../benchmark_7B/gemini_auto/batch8_seq512.sh | 16 ---------------- .../gemini_auto/hostfile_example.txt | 4 ---- .../gemini_cuda/hostfile_example.txt | 4 ---- .../llama/scripts/benchmark_7B/hosts.txt | 4 ++++ 15 files changed, 32 insertions(+), 84 deletions(-) delete mode 100644 examples/language/llama/scripts/benchmark_65B/3d/batch2_seq2048_flash_attn_offload.sh delete mode 100644 examples/language/llama/scripts/benchmark_65B/3d/hostfile_example.txt delete mode 100644 examples/language/llama/scripts/benchmark_65B/gemini_auto/batch12_seq2048_flash_attn.sh delete mode 100644 examples/language/llama/scripts/benchmark_65B/gemini_auto/hostfile_example.txt delete mode 100644 
examples/language/llama/scripts/benchmark_65B/gemini_cuda/hostfile_example.txt rename examples/language/llama/scripts/{benchmark_65B/3d/batch2_seq2048_flash_attn.sh => benchmark_70B/3d.sh} (55%) rename examples/language/llama/scripts/{benchmark_65B/gemini_auto/batch4_seq1024.sh => benchmark_70B/gemini.sh} (66%) rename examples/language/llama/scripts/{benchmark_65B/gemini_cuda/batch16_seq512.sh => benchmark_70B/gemini_auto.sh} (65%) create mode 100644 examples/language/llama/scripts/benchmark_70B/hosts.txt rename examples/language/llama/scripts/{benchmark_65B/gemini_auto/batch2_seq2048_flash_attn.sh => benchmark_7B/gemini.sh} (66%) rename examples/language/llama/scripts/benchmark_7B/{gemini_cuda/batch16_seq512.sh => gemini_auto.sh} (65%) delete mode 100644 examples/language/llama/scripts/benchmark_7B/gemini_auto/batch8_seq512.sh delete mode 100644 examples/language/llama/scripts/benchmark_7B/gemini_auto/hostfile_example.txt delete mode 100644 examples/language/llama/scripts/benchmark_7B/gemini_cuda/hostfile_example.txt create mode 100644 examples/language/llama/scripts/benchmark_7B/hosts.txt diff --git a/examples/language/llama/scripts/benchmark_65B/3d/batch2_seq2048_flash_attn_offload.sh b/examples/language/llama/scripts/benchmark_65B/3d/batch2_seq2048_flash_attn_offload.sh deleted file mode 100644 index 80b29bd8f46b..000000000000 --- a/examples/language/llama/scripts/benchmark_65B/3d/batch2_seq2048_flash_attn_offload.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -################ -#Load your environments and modules here -################ - - -cd ../../.. 
- -# NCCL IB environment variables -export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 -export NCCL_IB_DISABLE=0 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_IB_GID_INDEX=3 -export NCCL_IB_TIMEOUT=23 -export NCCL_IB_RETRY_CNT=7 - -# 4-tp + 4-pp + 2-zero1, num microbatches=8, cpu offload is enabled -colossalai run --nproc_per_node 8 --hostfile YOUR_HOST_FILE --master_addr YOUR_MASTER_ADDR benchmark.py -c '65b' --plugin "3d_cpu" -l 2048 -g -b 16 -x --tp 4 --pp 4 --zero 1 --mbs 8 diff --git a/examples/language/llama/scripts/benchmark_65B/3d/hostfile_example.txt b/examples/language/llama/scripts/benchmark_65B/3d/hostfile_example.txt deleted file mode 100644 index 4150e1be488e..000000000000 --- a/examples/language/llama/scripts/benchmark_65B/3d/hostfile_example.txt +++ /dev/null @@ -1,4 +0,0 @@ -host1 -host2 -host3 -host4 diff --git a/examples/language/llama/scripts/benchmark_65B/gemini_auto/batch12_seq2048_flash_attn.sh b/examples/language/llama/scripts/benchmark_65B/gemini_auto/batch12_seq2048_flash_attn.sh deleted file mode 100644 index 0a3944280c41..000000000000 --- a/examples/language/llama/scripts/benchmark_65B/gemini_auto/batch12_seq2048_flash_attn.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -################ -#Load your environments and modules here -################ - - -cd ../../.. 
- -# NCCL IB environment variables -export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 -export NCCL_IB_DISABLE=0 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_IB_GID_INDEX=3 -export NCCL_IB_TIMEOUT=23 -export NCCL_IB_RETRY_CNT=7 - -colossalai run --nproc_per_node 8 --hostfile YOUR_HOST_FILE --master_addr YOUR_MASTER_ADDR benchmark.py -c '65b' --plugin "gemini" -l 2048 -g -b 12 -x diff --git a/examples/language/llama/scripts/benchmark_65B/gemini_auto/hostfile_example.txt b/examples/language/llama/scripts/benchmark_65B/gemini_auto/hostfile_example.txt deleted file mode 100644 index 4150e1be488e..000000000000 --- a/examples/language/llama/scripts/benchmark_65B/gemini_auto/hostfile_example.txt +++ /dev/null @@ -1,4 +0,0 @@ -host1 -host2 -host3 -host4 diff --git a/examples/language/llama/scripts/benchmark_65B/gemini_cuda/hostfile_example.txt b/examples/language/llama/scripts/benchmark_65B/gemini_cuda/hostfile_example.txt deleted file mode 100644 index 4150e1be488e..000000000000 --- a/examples/language/llama/scripts/benchmark_65B/gemini_cuda/hostfile_example.txt +++ /dev/null @@ -1,4 +0,0 @@ -host1 -host2 -host3 -host4 diff --git a/examples/language/llama/scripts/benchmark_65B/3d/batch2_seq2048_flash_attn.sh b/examples/language/llama/scripts/benchmark_70B/3d.sh similarity index 55% rename from examples/language/llama/scripts/benchmark_65B/3d/batch2_seq2048_flash_attn.sh rename to examples/language/llama/scripts/benchmark_70B/3d.sh index fd530b9e5655..2320601496e0 100644 --- a/examples/language/llama/scripts/benchmark_65B/3d/batch2_seq2048_flash_attn.sh +++ b/examples/language/llama/scripts/benchmark_70B/3d.sh @@ -1,11 +1,16 @@ #!/bin/bash +# TODO: fix this +echo "3D parallel for LLaMA-2 is not ready yet" +exit 1 + ################ #Load your environments and modules here ################ +HOSTFILE=$(realpath hosts.txt) -cd ../../.. +cd ../.. 
# NCCL IB environment variables export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 @@ -14,6 +19,6 @@ export NCCL_SOCKET_IFNAME=eth0 export NCCL_IB_GID_INDEX=3 export NCCL_IB_TIMEOUT=23 export NCCL_IB_RETRY_CNT=7 +export OMP_NUM_THREADS=8 -# 4-tp + 4-pp + 2-zero1, num microbatches=8 -colossalai run --nproc_per_node 8 --hostfile YOUR_HOST_FILE --master_addr YOUR_MASTER_ADDR benchmark.py -c '65b' --plugin "3d" -l 2048 -g -b 16 -x --tp 4 --pp 4 --zero 1 --mbs 8 +colossalai run --nproc_per_node 8 --hostfile $HOSTFILE benchmark.py -c 70b -p 3d -g -x -b 8 --tp 4 --pp 2 --mbs 4 diff --git a/examples/language/llama/scripts/benchmark_65B/gemini_auto/batch4_seq1024.sh b/examples/language/llama/scripts/benchmark_70B/gemini.sh similarity index 66% rename from examples/language/llama/scripts/benchmark_65B/gemini_auto/batch4_seq1024.sh rename to examples/language/llama/scripts/benchmark_70B/gemini.sh index b377db01ba88..520b7f416b9a 100644 --- a/examples/language/llama/scripts/benchmark_65B/gemini_auto/batch4_seq1024.sh +++ b/examples/language/llama/scripts/benchmark_70B/gemini.sh @@ -4,8 +4,9 @@ #Load your environments and modules here ################ +HOSTFILE=$(realpath hosts.txt) -cd ../../.. +cd ../.. 
# NCCL IB environment variables export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 @@ -14,5 +15,6 @@ export NCCL_SOCKET_IFNAME=eth0 export NCCL_IB_GID_INDEX=3 export NCCL_IB_TIMEOUT=23 export NCCL_IB_RETRY_CNT=7 +export OMP_NUM_THREADS=8 -colossalai run --nproc_per_node 8 --hostfile YOUR_HOST_FILE --master_addr YOUR_MASTER_ADDR benchmark.py -c '65b' --plugin "gemini" -l 1024 -g -b 4 +colossalai run --nproc_per_node 8 --hostfile $HOSTFILE benchmark.py -c 70b -g -x -b 2 diff --git a/examples/language/llama/scripts/benchmark_65B/gemini_cuda/batch16_seq512.sh b/examples/language/llama/scripts/benchmark_70B/gemini_auto.sh similarity index 65% rename from examples/language/llama/scripts/benchmark_65B/gemini_cuda/batch16_seq512.sh rename to examples/language/llama/scripts/benchmark_70B/gemini_auto.sh index ce15ca09f73a..ae7c85c90742 100644 --- a/examples/language/llama/scripts/benchmark_65B/gemini_cuda/batch16_seq512.sh +++ b/examples/language/llama/scripts/benchmark_70B/gemini_auto.sh @@ -4,8 +4,9 @@ #Load your environments and modules here ################ +HOSTFILE=$(realpath hosts.txt) -cd ../../.. +cd ../.. 
# NCCL IB environment variables export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 @@ -14,5 +15,6 @@ export NCCL_SOCKET_IFNAME=eth0 export NCCL_IB_GID_INDEX=3 export NCCL_IB_TIMEOUT=23 export NCCL_IB_RETRY_CNT=7 +export OMP_NUM_THREADS=8 -colossalai run --nproc_per_node 8 --hostfile YOUR_HOST_FILE --master_addr YOUR_MASTER_ADDR benchmark.py -c '65b' --plugin "gemini_cuda" -l 512 -g -b 16 +colossalai run --nproc_per_node 8 --hostfile $HOSTFILE benchmark.py -c 70b -p gemini_auto -g -x -b 2 diff --git a/examples/language/llama/scripts/benchmark_70B/hosts.txt b/examples/language/llama/scripts/benchmark_70B/hosts.txt new file mode 100644 index 000000000000..c0923c7dd37c --- /dev/null +++ b/examples/language/llama/scripts/benchmark_70B/hosts.txt @@ -0,0 +1,4 @@ +192.168.0.38 +192.168.0.39 +192.168.0.40 +192.168.0.41 diff --git a/examples/language/llama/scripts/benchmark_65B/gemini_auto/batch2_seq2048_flash_attn.sh b/examples/language/llama/scripts/benchmark_7B/gemini.sh similarity index 66% rename from examples/language/llama/scripts/benchmark_65B/gemini_auto/batch2_seq2048_flash_attn.sh rename to examples/language/llama/scripts/benchmark_7B/gemini.sh index 385589ddc488..a94c6f5e3d72 100644 --- a/examples/language/llama/scripts/benchmark_65B/gemini_auto/batch2_seq2048_flash_attn.sh +++ b/examples/language/llama/scripts/benchmark_7B/gemini.sh @@ -4,8 +4,9 @@ #Load your environments and modules here ################ +HOSTFILE=$(realpath hosts.txt) -cd ../../.. +cd ../.. 
# NCCL IB environment variables export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 @@ -14,5 +15,6 @@ export NCCL_SOCKET_IFNAME=eth0 export NCCL_IB_GID_INDEX=3 export NCCL_IB_TIMEOUT=23 export NCCL_IB_RETRY_CNT=7 +export OMP_NUM_THREADS=8 -colossalai run --nproc_per_node 8 --hostfile YOUR_HOST_FILE --master_addr YOUR_MASTER_ADDR benchmark.py -c '65b' --plugin "gemini" -l 2048 -g -b 2 -x +colossalai run --nproc_per_node 8 --hostfile $HOSTFILE benchmark.py -g -x -b 16 diff --git a/examples/language/llama/scripts/benchmark_7B/gemini_cuda/batch16_seq512.sh b/examples/language/llama/scripts/benchmark_7B/gemini_auto.sh similarity index 65% rename from examples/language/llama/scripts/benchmark_7B/gemini_cuda/batch16_seq512.sh rename to examples/language/llama/scripts/benchmark_7B/gemini_auto.sh index 04e857bbe28c..3353eb52c3be 100644 --- a/examples/language/llama/scripts/benchmark_7B/gemini_cuda/batch16_seq512.sh +++ b/examples/language/llama/scripts/benchmark_7B/gemini_auto.sh @@ -4,8 +4,9 @@ #Load your environments and modules here ################ +HOSTFILE=$(realpath hosts.txt) -cd ../../.. +cd ../.. 
# NCCL IB environment variables export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 @@ -14,5 +15,6 @@ export NCCL_SOCKET_IFNAME=eth0 export NCCL_IB_GID_INDEX=3 export NCCL_IB_TIMEOUT=23 export NCCL_IB_RETRY_CNT=7 +export OMP_NUM_THREADS=8 -colossalai run --nproc_per_node 8 --hostfile YOUR_HOST_FILE --master_addr YOUR_MASTER_ADDR benchmark.py -c '7b' --plugin "gemini_cuda" -l 512 -g -b 16 +colossalai run --nproc_per_node 8 --hostfile $HOSTFILE benchmark.py -p gemini_auto -g -x -b 16 diff --git a/examples/language/llama/scripts/benchmark_7B/gemini_auto/batch8_seq512.sh b/examples/language/llama/scripts/benchmark_7B/gemini_auto/batch8_seq512.sh deleted file mode 100644 index 8d78a659f1df..000000000000 --- a/examples/language/llama/scripts/benchmark_7B/gemini_auto/batch8_seq512.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -################ -#Load your environments and modules here -################ - -cd ../../.. - -export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 -export NCCL_IB_DISABLE=0 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_IB_GID_INDEX=3 -export NCCL_IB_TIMEOUT=23 -export NCCL_IB_RETRY_CNT=7 - -colossalai run --nproc_per_node 8 --hostfile YOUR_HOST_FILE --master_addr YOUR_MASTER_ADDR benchmark.py --plugin "gemini" -l 512 -g -b 8 diff --git a/examples/language/llama/scripts/benchmark_7B/gemini_auto/hostfile_example.txt b/examples/language/llama/scripts/benchmark_7B/gemini_auto/hostfile_example.txt deleted file mode 100644 index 4150e1be488e..000000000000 --- a/examples/language/llama/scripts/benchmark_7B/gemini_auto/hostfile_example.txt +++ /dev/null @@ -1,4 +0,0 @@ -host1 -host2 -host3 -host4 diff --git a/examples/language/llama/scripts/benchmark_7B/gemini_cuda/hostfile_example.txt b/examples/language/llama/scripts/benchmark_7B/gemini_cuda/hostfile_example.txt deleted file mode 100644 index 4150e1be488e..000000000000 --- a/examples/language/llama/scripts/benchmark_7B/gemini_cuda/hostfile_example.txt +++ /dev/null @@ -1,4 +0,0 @@ -host1 -host2 
-host3 -host4 diff --git a/examples/language/llama/scripts/benchmark_7B/hosts.txt b/examples/language/llama/scripts/benchmark_7B/hosts.txt new file mode 100644 index 000000000000..c0923c7dd37c --- /dev/null +++ b/examples/language/llama/scripts/benchmark_7B/hosts.txt @@ -0,0 +1,4 @@ +192.168.0.38 +192.168.0.39 +192.168.0.40 +192.168.0.41 From 8a1c046f64e88b3efd1333396225c8292106cb22 Mon Sep 17 00:00:00 2001 From: ver217 Date: Thu, 24 Aug 2023 17:45:48 +0800 Subject: [PATCH 08/12] [example] update requirements --- examples/language/llama/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/language/llama/requirements.txt b/examples/language/llama/requirements.txt index 30dd31681747..9853342cc5ff 100644 --- a/examples/language/llama/requirements.txt +++ b/examples/language/llama/requirements.txt @@ -1,6 +1,7 @@ colossalai>=0.3.0 datasets numpy -torch +torch>=1.12.0,<=2.0.0 tqdm transformers +flash-attn>=2.0.0,<=2.0.5 From 6c0854eccb32b114a7e653ed4202f812bbc1e5d2 Mon Sep 17 00:00:00 2001 From: ver217 Date: Fri, 25 Aug 2023 17:27:06 +0800 Subject: [PATCH 09/12] [example] update requirements --- examples/language/llama/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/language/llama/requirements.txt b/examples/language/llama/requirements.txt index 9853342cc5ff..3ddf21ffe534 100644 --- a/examples/language/llama/requirements.txt +++ b/examples/language/llama/requirements.txt @@ -5,3 +5,5 @@ torch>=1.12.0,<=2.0.0 tqdm transformers flash-attn>=2.0.0,<=2.0.5 +SentencePiece==0.1.99 +tensorboard==2.14.0 From 1c235638144f47009f825543e4f1e8b81ee35d85 Mon Sep 17 00:00:00 2001 From: ver217 Date: Mon, 28 Aug 2023 14:48:04 +0800 Subject: [PATCH 10/12] [example] rename llama to llama2 --- examples/language/{llama => llama2}/README.md | 0 examples/language/{llama => llama2}/attn.py | 0 examples/language/{llama => llama2}/benchmark.py | 0 examples/language/{llama => llama2}/data_utils.py | 0 examples/language/{llama 
=> llama2}/model_utils.py | 0 examples/language/{llama => llama2}/performance_evaluator.py | 0 examples/language/{llama => llama2}/pretrain.py | 0 examples/language/{llama => llama2}/requirements.txt | 0 examples/language/{llama => llama2}/scripts/benchmark_70B/3d.sh | 0 .../language/{llama => llama2}/scripts/benchmark_70B/gemini.sh | 0 .../{llama => llama2}/scripts/benchmark_70B/gemini_auto.sh | 0 .../language/{llama => llama2}/scripts/benchmark_70B/hosts.txt | 0 .../language/{llama => llama2}/scripts/benchmark_7B/gemini.sh | 0 .../{llama => llama2}/scripts/benchmark_7B/gemini_auto.sh | 0 .../language/{llama => llama2}/scripts/benchmark_7B/hosts.txt | 0 examples/language/{llama => llama2}/test_ci.sh | 0 16 files changed, 0 insertions(+), 0 deletions(-) rename examples/language/{llama => llama2}/README.md (100%) rename examples/language/{llama => llama2}/attn.py (100%) rename examples/language/{llama => llama2}/benchmark.py (100%) rename examples/language/{llama => llama2}/data_utils.py (100%) rename examples/language/{llama => llama2}/model_utils.py (100%) rename examples/language/{llama => llama2}/performance_evaluator.py (100%) rename examples/language/{llama => llama2}/pretrain.py (100%) rename examples/language/{llama => llama2}/requirements.txt (100%) rename examples/language/{llama => llama2}/scripts/benchmark_70B/3d.sh (100%) rename examples/language/{llama => llama2}/scripts/benchmark_70B/gemini.sh (100%) rename examples/language/{llama => llama2}/scripts/benchmark_70B/gemini_auto.sh (100%) rename examples/language/{llama => llama2}/scripts/benchmark_70B/hosts.txt (100%) rename examples/language/{llama => llama2}/scripts/benchmark_7B/gemini.sh (100%) rename examples/language/{llama => llama2}/scripts/benchmark_7B/gemini_auto.sh (100%) rename examples/language/{llama => llama2}/scripts/benchmark_7B/hosts.txt (100%) rename examples/language/{llama => llama2}/test_ci.sh (100%) diff --git a/examples/language/llama/README.md b/examples/language/llama2/README.md 
similarity index 100% rename from examples/language/llama/README.md rename to examples/language/llama2/README.md diff --git a/examples/language/llama/attn.py b/examples/language/llama2/attn.py similarity index 100% rename from examples/language/llama/attn.py rename to examples/language/llama2/attn.py diff --git a/examples/language/llama/benchmark.py b/examples/language/llama2/benchmark.py similarity index 100% rename from examples/language/llama/benchmark.py rename to examples/language/llama2/benchmark.py diff --git a/examples/language/llama/data_utils.py b/examples/language/llama2/data_utils.py similarity index 100% rename from examples/language/llama/data_utils.py rename to examples/language/llama2/data_utils.py diff --git a/examples/language/llama/model_utils.py b/examples/language/llama2/model_utils.py similarity index 100% rename from examples/language/llama/model_utils.py rename to examples/language/llama2/model_utils.py diff --git a/examples/language/llama/performance_evaluator.py b/examples/language/llama2/performance_evaluator.py similarity index 100% rename from examples/language/llama/performance_evaluator.py rename to examples/language/llama2/performance_evaluator.py diff --git a/examples/language/llama/pretrain.py b/examples/language/llama2/pretrain.py similarity index 100% rename from examples/language/llama/pretrain.py rename to examples/language/llama2/pretrain.py diff --git a/examples/language/llama/requirements.txt b/examples/language/llama2/requirements.txt similarity index 100% rename from examples/language/llama/requirements.txt rename to examples/language/llama2/requirements.txt diff --git a/examples/language/llama/scripts/benchmark_70B/3d.sh b/examples/language/llama2/scripts/benchmark_70B/3d.sh similarity index 100% rename from examples/language/llama/scripts/benchmark_70B/3d.sh rename to examples/language/llama2/scripts/benchmark_70B/3d.sh diff --git a/examples/language/llama/scripts/benchmark_70B/gemini.sh 
b/examples/language/llama2/scripts/benchmark_70B/gemini.sh similarity index 100% rename from examples/language/llama/scripts/benchmark_70B/gemini.sh rename to examples/language/llama2/scripts/benchmark_70B/gemini.sh diff --git a/examples/language/llama/scripts/benchmark_70B/gemini_auto.sh b/examples/language/llama2/scripts/benchmark_70B/gemini_auto.sh similarity index 100% rename from examples/language/llama/scripts/benchmark_70B/gemini_auto.sh rename to examples/language/llama2/scripts/benchmark_70B/gemini_auto.sh diff --git a/examples/language/llama/scripts/benchmark_70B/hosts.txt b/examples/language/llama2/scripts/benchmark_70B/hosts.txt similarity index 100% rename from examples/language/llama/scripts/benchmark_70B/hosts.txt rename to examples/language/llama2/scripts/benchmark_70B/hosts.txt diff --git a/examples/language/llama/scripts/benchmark_7B/gemini.sh b/examples/language/llama2/scripts/benchmark_7B/gemini.sh similarity index 100% rename from examples/language/llama/scripts/benchmark_7B/gemini.sh rename to examples/language/llama2/scripts/benchmark_7B/gemini.sh diff --git a/examples/language/llama/scripts/benchmark_7B/gemini_auto.sh b/examples/language/llama2/scripts/benchmark_7B/gemini_auto.sh similarity index 100% rename from examples/language/llama/scripts/benchmark_7B/gemini_auto.sh rename to examples/language/llama2/scripts/benchmark_7B/gemini_auto.sh diff --git a/examples/language/llama/scripts/benchmark_7B/hosts.txt b/examples/language/llama2/scripts/benchmark_7B/hosts.txt similarity index 100% rename from examples/language/llama/scripts/benchmark_7B/hosts.txt rename to examples/language/llama2/scripts/benchmark_7B/hosts.txt diff --git a/examples/language/llama/test_ci.sh b/examples/language/llama2/test_ci.sh similarity index 100% rename from examples/language/llama/test_ci.sh rename to examples/language/llama2/test_ci.sh From 34995d7e23e7220a34d8299f019b7e7491bfa907 Mon Sep 17 00:00:00 2001 From: ver217 Date: Mon, 28 Aug 2023 15:10:49 +0800 
Subject: [PATCH 11/12] [example] update readme and pretrain script --- examples/language/llama2/README.md | 56 +++++++++------------------- examples/language/llama2/pretrain.py | 2 +- 2 files changed, 19 insertions(+), 39 deletions(-) diff --git a/examples/language/llama2/README.md b/examples/language/llama2/README.md index 04645ebecfd8..893aa08ac38b 100644 --- a/examples/language/llama2/README.md +++ b/examples/language/llama2/README.md @@ -1,13 +1,4 @@ -# Pretraining LLaMA: best practices for building LLaMA-like base models - -

- -

- -- 65-billion-parameter large model pretraining accelerated by 38% -[[code]](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama) -[[blog]](https://www.hpc-ai.tech/blog/large-model-pretraining) - +# Pretraining LLaMA-2: best practices for building LLaMA-2-like base models ## Dataset @@ -43,16 +34,10 @@ We follow the hyperparameter settings from the original LLaMA paper. We use Adam ### 1. Installation -You should install ColossalAI of this branch from source. +Please install the latest ColossalAI from source. ```bash -git clone -b example/llama https://github.com/hpcaitech/ColossalAI.git -``` - -At the root directory of ColossalAI, run - -```bash -CUDA_EXT=1 pip install . +CUDA_EXT=1 pip install -U git+https://github.com/hpcaitech/ColossalAI ``` Then install other dependencies. @@ -61,12 +46,7 @@ Then install other dependencies. pip install -r requirements.txt ``` -If you want to use flash attention, which can accelerate training while saving memory, you should install: -```bash -pip install xformers -``` - -Additionally, we recommend you to use torch 1.13.1. We've tested our code on torch 1.13.1 and found it's compatible with our code and xformers. +Additionally, we recommend you to use torch 1.13.1. We've tested our code on torch 1.13.1 and found it's compatible with our code and flash attention. ### 2. Download the dataset @@ -77,7 +57,7 @@ The dataset can be automatically downloaded by using `huggingface/datasets`. You Yon can use colossalai run to launch multi-nodes training: ```bash colossalai run --nproc_per_node YOUR_GPU_PER_NODE --hostfile YOUR_HOST_FILE \ ---master_addr YOUR_MASTER_ADDR pretrain.py --OTHER_CONFIGURATIONS +pretrain.py --OTHER_CONFIGURATIONS ``` Here is a sample hostfile: @@ -94,7 +74,7 @@ Make sure master node can access all nodes (including itself) by ssh without pas Here is details about CLI arguments: - Model configuration: `-c`, `--config`. `7b`, `13b`, `30b` and `65b` are supported. 
-- Booster plugin: `-p`, `--plugin`. `gemini`, `gemini_cpu`, `zero2` and `zero2_cpu` are supported. For more details, please refer to [Booster plugins](https://colossalai.org/docs/basics/booster_plugins). +- Booster plugin: `-p`, `--plugin`. `gemini`, `gemini_auto`, `zero2` and `zero2_cpu` are supported. For more details, please refer to [Booster plugins](https://colossalai.org/docs/basics/booster_plugins). - Dataset path: `-d`, `--dataset`. The default dataset is `togethercomputer/RedPajama-Data-1T-Sample`. It support any dataset from `datasets` with the same data format as RedPajama. - Number of epochs: `-e`, `--num_epochs`. The default value is 1. - Local batch size: `-b`, `--batch_size`. Batch size per GPU. The default value is 2. @@ -102,26 +82,27 @@ Here is details about CLI arguments: - Weight decay: `-w`, `--weight_decay`. The default value is 0.1. - Warmup steps: `-s`, `--warmup_steps`. The default value is 2000. - Gradient checkpointing: `-g`, `--gradient_checkpoint`. The default value is `False`. This saves memory at the cost of speed. You'd better enable this option when training with a large batch size. -- Max length: `-l`, `--max_length`. The default value is 2048. +- Max length: `-l`, `--max_length`. The default value is 4096. - Mixed precision: `-x`, `--mixed_precision`. The default value is "fp16". "fp16" and "bf16" are supported. - Save interval: `-i`, `--save_interval`. The interval (steps) of saving checkpoints. The default value is 1000. - Checkpoint directory: `-o`, `--save_dir`. The directoty path to save checkpoints. The default value is `checkpoint`. - Checkpoint to load: `-f`, `--load`. The checkpoint path to load. The default value is `None`. - Gradient clipping: `--gradient_clipping`. The default value is 1.0. - Tensorboard log directory: `-t`, `--tensorboard_dir`. The directory path to save tensorboard logs. The default value is `tb_logs`. -- Flash attention: `-a`, `--flash_attention`. 
If you want to use flash attention, you must install [xformers](https://github.com/facebookresearch/xformers) first. The default value is `False`. This is helpful to accelerate training while saving memory. We recommend you always use flash attention. +- Flash attention: `-a`, `--flash_attention`. If you want to use flash attention, you must install `flash-attn`. The default value is `False`. This is helpful to accelerate training while saving memory. We recommend you always use flash attention. ### 4. Shell Script Examples -For your convenience, we provide some shell scripts to run benchmark with various gemini configurations. -You can find them in `benchmark_65B` and `benchmark_7B` directory. The main command should be in the format of: +For your convenience, we provide some shell scripts to run benchmark with various configurations. + +You can find them in `scripts/benchmark_7B` and `scripts/benchmark_70B` directory. The main command should be in the format of: ```bash colossalai run --nproc_per_node YOUR_GPU_PER_NODE --hostfile YOUR_HOST_FILE \ ---master_addr YOUR_MASTER_ADDR benchmark.py --OTHER_CONFIGURATIONS +benchmark.py --OTHER_CONFIGURATIONS ``` Here we will show an example of how to run training -llama pretraining with `gemini(gemini_auto plugin), batch_size=12, sequence_length=2048, gradient_checkpoint=True`. +llama pretraining with `gemini, batch_size=16, sequence_length=4096, gradient_checkpoint=True, flash_attn=True`. #### a. Running environment This experiment was performed on 4 computing nodes with 32 A800 GPUs in total. The nodes are @@ -129,16 +110,15 @@ connected with RDMA and GPUs within one node are fully connected with NVLink. #### b. Running command ```bash -cd examples/language/llama/benchmark_65B/gemini_auto/ -# First, modify hostfile_example.txt with your real host ip or host name. -# Second, replace the hostfile path and the master address in the shell. 
-# Third, add the system environment variables and load the running Python environment to the shell +cd scripts/benchmark_7B +# First, modify hosts.txt with your real host ip or host name. +# Then, add the system environment variables and load the running Python environment to the shell # if needed. -bash batch12_seq2048_flash_attn.sh +bash gemini.sh ``` #### c. Results If you run the above command successfully, you will get the following results: -`max memory usage: 58500.20 MB, throughput: 5.29 samples/s, TFLOPS/GPU: 176.84`. +`max memory usage: 55491.10 MB, throughput: 24.26 samples/s, TFLOPS/GPU: 167.43`. ## Reference diff --git a/examples/language/llama2/pretrain.py b/examples/language/llama2/pretrain.py index 0299f23f7ecf..b72a3019692e 100644 --- a/examples/language/llama2/pretrain.py +++ b/examples/language/llama2/pretrain.py @@ -126,7 +126,7 @@ def main(): parser.add_argument('-w', '--weigth_decay', type=float, default=0.1, help='Weight decay') parser.add_argument('-s', '--warmup_steps', type=int, default=2000, help='Warmup steps') parser.add_argument('-g', '--grad_checkpoint', action='store_true', help='Use gradient checkpointing') - parser.add_argument('-l', '--max_length', type=int, default=2048, help='Max sequence length') + parser.add_argument('-l', '--max_length', type=int, default=4096, help='Max sequence length') parser.add_argument('-x', '--mixed_precision', default='fp16', choices=['fp16', 'bf16'], help='Mixed precision') parser.add_argument('-i', '--save_interval', type=int, default=1000, help='Save interval') parser.add_argument('-o', '--save_dir', type=str, default='checkpoint', help='Checkpoint directory') From 1510e0c7429e894d949c371aafcbc21e929b5727 Mon Sep 17 00:00:00 2001 From: ver217 Date: Mon, 28 Aug 2023 15:39:14 +0800 Subject: [PATCH 12/12] [example] refactor scripts --- examples/language/llama2/README.md | 21 ++++++++++++++++--- .../llama2/scripts/benchmark_70B/3d.sh | 7 ------- .../llama2/scripts/benchmark_70B/gemini.sh | 7 ------- 
.../scripts/benchmark_70B/gemini_auto.sh | 7 ------- .../llama2/scripts/benchmark_70B/hosts.txt | 4 ---- .../llama2/scripts/benchmark_7B/gemini.sh | 7 ------- .../scripts/benchmark_7B/gemini_auto.sh | 7 ------- .../llama2/scripts/benchmark_7B/hosts.txt | 4 ---- 8 files changed, 18 insertions(+), 46 deletions(-) delete mode 100644 examples/language/llama2/scripts/benchmark_70B/hosts.txt delete mode 100644 examples/language/llama2/scripts/benchmark_7B/hosts.txt diff --git a/examples/language/llama2/README.md b/examples/language/llama2/README.md index 893aa08ac38b..b64b5d29ecb8 100644 --- a/examples/language/llama2/README.md +++ b/examples/language/llama2/README.md @@ -109,11 +109,26 @@ This experiment was performed on 4 computing nodes with 32 A800 GPUs in total. T connected with RDMA and GPUs within one node are fully connected with NVLink. #### b. Running command + ```bash cd scripts/benchmark_7B -# First, modify hosts.txt with your real host ip or host name. -# Then, add the system environment variables and load the running Python environment to the shell -# if needed. +``` + +First, put your host file (`hosts.txt`) in this directory with your real host ip or host name. + +Here is a sample `hosts.txt`: +```text +hostname1 +hostname2 +hostname3 +hostname4 +``` + +Then add environment variables to script if needed. + +Finally, run the following command to start training: + +```bash bash gemini.sh ``` #### c. Results diff --git a/examples/language/llama2/scripts/benchmark_70B/3d.sh b/examples/language/llama2/scripts/benchmark_70B/3d.sh index 2320601496e0..d50c57042d1a 100644 --- a/examples/language/llama2/scripts/benchmark_70B/3d.sh +++ b/examples/language/llama2/scripts/benchmark_70B/3d.sh @@ -12,13 +12,6 @@ HOSTFILE=$(realpath hosts.txt) cd ../.. 
-# NCCL IB environment variables -export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 -export NCCL_IB_DISABLE=0 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_IB_GID_INDEX=3 -export NCCL_IB_TIMEOUT=23 -export NCCL_IB_RETRY_CNT=7 export OMP_NUM_THREADS=8 colossalai run --nproc_per_node 8 --hostfile $HOSTFILE benchmark.py -c 70b -p 3d -g -x -b 8 --tp 4 --pp 2 --mbs 4 diff --git a/examples/language/llama2/scripts/benchmark_70B/gemini.sh b/examples/language/llama2/scripts/benchmark_70B/gemini.sh index 520b7f416b9a..c80d4d9f25bf 100644 --- a/examples/language/llama2/scripts/benchmark_70B/gemini.sh +++ b/examples/language/llama2/scripts/benchmark_70B/gemini.sh @@ -8,13 +8,6 @@ HOSTFILE=$(realpath hosts.txt) cd ../.. -# NCCL IB environment variables -export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 -export NCCL_IB_DISABLE=0 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_IB_GID_INDEX=3 -export NCCL_IB_TIMEOUT=23 -export NCCL_IB_RETRY_CNT=7 export OMP_NUM_THREADS=8 colossalai run --nproc_per_node 8 --hostfile $HOSTFILE benchmark.py -c 70b -g -x -b 2 diff --git a/examples/language/llama2/scripts/benchmark_70B/gemini_auto.sh b/examples/language/llama2/scripts/benchmark_70B/gemini_auto.sh index ae7c85c90742..ce3b2f2170cc 100644 --- a/examples/language/llama2/scripts/benchmark_70B/gemini_auto.sh +++ b/examples/language/llama2/scripts/benchmark_70B/gemini_auto.sh @@ -8,13 +8,6 @@ HOSTFILE=$(realpath hosts.txt) cd ../.. 
-# NCCL IB environment variables -export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 -export NCCL_IB_DISABLE=0 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_IB_GID_INDEX=3 -export NCCL_IB_TIMEOUT=23 -export NCCL_IB_RETRY_CNT=7 export OMP_NUM_THREADS=8 colossalai run --nproc_per_node 8 --hostfile $HOSTFILE benchmark.py -c 70b -p gemini_auto -g -x -b 2 diff --git a/examples/language/llama2/scripts/benchmark_70B/hosts.txt b/examples/language/llama2/scripts/benchmark_70B/hosts.txt deleted file mode 100644 index c0923c7dd37c..000000000000 --- a/examples/language/llama2/scripts/benchmark_70B/hosts.txt +++ /dev/null @@ -1,4 +0,0 @@ -192.168.0.38 -192.168.0.39 -192.168.0.40 -192.168.0.41 diff --git a/examples/language/llama2/scripts/benchmark_7B/gemini.sh b/examples/language/llama2/scripts/benchmark_7B/gemini.sh index a94c6f5e3d72..db4968a8df7f 100644 --- a/examples/language/llama2/scripts/benchmark_7B/gemini.sh +++ b/examples/language/llama2/scripts/benchmark_7B/gemini.sh @@ -8,13 +8,6 @@ HOSTFILE=$(realpath hosts.txt) cd ../.. -# NCCL IB environment variables -export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 -export NCCL_IB_DISABLE=0 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_IB_GID_INDEX=3 -export NCCL_IB_TIMEOUT=23 -export NCCL_IB_RETRY_CNT=7 export OMP_NUM_THREADS=8 colossalai run --nproc_per_node 8 --hostfile $HOSTFILE benchmark.py -g -x -b 16 diff --git a/examples/language/llama2/scripts/benchmark_7B/gemini_auto.sh b/examples/language/llama2/scripts/benchmark_7B/gemini_auto.sh index 3353eb52c3be..59ec1c1a75c2 100644 --- a/examples/language/llama2/scripts/benchmark_7B/gemini_auto.sh +++ b/examples/language/llama2/scripts/benchmark_7B/gemini_auto.sh @@ -8,13 +8,6 @@ HOSTFILE=$(realpath hosts.txt) cd ../.. 
-# NCCL IB environment variables -export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1 -export NCCL_IB_DISABLE=0 -export NCCL_SOCKET_IFNAME=eth0 -export NCCL_IB_GID_INDEX=3 -export NCCL_IB_TIMEOUT=23 -export NCCL_IB_RETRY_CNT=7 export OMP_NUM_THREADS=8 colossalai run --nproc_per_node 8 --hostfile $HOSTFILE benchmark.py -p gemini_auto -g -x -b 16 diff --git a/examples/language/llama2/scripts/benchmark_7B/hosts.txt b/examples/language/llama2/scripts/benchmark_7B/hosts.txt deleted file mode 100644 index c0923c7dd37c..000000000000 --- a/examples/language/llama2/scripts/benchmark_7B/hosts.txt +++ /dev/null @@ -1,4 +0,0 @@ -192.168.0.38 -192.168.0.39 -192.168.0.40 -192.168.0.41