diff --git a/colossalai/kernel/cuda_native/scaled_softmax.py b/colossalai/kernel/cuda_native/scaled_softmax.py
index 3f0260aaed87..44d750c5cbde 100644
--- a/colossalai/kernel/cuda_native/scaled_softmax.py
+++ b/colossalai/kernel/cuda_native/scaled_softmax.py
@@ -114,6 +114,13 @@ def __init__(
         self.softmax_in_fp32 = softmax_in_fp32
         self.scale = scale

+        try:
+            from colossalai._C import scaled_masked_softmax
+        except ImportError:
+            from colossalai.kernel.op_builder.scaled_masked_softmax import ScaledMaskedSoftmaxBuilder
+            scaled_masked_softmax = ScaledMaskedSoftmaxBuilder().load()
+        self.scaled_masked_softmax = scaled_masked_softmax
+
         assert (self.scale is None or softmax_in_fp32), "softmax should be in fp32 when scaled"

     def forward(self, input, mask):
@@ -178,11 +185,5 @@ def forward_torch_softmax(self, input, mask):

         return probs

-    @staticmethod
-    def get_batch_per_block(sq, sk, b, np):
-        try:
-            import colossalai._C.scaled_masked_softmax
-        except ImportError:
-            raise RuntimeError('ScaledMaskedSoftmax requires cuda extensions')
-
-        return colossalai._C.scaled_masked_softmax.get_batch_per_block(sq, sk, b, np)
+    def get_batch_per_block(self, sq, sk, b, np):
+        return self.scaled_masked_softmax.get_batch_per_block(sq, sk, b, np)
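The `__init__` hunk above resolves the fused kernel once and caches it on the instance: it first tries the extension compiled at install time (`colossalai._C`) and otherwise builds the kernel just in time through the op builder. Factored into a standalone helper, the same load-or-build pattern looks roughly like the sketch below; it reuses only the two import paths that appear in the diff, and the `None` fallback is an added assumption for environments where neither build is available.

```python
def load_scaled_masked_softmax_kernel():
    """Prefer the ahead-of-time compiled extension, fall back to a JIT build."""
    try:
        # Extension compiled when ColossalAI was installed with CUDA kernels enabled.
        from colossalai._C import scaled_masked_softmax
        return scaled_masked_softmax
    except ImportError:
        pass
    try:
        # Otherwise compile and load the kernel on first use via the op builder.
        from colossalai.kernel.op_builder.scaled_masked_softmax import ScaledMaskedSoftmaxBuilder
        return ScaledMaskedSoftmaxBuilder().load()
    except ImportError:
        return None  # caller can then fall back to the plain torch softmax path
```

Because the loaded module now lives on `self.scaled_masked_softmax`, the second hunk can drop `@staticmethod` from `get_batch_per_block` and simply delegate to the cached kernel.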
diff --git a/examples/tutorial/large_batch_optimizer/README.md b/examples/tutorial/large_batch_optimizer/README.md
index d85afa427518..1a17c2d8740f 100644
--- a/examples/tutorial/large_batch_optimizer/README.md
+++ b/examples/tutorial/large_batch_optimizer/README.md
@@ -1,9 +1,11 @@
-# Comparison of Large Batch Training Optimization
+# Large Batch Training Optimization

 ## Table of contents

-- [Overview](#-overview)
-- [Quick Start](#-quick-start)
+- [Large Batch Training Optimization](#large-batch-training-optimization)
+  - [Table of contents](#table-of-contents)
+  - [📚 Overview](#-overview)
+  - [🚀 Quick Start](#-quick-start)

 ## 📚 Overview
diff --git a/examples/tutorial/sequence_parallel/README.md b/examples/tutorial/sequence_parallel/README.md
index 7058f53db8b6..1b7c60e22861 100644
--- a/examples/tutorial/sequence_parallel/README.md
+++ b/examples/tutorial/sequence_parallel/README.md
@@ -1,139 +1,56 @@
-# Sequence Parallelism with BERT
+# Sequence Parallelism

-In this example, we implemented BERT with sequence parallelism. Sequence parallelism splits the input tensor and intermediate
-activation along the sequence dimension. This method can achieve better memory efficiency and allows us to train with larger batch size and longer sequence length.
+## Table of contents

-Paper: [Sequence Parallelism: Long Sequence Training from System Perspective](https://arxiv.org/abs/2105.13120)
+- [Sequence Parallelism](#sequence-parallelism)
+  - [Table of contents](#table-of-contents)
+  - [📚 Overview](#-overview)
+  - [🚀 Quick Start](#-quick-start)
+  - [🏎 How to Train with Sequence Parallelism](#-how-to-train-with-sequence-parallelism)
+    - [Step 1. Configure your parameters](#step-1-configure-your-parameters)
+    - [Step 2. Invoke parallel training](#step-2-invoke-parallel-training)

-## 🚀Quick Start
-1. Run with the following command
-```bash
-export PYTHONPATH=$PWD
-colossalai run --nproc_per_node 4 train.py -s
-```
-2. The default config is sequence parallel size = 2, pipeline size = 1, let's change pipeline size to be 2 and try it again.
-
-
-## How to Prepare WikiPedia Dataset
-
-First, let's prepare the WikiPedia dataset from scratch. To generate a preprocessed dataset, we need four items:
-1. raw WikiPedia dataset
-2. wikipedia extractor (extract data from the raw dataset)
-3. vocabulary file
-4. preprocessing scripts (generate final data from extracted data)
-
-For the preprocessing script, we thank Megatron-LM for providing a preprocessing script to generate the corpus file.
-
-```python
-# download raw data
-mkdir data && cd ./data
-wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
-
-# install wiki extractor
-git clone https://github.com/FrankLeeeee/wikiextractor.git
-pip install ./wikiextractor
-
-# extractmodule
-wikiextractor --json enwiki-latest-pages-articles.xml.bz2
-cat text/*/* > ./corpus.json
-cd ..
-
-# download vocab file
-mkdir vocab && cd ./vocab
-wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
-cd ..
-
-# preprocess some data
-git clone https://github.com/NVIDIA/Megatron-LM.git
-cd ./Megatron-LM
-python tools/preprocess_data.py \
-    --input ../data/corpus.json \
-    --output-prefix my-bert \
-    --vocab ../vocab/bert-large-uncased-vocab.txt \
-    --dataset-impl mmap \
-    --tokenizer-type BertWordPieceLowerCase \
-    --split-sentences \
-    --workers 24
-```

+## 📚 Overview

-After running the preprocessing scripts, you will obtain two files:
-1. my-bert_text_sentence.bin
-2. my-bert_text_sentence.idx
-
-If you happen to encouter `index out of range` problem when running Megatron's script,
-this is probably because that a sentence starts with a punctuation and cannot be tokenized. A work-around is to update `Encoder.encode` method with the code below:
-
-```python
-class Encoder(object):
-    def __init__(self, args):
-        ...
-
-    def initializer(self):
-        ...
-
-    def encode(self, json_line):
-        data = json.loads(json_line)
-        ids = {}
-        for key in self.args.json_keys:
-            text = data[key]
-            doc_ids = []
-
-            # lsg: avoid sentences which start with a punctuation
-            # as it cannot be tokenized by splitter
-            if len(text) > 0 and text[0] in string.punctuation:
-                text = text[1:]
-
-            for sentence in Encoder.splitter.tokenize(text):
-                sentence_ids = Encoder.tokenizer.tokenize(sentence)
-                if len(sentence_ids) > 0:
-                    doc_ids.append(sentence_ids)
-            if len(doc_ids) > 0 and self.args.append_eod:
-                doc_ids[-1].append(Encoder.tokenizer.eod)
-            ids[key] = doc_ids
-        return ids, len(json_line)
-```
+In this tutorial, we implemented BERT with sequence parallelism. Sequence parallelism splits the input tensor and intermediate
+activation along the sequence dimension. This method can achieve better memory efficiency and allows us to train with larger batch size and longer sequence length.

-## How to Train with Sequence Parallelism
+Paper: [Sequence Parallelism: Long Sequence Training from System Perspective](https://arxiv.org/abs/2105.13120)

-We provided `train.py` for you to execute training. Before invoking the script, there are several
-steps to perform.
+## 🚀 Quick Start

-### Step 1. Set data path and vocab path
+1. Install PyTorch

-At the top of `config.py`, you can see two global variables `DATA_PATH` and `VOCAB_FILE_PATH`.
+2. Install the dependencies.

-```python
-DATA_PATH =
-VOCAB_FILE_PATH =
+```bash
+pip install -r requirements.txt
 ```

-`DATA_PATH` refers to the path to the data file generated by Megatron's script. For example, in the section above, you should get two data files (my-bert_text_sentence.bin and my-bert_text_sentence.idx). You just need to `DATA_PATH` to the path to the bin file without the file extension.
+3. Run with the following command

-For example, if your my-bert_text_sentence.bin is /home/Megatron-LM/my-bert_text_sentence.bin, then you should set
+```bash
+export PYTHONPATH=$PWD

-```python
-DATA_PATH = '/home/Megatron-LM/my-bert_text_sentence'
+# run with synthetic dataset
+colossalai run --nproc_per_node 4 train.py
 ```

-The `VOCAB_FILE_PATH` refers to the path to the vocabulary downloaded when you prepare the dataset
-(e.g. bert-large-uncased-vocab.txt).
+> The default config is sequence parallel size = 2, pipeline size = 1; let's change pipeline size to be 2 and try it again.

-### Step 3. Make Dataset Helper
-Build BERT dataset helper. Requirements are `CUDA`, `g++`, `pybind11` and `make`.
+## 🏎 How to Train with Sequence Parallelism

-```python
-cd ./data/datasets
-make
-```
+We provided `train.py` for you to execute training. Before invoking the script, there are several
+steps to perform.

-### Step 3. Configure your parameters
+### Step 1. Configure your parameters

 In the `config.py` provided, a set of parameters are defined including training scheme, model, etc.
 You can also modify the ColossalAI setting. For example, if you wish to parallelize over the
 sequence dimension on 8 GPUs. You can change `size=4` to `size=8`. If you wish to use pipeline parallelism, you can set `pipeline=`.

-### Step 4. Invoke parallel training
+### Step 2. Invoke parallel training

 Lastly, you can start training with sequence parallelism. How you invoke `train.py` depends on your machine setting.
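Step 1 above points at the `parallel` setting in `config.py`; the relevant lines are not part of the hunks below. As a rough illustration only, assuming the usual ColossalAI `parallel` dict convention for sequence parallelism, parallelizing the sequence dimension over 8 GPUs might be expressed along these lines, with the shipped `config.py` remaining the authoritative reference:

```python
# Hypothetical sketch of the parallel settings referenced in Step 1; take the
# exact keys and values from the example's own config.py.
parallel = dict(
    pipeline=1,                            # set to 2 for two pipeline stages (doubles the GPUs required)
    tensor=dict(size=8, mode='sequence'),  # split the sequence dimension across 8 GPUs
)
```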
diff --git a/examples/tutorial/sequence_parallel/config.py b/examples/tutorial/sequence_parallel/config.py
index df0c5282f032..6edf9cc2c7e5 100644
--- a/examples/tutorial/sequence_parallel/config.py
+++ b/examples/tutorial/sequence_parallel/config.py
@@ -1,11 +1,8 @@
 from colossalai.amp import AMP_TYPE

-DATA_PATH = ''
-VOCAB_FILE_PATH = ''
-
 # hyper-parameters
-TRAIN_ITERS = 1000000
-DECAY_ITERS = 990000
+TRAIN_ITERS = 10
+DECAY_ITERS = 4
 WARMUP_FRACTION = 0.01
 GLOBAL_BATCH_SIZE = 32    # dp world size * sentences per GPU
 EVAL_ITERS = 10
@@ -13,12 +10,12 @@
 LR = 0.0001
 MIN_LR = 1e-05
 WEIGHT_DECAY = 0.01
-SEQ_LENGTH = 512
+SEQ_LENGTH = 128

 # BERT config
-DEPTH = 12
-NUM_ATTENTION_HEADS = 12
-HIDDEN_SIZE = 768
+DEPTH = 4
+NUM_ATTENTION_HEADS = 4
+HIDDEN_SIZE = 128

 # model config
 ADD_BINARY_HEAD = False
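The `GLOBAL_BATCH_SIZE` comment above counts sentences across all data-parallel ranks, and `train.py` further down divides it by the data-parallel world size to get the per-GPU batch size. Assuming the data-parallel size is the total GPU count divided by the product of the sequence-parallel and pipeline sizes, the quick-start defaults work out as follows (a worked example, not output from the script):

```python
# Quick-start defaults: 4 GPUs, sequence parallel size 2, pipeline size 1.
world_size = 4
sequence_parallel_size = 2
pipeline_size = 1

data_parallel_size = world_size // (sequence_parallel_size * pipeline_size)  # -> 2
global_batch_size = 32                                                        # from config.py
batch_size_per_gpu = global_batch_size // data_parallel_size                  # -> 16 sentences per rank
print(data_parallel_size, batch_size_per_gpu)
```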
diff --git a/examples/tutorial/sequence_parallel/requirements.txt b/examples/tutorial/sequence_parallel/requirements.txt
index 137a69e80498..b49a94554afb 100644
--- a/examples/tutorial/sequence_parallel/requirements.txt
+++ b/examples/tutorial/sequence_parallel/requirements.txt
@@ -1,2 +1,2 @@
-colossalai >= 0.1.12
-torch >= 1.8.1
+colossalai
+torch
diff --git a/examples/tutorial/sequence_parallel/test_ci.sh b/examples/tutorial/sequence_parallel/test_ci.sh
new file mode 100644
index 000000000000..7bc20de3b6e4
--- /dev/null
+++ b/examples/tutorial/sequence_parallel/test_ci.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+set -euxo pipefail
+
+pip install -r requirements.txt
+
+# run test
+colossalai run --nproc_per_node 4 train.py
diff --git a/examples/tutorial/sequence_parallel/train.py b/examples/tutorial/sequence_parallel/train.py
index b92061000d10..a89747b5845e 100644
--- a/examples/tutorial/sequence_parallel/train.py
+++ b/examples/tutorial/sequence_parallel/train.py
@@ -1,9 +1,8 @@
 import argparse

 import torch
-from data import build_train_valid_test_data_iterators
 from data.bert_helper import SequenceParallelDataIterator, get_batch_for_sequence_parallel
-from data.tokenizer import get_padded_vocab_size, initialize_tokenizer
+from data.dummy_dataloader import DummyDataloader
 from loss_func.bert_loss import BertLoss
 from lr_scheduler import AnnealingLR
 from model.bert import BertForPretrain, build_pipeline_bert
@@ -36,7 +35,7 @@ def parse_args():


 def pipeline_data_process_func(stage_output, micro_batch_data):
-    tokens, types, sentence_order, loss_mask, lm_labels, padding_mask = micro_batch_data 
+    tokens, types, sentence_order, loss_mask, lm_labels, padding_mask = micro_batch_data
     if gpc.is_first_rank(ParallelMode.PIPELINE):
         data = (tokens, padding_mask, types, lm_labels)
         label = (loss_mask, sentence_order)
@@ -53,36 +52,15 @@ def main():

     logger = get_dist_logger()

-    # build dataloader
-    if not args.synthetic:
-        initialize_tokenizer(gpc.config.VOCAB_FILE_PATH, tokenizer_type='BertWordPieceLowerCase')
-        VOCAB_SIZE = get_padded_vocab_size()
-        trainloader, validloader, testloader = build_train_valid_test_data_iterators(
-            train_iters=gpc.config.TRAIN_ITERS,
-            global_batch_size=gpc.config.GLOBAL_BATCH_SIZE,
-            eval_interval=gpc.config.EVAL_INTERVAL,
-            eval_iters=gpc.config.EVAL_ITERS,
-            data_prefix=[gpc.config.DATA_PATH],
-            data_impl='mmap',
-            splits_string='949,50,1',
-            max_seq_length=gpc.config.SEQ_LENGTH,
-            masked_lm_prob=0.15,
-            short_seq_prob=0.1,
-            seed=1234,
-            skip_warmup=True,
-            binary_head=False,
-        )
-    else:
-        from data.dummy_dataloader import DummyDataloader
-
-        BATCH_SIZE_PER_GPUS = gpc.config.GLOBAL_BATCH_SIZE // gpc.get_world_size(ParallelMode.DATA)
-        VOCAB_SIZE = 30528
-        trainloader = DummyDataloader(batch_size=BATCH_SIZE_PER_GPUS,
-                                      vocab_size=VOCAB_SIZE,
-                                      seq_length=gpc.config.SEQ_LENGTH)
-        validloader = DummyDataloader(batch_size=BATCH_SIZE_PER_GPUS,
-                                      vocab_size=VOCAB_SIZE,
-                                      seq_length=gpc.config.SEQ_LENGTH)
+    # build synthetic dataloader
+    BATCH_SIZE_PER_GPUS = gpc.config.GLOBAL_BATCH_SIZE // gpc.get_world_size(ParallelMode.DATA)
+    VOCAB_SIZE = 30528
+    trainloader = DummyDataloader(batch_size=BATCH_SIZE_PER_GPUS,
+                                  vocab_size=VOCAB_SIZE,
+                                  seq_length=gpc.config.SEQ_LENGTH)
+    validloader = DummyDataloader(batch_size=BATCH_SIZE_PER_GPUS,
+                                  vocab_size=VOCAB_SIZE,
+                                  seq_length=gpc.config.SEQ_LENGTH)

     logger.info("Dataloaders are built", ranks=[0])
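`DummyDataloader` itself is not part of this diff (it lives in `data/dummy_dataloader.py`). For orientation, a synthetic loader compatible with the constructor arguments used above and with the six-tuple unpacked in `pipeline_data_process_func` might look roughly like the sketch below; the field shapes and dtypes are illustrative assumptions, not the example's actual implementation.

```python
import torch


class SyntheticBertDataloader:
    """Illustrative stand-in for a synthetic BERT pretraining dataloader."""

    def __init__(self, batch_size, vocab_size, seq_length):
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.seq_length = seq_length

    def __iter__(self):
        return self

    def __next__(self):
        # Random token ids stand in for tokenized text; masks and labels are trivial.
        tokens = torch.randint(0, self.vocab_size, (self.batch_size, self.seq_length))
        types = torch.zeros_like(tokens)
        sentence_order = torch.zeros(self.batch_size, dtype=torch.long)
        loss_mask = torch.ones(self.batch_size, self.seq_length)
        lm_labels = torch.randint(0, self.vocab_size, (self.batch_size, self.seq_length))
        padding_mask = torch.ones(self.batch_size, self.seq_length, dtype=torch.long)
        return tokens, types, sentence_order, loss_mask, lm_labels, padding_mask
```

Driving training from random tokens like this keeps the tutorial self-contained: the Megatron-style WikiPedia preprocessing removed from the README is no longer a prerequisite for running the example.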