Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions examples/language/bert/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
## Overview

This directory includes two parts: using the Booster API to finetune Hugging Face Bert and AlBert models, and benchmarking Bert and AlBert models with different Booster plugins.

## Finetune
```
bash test_ci.sh
```

## Benchmark
```
bash benchmark.sh
```

The benchmark currently reports these metrics: peak CUDA memory usage, throughput, and the number of model parameters. If you need custom metrics, you can add them to `benchmark_utils.py`.

## Results

### Bert

| | max cuda mem | throughput(sample/s) | params |
| :-----| -----------: | :--------: | :----: |
| ddp | 21.44 GB | 3.0 | 82M |
| ddp_fp16 | 16.26 GB | 11.3 | 82M |
| gemini | 11.0 GB | 12.9 | 82M |
| low_level_zero | 11.29 GB | 14.7 | 82M |

### AlBert
| | max cuda mem | throughput(sample/s) | params |
| :-----| -----------: | :--------: | :----: |
| ddp | OOM | | |
| ddp_fp16 | OOM | | |
| gemini | 69.39 GB | 1.3 | 208M |
| low_level_zero | 56.89 GB | 1.4 | 208M |
174 changes: 174 additions & 0 deletions examples/language/bert/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
import argparse

import torch
from benchmark_utils import benchmark
from torch.utils.data import DataLoader, Dataset
from transformers import (
AlbertConfig,
AlbertForSequenceClassification,
BertConfig,
BertForSequenceClassification,
get_linear_schedule_with_warmup,
)

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import HybridAdam

# ==============================
# Prepare Hyperparameters
# ==============================
# Number of passes over the synthetic dataset; also used to size the LR schedule.
NUM_EPOCHS = 3
# Batch size handed to the DataLoader.
BATCH_SIZE = 32
# Base learning rate; main() multiplies it by the world size.
LEARNING_RATE = 2.4e-5
# Weight decay for every parameter except biases and LayerNorm weights.
WEIGHT_DECAY = 0.01
# Fraction of the total training steps used for linear LR warm-up.
WARMUP_FRACTION = 0.1
# Token length of each randomly generated sample.
SEQ_LEN = 512
# Exclusive upper bound of the random token ids; also the model's vocabulary size.
VOCAB_SIZE = 1000
# Number of classes for the random labels and the classification head.
NUM_LABELS = 10
# Number of samples in the synthetic dataset.
DATASET_LEN = 1000


class RandintDataset(Dataset):
    """In-memory dataset of uniformly random token ids paired with random class labels.

    All samples are generated once at construction time, so indexing is a plain
    tensor lookup.
    """

    def __init__(self, dataset_length: int, sequence_length: int, vocab_size: int, n_class: int):
        """Generate ``dataset_length`` random samples.

        Args:
            dataset_length: number of samples to generate.
            sequence_length: number of token ids per sample.
            vocab_size: exclusive upper bound of the generated token ids.
            n_class: exclusive upper bound of the generated labels.
        """
        self._sequence_length = sequence_length
        self._vocab_size = vocab_size
        self._n_class = n_class
        self._dataset_length = dataset_length

        # Token ids are drawn before labels so the RNG stream is consumed in a fixed order.
        token_shape = (dataset_length, sequence_length)
        label_shape = (dataset_length, 1)
        self._datas = torch.randint(0, vocab_size, token_shape, dtype=torch.long)
        self._labels = torch.randint(0, n_class, label_shape, dtype=torch.long)

    def __len__(self):
        """Return the number of generated samples."""
        return self._dataset_length

    def __getitem__(self, idx):
        """Return the ``(token_ids, label)`` pair stored at ``idx``."""
        tokens = self._datas[idx]
        label = self._labels[idx]
        return tokens, label


def main():
    """Benchmark a randomly initialised Bert/AlBert classifier under one Booster plugin.

    Parses the command line, launches the distributed environment, builds the
    requested plugin and booster, creates a synthetic dataset, model, optimizer
    and LR scheduler, boosts them, runs ``benchmark`` and prints the results on
    the master rank.

    Raises:
        ValueError: if the plugin name is not one of the supported plugins.
        RuntimeError: if the model type is neither ``bert`` nor ``albert``.
    """
    # ==============================
    # Parse Arguments
    # ==============================
    parser = argparse.ArgumentParser()
    # NOTE: --task is accepted for CLI compatibility but is not read below;
    # the benchmark always runs on synthetic data.
    parser.add_argument('-t', '--task', default='mrpc', help="GLUE task to run")
    parser.add_argument('-p',
                        '--plugin',
                        type=str,
                        default='torch_ddp',
                        choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'],
                        help="plugin to use")
    # Restricting the choices makes a typo fail at parse time instead of after
    # the distributed environment has already been launched.
    parser.add_argument(
        "--model_type",
        type=str,
        default="bert",
        choices=["bert", "albert"],
        help="bert or albert",
    )

    args = parser.parse_args()

    # ==============================
    # Launch Distributed Environment
    # ==============================
    colossalai.launch_from_torch(config={}, seed=42)
    coordinator = DistCoordinator()

    # Scale the base learning rate by the number of participating processes.
    lr = LEARNING_RATE * coordinator.world_size

    # ==============================
    # Instantiate Plugin and Booster
    # ==============================
    booster_kwargs = {}
    if args.plugin == 'torch_ddp_fp16':
        booster_kwargs['mixed_precision'] = 'fp16'
    if args.plugin.startswith('torch_ddp'):
        plugin = TorchDDPPlugin()
    elif args.plugin == 'gemini':
        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2**5)
    elif args.plugin == 'low_level_zero':
        plugin = LowLevelZeroPlugin(initial_scale=2**5)
    else:
        # Unreachable through the CLI (argparse enforces ``choices``); guards
        # against ``plugin`` being unbound if the choices list is ever extended.
        raise ValueError(f"Unsupported plugin: {args.plugin!r}")

    booster = Booster(plugin=plugin, **booster_kwargs)

    # ==============================
    # Prepare Dataloader
    # ==============================
    train_dataset = RandintDataset(dataset_length=DATASET_LEN,
                                   sequence_length=SEQ_LEN,
                                   vocab_size=VOCAB_SIZE,
                                   n_class=NUM_LABELS)
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)

    # ====================================
    # Prepare model, optimizer
    # ====================================
    # Models are built from a config only, i.e. with randomly initialised weights.
    if args.model_type == "bert":
        cfg = BertConfig(vocab_size=VOCAB_SIZE, num_labels=NUM_LABELS)
        model = BertForSequenceClassification(cfg)
    elif args.model_type == "albert":
        cfg = AlbertConfig(vocab_size=VOCAB_SIZE, num_labels=NUM_LABELS)
        model = AlbertForSequenceClassification(cfg)
    else:
        raise RuntimeError(f"Unsupported model type: {args.model_type!r} (expected 'bert' or 'albert')")

    # optimizer: biases and LayerNorm weights are excluded from weight decay
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": WEIGHT_DECAY,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    optimizer = HybridAdam(optimizer_grouped_parameters, lr=lr, eps=1e-8)

    # lr scheduler
    total_steps = len(train_dataloader) * NUM_EPOCHS
    num_warmup_steps = int(WARMUP_FRACTION * total_steps)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=total_steps,
    )

    # criterion: the model is called with labels, and the first element of its
    # output is used as the loss.
    def criterion(outputs):
        return outputs[0]

    # ==============================
    # Boost with ColossalAI
    # ==============================
    model, optimizer, _, _, lr_scheduler = booster.boost(model, optimizer, lr_scheduler=lr_scheduler)

    # ==============================
    # Benchmark model
    # ==============================
    # Forward BATCH_SIZE explicitly so the result labels always match the
    # DataLoader rather than relying on benchmark()'s default being equal.
    results = benchmark(model,
                        booster,
                        optimizer,
                        lr_scheduler,
                        train_dataloader,
                        criterion=criterion,
                        epoch_num=NUM_EPOCHS,
                        batch_size=BATCH_SIZE)

    coordinator.print_on_master(results)


# Run the benchmark only when executed as a script, not on import.
if __name__ == '__main__':
    main()
9 changes: 9 additions & 0 deletions examples/language/bert/benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Benchmark Bert and AlBert under every supported Booster plugin.
set -xe

pip install -r requirements.txt

for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do
    for model_type in "bert" "albert"; do
        # Some plugin/model combinations can fail (e.g. run out of GPU memory).
        # With `set -e` an unguarded failure would abort the whole sweep, so
        # report the failure and continue with the remaining combinations.
        torchrun --standalone --nproc_per_node 2 benchmark.py --plugin "$plugin" --model_type "$model_type" \
            || echo "benchmark failed: plugin=${plugin} model_type=${model_type}" >&2
    done
done
146 changes: 146 additions & 0 deletions examples/language/bert/benchmark_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import inspect
from logging import getLogger
from time import time
from typing import Callable

import torch
import yaml
from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
from torch.utils.data import DataLoader
from tqdm import tqdm

from colossalai.booster import Booster
from colossalai.cluster import DistCoordinator

# Module-level logger used to report the benchmark measurements.
logger = getLogger("colossalai-booster-benchmark")
# Sentinel for "measurement unavailable"; NaN is the only float that is not
# equal to itself, which is what _is_valid checks for.
_INVALID = float("nan")


def format_num(num: int, bytes=False):
    """Scale a number to a human-readable string.

    Examples: ``format_num(1253656, bytes=True)`` gives ``'1.20 MB'`` and
    ``format_num(82_000_000)`` gives ``'82.00 M'``.

    Args:
        num: the value to format.
        bytes: if True, scale by 1024 and append ``"B"``; otherwise scale by 1000.

    Returns:
        The value with two decimals followed by its unit. Values beyond the
        largest unit are still expressed in that unit (``" P"``) instead of
        falling through and returning ``None``.
    """
    factor = 1024 if bytes else 1000
    suffix = "B" if bytes else ""
    units = ["", " K", " M", " G", " T", " P"]
    for unit in units[:-1]:
        if num < factor:
            return f"{num:.2f}{unit}{suffix}"
        num /= factor
    # Largest unit: always return a string, however large the value is.
    return f"{num:.2f}{units[-1]}{suffix}"


def _is_valid(val):
    """Return the result of comparing ``val`` with itself.

    The comparison is false for NaN, which is how the ``_INVALID`` sentinel is
    told apart from a real measurement.
    """
    equals_itself = val == val
    return equals_itself


def get_call_arg_names(module_or_fn):
    """Return the positional argument names of a callable.

    For a ``torch.nn.Module`` the names are taken from its ``forward`` method
    with the leading ``self`` dropped; for anything else they are taken from
    the callable itself.
    """
    if not isinstance(module_or_fn, torch.nn.Module):
        return inspect.getfullargspec(module_or_fn).args

    forward_args = inspect.getfullargspec(module_or_fn.forward).args
    # getfullargspec reports the bound ``self`` as the first argument; skip it.
    return forward_args[1:]


def measure_params(model):
    """Count the elements of all parameters of ``model`` that require gradients.

    Returns:
        The total element count, or ``_INVALID`` (NaN) if an ``AttributeError``
        is raised while reading the parameters; the error is logged.
    """
    try:
        trainable_sizes = [param.numel() for param in model.parameters() if param.requires_grad]
    except AttributeError as e:
        logger.error(f"Unable to measure model params due to error: {e}")
        return _INVALID

    return sum(trainable_sizes)


def warm_up(
    model,
    booster,
    dataloader,
    criterion,
    optimizer,
    lr_scheduler,
    num_runs=10,
):
    """Run at most ``num_runs`` training steps before measurements start.

    Each step moves the batch to the GPU, runs the model with labels, takes
    the loss from ``criterion``, back-propagates through ``booster`` and steps
    the optimizer and LR scheduler.

    Args:
        model: callable invoked as ``model(inputs, labels=labels)``.
        booster: object providing ``backward(loss, optimizer)``.
        dataloader: iterable of ``(inputs, labels)`` batches.
        criterion: callable mapping the model output to a loss.
        optimizer: optimizer to step and zero.
        lr_scheduler: scheduler stepped once per batch.
        num_runs: maximum number of warm-up steps; fewer are run if the
            dataloader is shorter.
    """
    # Pairing with range(num_runs) stops after exactly num_runs batches (the
    # previous ``i > num_runs`` check ran one step too many) and does not
    # fetch an extra batch from the dataloader.
    for _, data in zip(range(num_runs), dataloader):
        inputs, labels = data[0].cuda(), data[1].cuda()
        outputs = model(inputs, labels=labels)
        loss = criterion(outputs)
        booster.backward(loss, optimizer)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()


def fmt(d: dict):
    """Serialize ``d`` to a YAML string for logging."""
    rendered = yaml.dump(d)
    return rendered


def benchmark(
    model: torch.nn.Module,
    booster: Booster,
    optimizer: torch.optim.Optimizer,
    lr_scheduler: LRScheduler,
    dataloader: DataLoader,
    criterion: Callable = None,
    warm_up_fn=warm_up,
    epoch_num: int = 3,
    batch_size: int = 32,
    warm_up_steps: int = 3,
):
    """Train for ``epoch_num`` epochs and report parameters, CUDA memory and throughput.

    Args:
        model: model invoked as ``model(inputs, labels=labels)``.
        booster: booster used for the backward pass.
        optimizer: optimizer stepped once per batch.
        lr_scheduler: scheduler stepped once per batch.
        dataloader: iterable of ``(inputs, labels)`` batches.
        criterion: callable mapping the model output to a loss. It is called
            unconditionally, so a callable must be supplied despite the
            ``None`` default.
        warm_up_fn: function run before measuring; called with the same
            arguments as ``warm_up``.
        epoch_num: number of measured passes over ``dataloader``.
        batch_size: only used to label the result entries.
        warm_up_steps: passed to ``warm_up_fn`` as ``num_runs``.

    Returns:
        A dict with ``"params"`` (only if they could be measured),
        ``"throughput"`` and ``"memory"``; the latter two are keyed by
        ``"batch_size_<batch_size>"``.
    """
    results = {}
    model_device = torch.cuda.current_device()
    # Build the coordinator once instead of once per epoch and again for the throughput.
    coordinator = DistCoordinator()

    # Warm up
    warm_up_fn(
        model,
        booster,
        dataloader,
        criterion,
        optimizer,
        lr_scheduler,
        num_runs=warm_up_steps,
    )

    # Measure params
    params = measure_params(model)
    if _is_valid(params):
        results["params"] = format_num(params)
        logger.info(f"Model parameters: {params} ({format_num(params)})")

    # Measure Allocated Memory and Throughput
    memory = {}
    throughput = {}
    torch.cuda.reset_peak_memory_stats(device=model_device)
    pre_mem = torch.cuda.memory_allocated(device=model_device)

    # CUDA kernels run asynchronously; synchronize so that pending warm-up work
    # is not charged to the timed region.
    torch.cuda.synchronize(device=model_device)
    start_time = time()

    num_samples = 0
    for epoch in range(epoch_num):
        with tqdm(dataloader, desc=f'Epoch [{epoch + 1}/{epoch_num}]',
                  disable=not coordinator.is_master()) as pbar:
            for data in pbar:
                inputs, labels = data[0].cuda(), data[1].cuda()
                outputs = model(inputs, labels=labels)
                loss = criterion(outputs)
                booster.backward(loss, optimizer)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                # Count real samples (the last batch may be smaller) so the
                # throughput is samples/s rather than batches/s.
                num_samples += len(inputs)

    # Wait for all queued GPU work to finish before stopping the clock.
    torch.cuda.synchronize(device=model_device)
    end_time = time()

    post_mem = torch.cuda.memory_allocated(device=model_device)
    max_mem = torch.cuda.max_memory_allocated(device=model_device)

    batch_key = f"batch_size_{batch_size}"
    memory[batch_key] = {
        "cuda_pre_training_bytes": format_num(pre_mem, bytes=True),
        "cuda_max_training_bytes": format_num(max_mem, bytes=True),
        "cuda_post_training_bytes": format_num(post_mem, bytes=True),
    }
    logger.info(fmt({f"Memory results (batch_size={batch_size})": memory[batch_key]}))

    # Every rank iterates the full dataloader, so the per-rank sample count is
    # multiplied by the world size to get the aggregate rate.
    samples_per_second = num_samples * coordinator.world_size / (end_time - start_time)
    # NOTE(review): the "throughput:" key carries a trailing colon; it is kept
    # unchanged so existing consumers of the result dict keep working.
    throughput[batch_key] = {"throughput:": "{:.1f}".format(samples_per_second)}
    logger.info(fmt({f"Throughput results (batch_size={batch_size})": throughput[batch_key]}))

    results["throughput"] = throughput
    results["memory"] = memory

    return results
Loading