hpcaitech · ver217 · Jun 7, 2023 · Jun 1, 2023
@@ -0,0 +1,34 @@
+## Overview
+
+This directory includes two parts: fine-tuning Hugging Face Bert and AlBert models with the Booster API, and benchmarking Bert and AlBert models with different Booster plugins.
+
+## Finetune
+```
+bash test_ci.sh
+```
+
+## Benchmark
+```
+bash benchmark.sh
+```
+
+The benchmark currently reports these metrics: CUDA memory usage, throughput, and the number of model parameters. If you have custom metrics, you can add them to `benchmark_utils.py`.
+
+## Results
+
+### Bert
+
+|       | max cuda mem | throughput(sample/s) | params |
+| :-----| -----------: | :--------: | :----: |
+| ddp | 21.44 GB | 3.0 | 82M |
+| ddp_fp16 | 16.26 GB | 11.3 | 82M |
+| gemini | 11.0 GB | 12.9 | 82M |
+| low_level_zero | 11.29 GB | 14.7 | 82M |
+
+### AlBert
+|       | max cuda mem | throughput(sample/s) | params |
+| :-----| -----------: | :--------: | :----: |
+| ddp | OOM |  | |
+| ddp_fp16 | OOM |  | |
+| gemini | 69.39 GB | 1.3 | 208M |
+| low_level_zero | 56.89 GB | 1.4 | 208M |
@@ -0,0 +1,174 @@
+import argparse
+
+import torch
+from benchmark_utils import benchmark
+from torch.utils.data import DataLoader, Dataset
+from transformers import (
+    AlbertConfig,
+    AlbertForSequenceClassification,
+    BertConfig,
+    BertForSequenceClassification,
+    get_linear_schedule_with_warmup,
+)
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.nn.optimizer import HybridAdam
+
+# ==============================
+# Prepare Hyperparameters
+# ==============================
+NUM_EPOCHS = 3    # epochs passed to benchmark() and used to size the LR schedule
+BATCH_SIZE = 32    # batch size passed to the DataLoader
+LEARNING_RATE = 2.4e-5    # base learning rate; multiplied by the world size in main()
+WEIGHT_DECAY = 0.01    # applied to every parameter except biases and LayerNorm weights
+WARMUP_FRACTION = 0.1    # fraction of the total scheduler steps spent in warmup
+SEQ_LEN = 512    # number of token ids per synthetic sample
+VOCAB_SIZE = 1000    # token ids are drawn from [0, VOCAB_SIZE); also the model config's vocab size
+NUM_LABELS = 10    # labels are drawn from [0, NUM_LABELS); also the model config's num_labels
+DATASET_LEN = 1000    # number of synthetic samples in the dataset
+
+
+class RandintDataset(Dataset):
+
+    def __init__(self, dataset_length: int, sequence_length: int, vocab_size: int, n_class: int):
+
+        self._sequence_length = sequence_length
+        self._vocab_size = vocab_size
+        self._n_class = n_class
+        self._dataset_length = dataset_length
+        self._datas = torch.randint(
+            low=0,
+            high=self._vocab_size,
+            size=(self._dataset_length, self._sequence_length,),
+            dtype=torch.long,
+        )
+        self._labels = torch.randint(low=0, high=self._n_class, size=(self._dataset_length, 1), dtype=torch.long) 
+
+    def __len__(self):
+        return self._dataset_length
+
+    def __getitem__(self, idx):
+        return self._datas[idx], self._labels[idx]
+
+
+def main():
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-t', '--task', default='mrpc', help="GLUE task to run")
+    parser.add_argument('-p',
+                        '--plugin',
+                        type=str,
+                        default='torch_ddp',
+                        choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'],
+                        help="plugin to use")
+    parser.add_argument(
+        "--model_type",
+        type=str,
+        default="bert",
+        help="bert or albert",
+    )
+
+    args = parser.parse_args()
+
+    # ==============================
+    # Launch Distributed Environment
+    # ==============================
+    colossalai.launch_from_torch(config={}, seed=42)
+    coordinator = DistCoordinator()
+
+    # local_batch_size = BATCH_SIZE // coordinator.world_size
+    lr = LEARNING_RATE * coordinator.world_size
+
+    # ==============================
+    # Instantiate Plugin and Booster
+    # ==============================
+    booster_kwargs = {}
+    if args.plugin == 'torch_ddp_fp16':
+        booster_kwargs['mixed_precision'] = 'fp16'
+    if args.plugin.startswith('torch_ddp'):
+        plugin = TorchDDPPlugin()
+    elif args.plugin == 'gemini':
+        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2**5)
+    elif args.plugin == 'low_level_zero':
+        plugin = LowLevelZeroPlugin(initial_scale=2**5)
+
+    booster = Booster(plugin=plugin, **booster_kwargs)
+
+    # ==============================
+    # Prepare Dataloader
+    # ==============================
+
+    train_dataset = RandintDataset(dataset_length=DATASET_LEN,
+                                   sequence_length=SEQ_LEN,
+                                   vocab_size=VOCAB_SIZE,
+                                   n_class=NUM_LABELS)
+    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
+
+    # ====================================
+    # Prepare model, optimizer
+    # ====================================
+    # bert pretrained model
+
+    if args.model_type == "bert":
+        cfg = BertConfig(vocab_size=VOCAB_SIZE, num_labels=NUM_LABELS)
+        model = BertForSequenceClassification(cfg)
+    elif args.model_type == "albert":
+        cfg = AlbertConfig(vocab_size=VOCAB_SIZE, num_labels=NUM_LABELS)
+        model = AlbertForSequenceClassification(cfg)
+    else:
+        raise RuntimeError
+
+    # optimizer
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": WEIGHT_DECAY,
+        },
+        {
+            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+            "weight_decay": 0.0,
+        },
+    ]
+
+    optimizer = HybridAdam(optimizer_grouped_parameters, lr=lr, eps=1e-8)
+
+    # lr scheduler
+    total_steps = len(train_dataloader) * NUM_EPOCHS
+    num_warmup_steps = int(WARMUP_FRACTION * total_steps)
+    lr_scheduler = get_linear_schedule_with_warmup(
+        optimizer,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=total_steps,
+    )
+
+    # criterion
+    criterion = lambda inputs: inputs[0]
+
+    # ==============================
+    # Boost with ColossalAI
+    # ==============================
+    model, optimizer, _, _, lr_scheduler = booster.boost(model, optimizer, lr_scheduler=lr_scheduler)
+
+    # ==============================
+    # Benchmark model
+    # ==============================
+
+    results = benchmark(model,
+                        booster,
+                        optimizer,
+                        lr_scheduler,
+                        train_dataloader,
+                        criterion=criterion,
+                        epoch_num=NUM_EPOCHS)
+
+    coordinator.print_on_master(results)
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Runs benchmark.py once per Booster plugin and model type.
+# -x prints each command before running it; -e exits on the first failing command.
+# NOTE(review): because of -e, a failing torchrun (e.g. an out-of-memory run)
+# ends the script before the remaining plugins are benchmarked -- confirm this is intended.
+set -xe
+
+# Install the Python dependencies listed for this example.
+pip install -r requirements.txt
+
+# For every plugin, benchmark Bert first and then AlBert, with 2 processes on this node.
+for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do
+   torchrun --standalone --nproc_per_node 2  benchmark.py --plugin $plugin --model_type "bert"
+   torchrun --standalone --nproc_per_node 2  benchmark.py  --plugin $plugin --model_type "albert"
+done
@@ -0,0 +1,146 @@
+import inspect
+from logging import getLogger
+from time import time
+from typing import Callable
+
+import torch
+import yaml
+from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from colossalai.booster import Booster
+from colossalai.cluster import DistCoordinator
+
+logger = getLogger("colossalai-booster-benchmark")
+_INVALID = float("nan")    # placeholder for a metric that could not be measured; NaN fails the `val == val` check in _is_valid
+
+
+def format_num(num: int, bytes=False):
+    """Scale bytes to its proper format, e.g. 1253656 => '1.20MB'"""
+    factor = 1024 if bytes else 1000
+    suffix = "B" if bytes else ""
+    for unit in ["", " K", " M", " G", " T", " P"]:
+        if num < factor:
+            return f"{num:.2f}{unit}{suffix}"
+        num /= factor
+
+
+def _is_valid(val):
+    return val == val
+
+
+def get_call_arg_names(module_or_fn):
+    if isinstance(module_or_fn, torch.nn.Module):
+        return inspect.getfullargspec(module_or_fn.forward)[0][1:]
+    return inspect.getfullargspec(module_or_fn)[0]
+
+
+def measure_params(model):
+    num_params = _INVALID
+
+    try:
+        num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    except AttributeError as e:
+        logger.error(f"Unable to measure model params due to error: {e}")
+
+    return num_params
+
+
+def warm_up(
+    model,
+    booster,
+    dataloader,
+    criterion,
+    optimizer,
+    lr_scheduler,
+    num_runs=10,
+):
+    for i, data in enumerate(dataloader):
+        if i > num_runs:
+            break
+        inputs, labels = data[0].cuda(), data[1].cuda()
+        outputs = model(inputs, labels=labels)
+        loss = criterion(outputs)
+        booster.backward(loss, optimizer)
+        optimizer.step()
+        lr_scheduler.step()
+        optimizer.zero_grad()
+
+
+def fmt(d: dict):
+    return yaml.dump(d)
+
+
+def benchmark(
+    model: torch.nn.Module,
+    booster: Booster,
+    optimizer: torch.optim.Optimizer,
+    lr_scheduler: LRScheduler,
+    dataloader: DataLoader,
+    criterion: Callable = None,
+    warm_up_fn=warm_up,
+    epoch_num: int = 3,
+    batch_size: int = 32,
+    warm_up_steps: int = 3,
+):
+    results = {}
+    model_device = torch.cuda.current_device()
+
+    # Warm up
+    warm_up_fn(
+        model,
+        booster,
+        dataloader,
+        criterion,
+        optimizer,
+        lr_scheduler,
+        num_runs=warm_up_steps,
+    )
+    # Measure params
+    params = measure_params(model)
+    if _is_valid(params):
+        results["params"] = format_num(params)
+        logger.info(f"Model parameters: {params} ({format_num(params)})")
+
+    # Measure Allocated Memory and Throughput
+    memory = {}
+    throughput = {}
+    torch.cuda.reset_peak_memory_stats(device=model_device)
+    pre_mem = torch.cuda.memory_allocated(device=model_device)
+
+    start_time = time()
+
+    for epoch in range(epoch_num):
+        with tqdm(dataloader, desc=f'Epoch [{epoch + 1}/{epoch_num}]',
+                  disable=not DistCoordinator().is_master()) as pbar:
+            for data in pbar:
+                inputs, labels = data[0].cuda(), data[1].cuda()
+                outputs = model(inputs, labels=labels)
+                loss = criterion(outputs)
+                booster.backward(loss, optimizer)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+    end_time = time()
+
+    all_sample = epoch_num * len(dataloader)
+
+    post_mem = torch.cuda.memory_allocated(device=model_device)
+    max_mem = torch.cuda.max_memory_allocated(device=model_device)
+
+    memory[f"batch_size_{batch_size}"] = {
+        "cuda_pre_training_bytes": format_num(pre_mem, bytes=True),
+        "cuda_max_training_bytes": format_num(max_mem, bytes=True),
+        "cuda_post_training_bytes": format_num(post_mem, bytes=True),
+    }
+    logger.info(fmt({f"Memory results (batch_size={batch_size})": memory[f"batch_size_{batch_size}"]}))
+
+    throughput[f"batch_size_{batch_size}"] = {"throughput:": "{:.1f}".format(all_sample * DistCoordinator().world_size / (end_time - start_time))}
+    logger.info(fmt({f"Throughput results (batch_size={batch_size})": throughput[f"batch_size_{batch_size}"]}))
+
+    results["throughput"] = throughput
+    results["memory"] = memory
+
+    return results