From 378a43c9581d6e7645dd6ddf4c5f0911b82833a9 Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Thu, 12 Jan 2023 15:41:09 +0800 Subject: [PATCH 1/5] [example] integrate autoparallel demo with CI --- examples/tutorial/auto_parallel/README.md | 7 +++-- .../auto_parallel_with_resnet.py | 10 ------- .../tutorial/auto_parallel/environment.yaml | 29 ++++--------------- .../tutorial/auto_parallel/requirements.txt | 7 +++-- examples/tutorial/auto_parallel/test_ci.sh | 11 ++----- 5 files changed, 19 insertions(+), 45 deletions(-) diff --git a/examples/tutorial/auto_parallel/README.md b/examples/tutorial/auto_parallel/README.md index e99a018c2da1..08bb55f812e1 100644 --- a/examples/tutorial/auto_parallel/README.md +++ b/examples/tutorial/auto_parallel/README.md @@ -1,13 +1,16 @@ -# Auto-Parallelism with ResNet +# Auto-Parallelism ## 🚀Quick Start + ### Auto-Parallel Tutorial + 1. Install `pulp` and `coin-or-cbc` for the solver. + ```bash pip install pulp conda install -c conda-forge coin-or-cbc ``` -2. Run the auto parallel resnet example with 4 GPUs with synthetic dataset. +1. Run the auto parallel resnet example with 4 GPUs with synthetic dataset. 
```bash colossalai run --nproc_per_node 4 auto_parallel_with_resnet.py -s ``` diff --git a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py index 1f0d720449e5..5de7a5aa80bc 100644 --- a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py +++ b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py @@ -1,11 +1,4 @@ -import argparse -import os -from pathlib import Path - import torch -from titans.utils import barrier_context -from torchvision import transforms -from torchvision.datasets import CIFAR10 from torchvision.models import resnet50 from tqdm import tqdm @@ -14,9 +7,6 @@ from colossalai.core import global_context as gpc from colossalai.logging import get_dist_logger from colossalai.nn.lr_scheduler import CosineAnnealingLR -from colossalai.utils import get_dataloader - -DATA_ROOT = Path(os.environ.get('DATA', '../data')).absolute() def synthesize_data(): diff --git a/examples/tutorial/auto_parallel/environment.yaml b/examples/tutorial/auto_parallel/environment.yaml index 5b811631a19f..5f79647fd78b 100644 --- a/examples/tutorial/auto_parallel/environment.yaml +++ b/examples/tutorial/auto_parallel/environment.yaml @@ -1,32 +1,15 @@ -name: auto +name: auto-parallel channels: - pytorch - conda-forge - defaults dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=2_kmp_llvm - - blas=1.0=mkl - - brotlipy=0.7.0=py38h27cfd23_1003 - - bzip2=1.0.8=h7b6447c_0 - - ca-certificates=2022.12.7=ha878542_0 - - certifi=2022.12.7=pyhd8ed1ab_0 - - cffi=1.15.1=py38h74dc2b5_0 - - charset-normalizer=2.0.4=pyhd3eb1b0_0 - - coin-or-cbc=2.10.8=h3786ebc_0 - - coin-or-cgl=0.60.6=h6f57e76_2 - - coin-or-clp=1.17.7=hc56784d_2 - - coin-or-osi=0.108.7=h2720bb7_2 - - coin-or-utils=2.11.6=h202d8b1_2 - - python=3.8.13 - - pip=22.2.2 - - cudatoolkit=11.3 - - pytorch=1.12.1 - - torchvision=0.13.1 - - numpy=1.23.1 + - python=3.8 + - pip + - coin-or-cbc - pip: - titans - - torch==1.12.1 - - 
pulp==2.7.0 + - torch + - pulp - datasets - colossalai diff --git a/examples/tutorial/auto_parallel/requirements.txt b/examples/tutorial/auto_parallel/requirements.txt index 137a69e80498..2086348be8e1 100644 --- a/examples/tutorial/auto_parallel/requirements.txt +++ b/examples/tutorial/auto_parallel/requirements.txt @@ -1,2 +1,5 @@ -colossalai >= 0.1.12 -torch >= 1.8.1 +torch +colossalai +titans +pulp +datasets diff --git a/examples/tutorial/auto_parallel/test_ci.sh b/examples/tutorial/auto_parallel/test_ci.sh index 74332548f623..bf6275b673ff 100644 --- a/examples/tutorial/auto_parallel/test_ci.sh +++ b/examples/tutorial/auto_parallel/test_ci.sh @@ -1,11 +1,6 @@ #!/bin/bash set -euxo pipefail -conda init bash -conda env create -f environment.yaml -conda activate auto -cd ../../.. -pip uninstall colossalai -pip install -v . -cd ./examples/tutorial/auto_parallel -colossalai run --nproc_per_node 4 auto_parallel_with_resnet.py -s +pip install -r requirements.txt +conda install -c conda-forge coin-or-cbc +colossalai run --nproc_per_node 4 auto_parallel_with_resnet.py From b6885c1d670d6fc6e3ed2f0f22d1e9563098bb86 Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Thu, 12 Jan 2023 15:57:55 +0800 Subject: [PATCH 2/5] polish code --- examples/tutorial/auto_parallel/README.md | 92 +++++-------------- .../auto_parallel_with_resnet.py | 2 +- examples/tutorial/auto_parallel/config.py | 4 +- .../tutorial/auto_parallel/environment.yaml | 15 --- .../tutorial/auto_parallel/requirements.txt | 2 + 5 files changed, 30 insertions(+), 85 deletions(-) delete mode 100644 examples/tutorial/auto_parallel/environment.yaml diff --git a/examples/tutorial/auto_parallel/README.md b/examples/tutorial/auto_parallel/README.md index 08bb55f812e1..bb014b9067b2 100644 --- a/examples/tutorial/auto_parallel/README.md +++ b/examples/tutorial/auto_parallel/README.md @@ -1,76 +1,52 @@ # Auto-Parallelism -## 🚀Quick Start +## Table of contents -### Auto-Parallel Tutorial +- 
[Auto-Parallelism](#auto-parallelism) + - [Table of contents](#table-of-contents) + - [📚 Overview](#-overview) + - [🚀 Quick Start](#-quick-start) + - [Setup](#setup) + - [Auto-Parallel Tutorial](#auto-parallel-tutorial) + - [Auto-Checkpoint Tutorial](#auto-checkpoint-tutorial) -1. Install `pulp` and `coin-or-cbc` for the solver. -```bash -pip install pulp -conda install -c conda-forge coin-or-cbc -``` -1. Run the auto parallel resnet example with 4 GPUs with synthetic dataset. -```bash -colossalai run --nproc_per_node 4 auto_parallel_with_resnet.py -s -``` +## 📚 Overview -You should expect to the log like this. This log shows the edge cost on the computation graph as well as the sharding strategy for an operation. For example, `layer1_0_conv1 S01R = S01R X RR` means that the first dimension (batch) of the input and output is sharded while the weight is not sharded (S means sharded, R means replicated), simply equivalent to data parallel training. -![](https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/tutorial/auto-parallel%20demo.png) +This tutorial folder contains a simple demo to run auto-parallelism with ResNet. Meanwhile, this directory also contains demo scripts to run automatic activation checkpointing, but both features are still experimental for now and there is no guarantee that they will work for your version of Colossal-AI. +## 🚀 Quick Start -### Auto-Checkpoint Tutorial -1. Stay in the `auto_parallel` folder. -2. Install the dependencies. -```bash -pip install matplotlib transformers -``` -3. Run a simple resnet50 benchmark to automatically checkpoint the model. -```bash -python auto_ckpt_solver_test.py --model resnet50 -``` +### Setup -You should expect the log to be like this -![](https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/tutorial/auto-ckpt%20demo.png) +1. 
Create a conda environment -This shows that given different memory budgets, the model is automatically injected with activation checkpoint and its time taken per iteration. You can run this benchmark for GPT as well but it can much longer since the model is larger. ```bash -python auto_ckpt_solver_test.py --model gpt2 -``` - -4. Run a simple benchmark to find the optimal batch size for checkpointed model. -```bash -python auto_ckpt_batchsize_test.py +conda create -n auto python=3.8 +conda activate auto ``` -You can expect the log to be like -![](https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/tutorial/auto-ckpt%20batchsize.png) - - -## Prepare Dataset - -We use CIFAR10 dataset in this example. You should invoke the `donwload_cifar10.py` in the tutorial root directory or directly run the `auto_parallel_with_resnet.py`. -The dataset will be downloaded to `colossalai/examples/tutorials/data` by default. -If you wish to use customized directory for the dataset. You can set the environment variable `DATA` via the following command. +2. Install the packages in `requirements.txt` and `coin-or-cbc` for the solver. ```bash -export DATA=/path/to/data +pip install -r requirements.txt +conda install -c conda-forge coin-or-cbc ``` -## extra requirements to use autoparallel -```bash -pip install pulp -conda install coin-or-cbc -``` +### Auto-Parallel Tutorial -## Run on 2*2 device mesh +Run the auto parallel resnet example on 4 GPUs with a synthetic dataset. ```bash colossalai run --nproc_per_node 4 auto_parallel_with_resnet.py ``` -## Auto Checkpoint Benchmarking +You should expect to see a log like this. This log shows the edge cost on the computation graph as well as the sharding strategy for an operation. For example, `layer1_0_conv1 S01R = S01R X RR` means that the first dimension (batch) of the input and output is sharded while the weight is not sharded (S means sharded, R means replicated), simply equivalent to data parallel training. 
+![](https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/tutorial/auto-parallel%20demo.png) + + +### Auto-Checkpoint Tutorial We prepare two bechmarks for you to test the performance of auto checkpoint @@ -89,21 +65,3 @@ python auto_ckpt_solver_test.py --model resnet50 # tun auto_ckpt_batchsize_test.py python auto_ckpt_batchsize_test.py ``` - -There are some results for your reference - -## Auto Checkpoint Solver Test - -### ResNet 50 -![](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/tutorial/resnet50_benchmark.png) - -### GPT2 Medium -![](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/tutorial/gpt2_benchmark.png) - -## Auto Checkpoint Batch Size Test -```bash -===============test summary================ -batch_size: 512, peak memory: 73314.392 MB, through put: 254.286 images/s -batch_size: 1024, peak memory: 73316.216 MB, through put: 397.608 images/s -batch_size: 2048, peak memory: 72927.837 MB, through put: 277.429 images/s -``` diff --git a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py index 5de7a5aa80bc..c549f7538783 100644 --- a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py +++ b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py @@ -63,7 +63,7 @@ def main(): # if we use synthetic data # we assume it only has 10 steps for evaluation - num_steps = range(30) + num_steps = range(10) progress = tqdm(num_steps) diff --git a/examples/tutorial/auto_parallel/config.py b/examples/tutorial/auto_parallel/config.py index fa14eda740f7..52e0abcef698 100644 --- a/examples/tutorial/auto_parallel/config.py +++ b/examples/tutorial/auto_parallel/config.py @@ -1,2 +1,2 @@ -BATCH_SIZE = 128 -NUM_EPOCHS = 10 +BATCH_SIZE = 32 +NUM_EPOCHS = 2 diff --git a/examples/tutorial/auto_parallel/environment.yaml b/examples/tutorial/auto_parallel/environment.yaml deleted file mode 100644 index 
5f79647fd78b..000000000000 --- a/examples/tutorial/auto_parallel/environment.yaml +++ /dev/null @@ -1,15 +0,0 @@ -name: auto-parallel -channels: - - pytorch - - conda-forge - - defaults -dependencies: - - python=3.8 - - pip - - coin-or-cbc - - pip: - - titans - - torch - - pulp - - datasets - - colossalai diff --git a/examples/tutorial/auto_parallel/requirements.txt b/examples/tutorial/auto_parallel/requirements.txt index 2086348be8e1..ce89e7c80070 100644 --- a/examples/tutorial/auto_parallel/requirements.txt +++ b/examples/tutorial/auto_parallel/requirements.txt @@ -3,3 +3,5 @@ colossalai titans pulp datasets +matplotlib +transformers From 086a56fef197ae143fd849a4d2525fad8313c753 Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Thu, 12 Jan 2023 16:11:24 +0800 Subject: [PATCH 3/5] polish code --- .../tutorial/auto_parallel/auto_parallel_with_resnet.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py index c549f7538783..15429f19cbcf 100644 --- a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py +++ b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py @@ -38,9 +38,8 @@ def main(): model.train() # if we use synthetic data - # we assume it only has 30 steps per epoch - num_steps = range(30) - + # we assume it only has 10 steps per epoch + num_steps = range(10) progress = tqdm(num_steps) for _ in progress: @@ -64,7 +63,6 @@ def main(): # if we use synthetic data # we assume it only has 10 steps for evaluation num_steps = range(10) - progress = tqdm(num_steps) for _ in progress: From 72a8b5cbd072bedfece50213eb7e1e4a8d6fcd2f Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Thu, 12 Jan 2023 16:13:00 +0800 Subject: [PATCH 4/5] polish code --- examples/tutorial/auto_parallel/auto_parallel_with_resnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py index 15429f19cbcf..dcaabadc3986 100644 --- a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py +++ b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py @@ -1,5 +1,5 @@ import torch -from torchvision.models import resnet50 +from torchvision.models import resnet18 from tqdm import tqdm import colossalai @@ -21,7 +21,7 @@ def main(): logger = get_dist_logger() # trace the model with meta data - model = resnet50(num_classes=10).cuda() + model = resnet18(num_classes=10).cuda() input_sample = {'x': torch.rand([gpc.config.BATCH_SIZE * torch.distributed.get_world_size(), 3, 32, 32]).to('meta')} model = autoparallelize(model, input_sample) From 57c417a17b4f1a415b28ad6bb0adbd7c38fcc97b Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Thu, 12 Jan 2023 16:19:38 +0800 Subject: [PATCH 5/5] polish code --- examples/tutorial/auto_parallel/auto_parallel_with_resnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py index dcaabadc3986..15429f19cbcf 100644 --- a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py +++ b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py @@ -1,5 +1,5 @@ import torch -from torchvision.models import resnet18 +from torchvision.models import resnet50 from tqdm import tqdm import colossalai @@ -21,7 +21,7 @@ def main(): logger = get_dist_logger() # trace the model with meta data - model = resnet18(num_classes=10).cuda() + model = resnet50(num_classes=10).cuda() input_sample = {'x': torch.rand([gpc.config.BATCH_SIZE * torch.distributed.get_world_size(), 3, 32, 32]).to('meta')} model = autoparallelize(model, input_sample)