Best #153 (Merged)

Commits (19, changes shown from all commits)
9fb0946  Merge pull request #129 from jamesthesnake/best (jamesthesnake, Aug 5, 2023)
b830b6a  Merge pull request #135 from jamesthesnake/best (jamesthesnake, Aug 10, 2023)
cb49f42  Merge pull request #141 from jamesthesnake/best (jamesthesnake, Aug 17, 2023)
2706142  [gemini] improve compatibility and add static placement policy (#4479) (ver217, Aug 24, 2023)
152b0f4  Merge pull request #147 from jamesthesnake/better (jamesthesnake, Aug 24, 2023)
c0efc3e  [format] applied code formatting on changed files in pull request 447… (github-actions[bot], Aug 25, 2023)
839847b  [zero]support zero2 with gradient accumulation (#4511) (Gy-Lu, Aug 25, 2023)
0b00def  [example] add llama2 example (#4527) (ver217, Aug 28, 2023)
12c95a9  fix runtime prepare pass (#4502) (vincentccc, Aug 30, 2023)
8e2e199  [example] update streamlit 0.73.1 to 1.11.1 (#4386) (ChengDaqi2023, Aug 30, 2023)
f1ae8c9  [example] change accelerate version (#4431) (tiansiyuan, Aug 30, 2023)
c7b60f7  [devops] cancel previous runs in the PR (#4546) (ver217, Aug 30, 2023)
cbac782  [zero]fix zero ckptIO with offload (#4529) (Gy-Lu, Sep 1, 2023)
eb952ea  Update Dockerfile (#4499) (data-infra, Sep 1, 2023)
cfa6070  [Fix] Fix compile error (#4357) (HAOCHENYE, Sep 1, 2023)
7298842  Merge pull request #149 from hpcaitech/main (jamesthesnake, Sep 3, 2023)
8592807  Merge pull request #150 from jamesthesnake/jordan (jamesthesnake, Sep 3, 2023)
5a571c3  Merge pull request #151 from jamesthesnake/better (jamesthesnake, Sep 3, 2023)
131e54e  Merge pull request #152 from jamesthesnake/ra (jamesthesnake, Sep 3, 2023)
12 changes: 6 additions & 6 deletions .github/workflows/build_on_pr.yml
@@ -61,8 +61,8 @@ jobs:
       run:
         shell: bash
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - name: Copy testmon cache
         run: | # branch name may contain slash, we need to replace it with space
@@ -87,8 +87,8 @@ jobs:
       anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }}
     runs-on: ubuntu-latest
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - uses: actions/checkout@v2
         with:
@@ -147,8 +147,8 @@ jobs:
       run:
         shell: bash
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - name: Checkout TensorNVMe
         uses: actions/checkout@v2
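The same concurrency fix is applied to every PR workflow below: the group key now combines the workflow name with the PR number (falling back to the git ref for non-PR triggers), and cancel-in-progress: true lets a new push supersede a still-running build. A minimal Python sketch of the key's fallback semantics (the helper name is hypothetical, not part of the repo):

from typing import Optional


def concurrency_group(workflow: str, pr_number: Optional[int], ref: str) -> str:
    # Mirrors ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}:
    # use the PR number when the event carries one, otherwise fall back to the git ref.
    return f"{workflow}-{pr_number if pr_number is not None else ref}"


# Two pushes to the same PR land in the same group, so the older run is cancelled;
# runs from different workflows or different PRs never collide.
assert concurrency_group("Build on PR", 153, "refs/pull/153/merge") == "Build on PR-153"
assert concurrency_group("Build on PR", None, "refs/heads/main") == "Build on PR-refs/heads/main"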
8 changes: 4 additions & 4 deletions .github/workflows/compatiblity_test_on_pr.yml
@@ -13,8 +13,8 @@ jobs:
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - uses: actions/checkout@v3
       - id: set-matrix
@@ -44,8 +44,8 @@ jobs:
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
     timeout-minutes: 120
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - name: Install dependencies
         run: |
8 changes: 4 additions & 4 deletions .github/workflows/doc_check_on_pr.yml
@@ -17,8 +17,8 @@ jobs:
       github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
     runs-on: ubuntu-latest
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - uses: actions/checkout@v2

@@ -35,8 +35,8 @@ jobs:
       github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
     runs-on: ubuntu-latest
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - uses: actions/checkout@v2
         with:
8 changes: 4 additions & 4 deletions .github/workflows/doc_test_on_pr.yml
@@ -20,8 +20,8 @@ jobs:
       any_changed: ${{ steps.changed-files.outputs.any_changed }}
       changed_files: ${{ steps.changed-files.outputs.all_changed_files }}
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     name: Detect changed example files
     steps:
       - uses: actions/checkout@v3
@@ -63,8 +63,8 @@ jobs:
       run:
         shell: bash
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - name: Checkout ColossalAI-Documentation
         uses: actions/checkout@v2
8 changes: 4 additions & 4 deletions .github/workflows/example_check_on_pr.yml
@@ -21,8 +21,8 @@ jobs:
       anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }}
     name: Detect changed example files
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - uses: actions/checkout@v3
         with:
@@ -81,8 +81,8 @@ jobs:
       options: --gpus all --rm -v /data/scratch/examples-data:/data/
     timeout-minutes: 10
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - uses: actions/checkout@v3
@@ -144,7 +144,7 @@ def size_value_converting_pass(gm: torch.fx.GraphModule, device_mesh: DeviceMesh

     # DeviceMesh information instructs the scaling of the size value
     device_mesh_info = {}
-    for dim, dim_size in enumerate(device_mesh.mesh_shape):
+    for dim, dim_size in enumerate(device_mesh.shape):
         device_mesh_info[dim] = dim_size

     def _extract_target_dim(node):
104 changes: 39 additions & 65 deletions colossalai/booster/plugin/gemini_plugin.py
@@ -1,13 +1,11 @@
 import gc
 import logging
 import os
-import warnings
 from pathlib import Path
-from typing import Callable, Iterator, List, Optional, Tuple, Union
+from typing import Callable, Iterator, List, Optional, Tuple

 import torch
 import torch.nn as nn
-from torch import Tensor
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
 from torch.utils.data import DataLoader
@@ -16,16 +14,14 @@
 from colossalai.checkpoint_io.utils import (
     get_model_base_filenames,
     get_optimizer_base_filenames,
-    get_shard_filename,
     load_shard_state_dict,
-    save_state_dict,
     save_state_dict_shards,
 )
 from colossalai.cluster import DistCoordinator
 from colossalai.interface import ModelWrapper, OptimizerWrapper
 from colossalai.utils import get_current_device
-from colossalai.zero import GeminiDDP, zero_model_wrapper, zero_optim_wrapper
-from colossalai.zero.gemini import ZeroOptimizer
+from colossalai.zero import GeminiDDP, GeminiOptimizer
 from colossalai.zero.gemini.memory_tracer import MemStats

 from .dp_plugin_base import DPPluginBase
@@ -132,11 +128,7 @@ def save_sharded_optimizer(self, optimizer: Optimizer, checkpoint: Path, gather_
         As there is communication when getting state dict, this must be called on all processes.
         """

-        # If optimizer is wrapped, unwrap it.
-        if isinstance(optimizer, OptimizerWrapper):
-            optimizer = optimizer.unwrap()
-
-        assert isinstance(optimizer, ZeroOptimizer)
+        assert isinstance(optimizer, GeminiOptimizer)

         if os.path.isfile(checkpoint):
             logging.error(f"Provided path ({checkpoint}) should be a directory, not a file")
@@ -183,11 +175,7 @@ def load_sharded_optimizer(self, optimizer: Optimizer, checkpoint_index_file: Pa
         if not os.path.isfile(checkpoint_index_file):
             logging.error(f"Provided path ({checkpoint_index_file}) should be a file")

-        # If optimizer is wrapped, unwrap it.
-        if isinstance(optimizer, OptimizerWrapper):
-            optimizer = optimizer.unwrap()
-
-        assert isinstance(optimizer, ZeroOptimizer)
+        assert isinstance(optimizer, GeminiOptimizer)

         # Read checkpoint index file.
         ckpt_index_file = CheckpointIndexFile.from_file(checkpoint_index_file)
@@ -220,47 +208,6 @@ def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
         super().save_lr_scheduler(lr_scheduler, checkpoint)


-class GeminiModel(ModelWrapper):
-
-    def __init__(self, module: nn.Module, gemini_config: dict, verbose: bool = False) -> None:
-        super().__init__(module)
-        self.module = zero_model_wrapper(module, zero_stage=3, gemini_config=gemini_config, verbose=verbose)
-
-    def unwrap(self):
-        # as save/load state dict is coupled with the GeminiDDP, we only return GeminiDDP model
-        return self.module
-
-
-class GeminiOptimizer(OptimizerWrapper):
-
-    def __init__(self,
-                 module: GeminiDDP,
-                 optimizer: Optimizer,
-                 zero_optim_config: dict,
-                 optim_kwargs: dict,
-                 verbose: bool = False) -> None:
-        optimizer = zero_optim_wrapper(module,
-                                       optimizer,
-                                       optim_config=zero_optim_config,
-                                       **optim_kwargs,
-                                       verbose=verbose)
-        super().__init__(optimizer)
-
-    def backward(self, loss: Tensor, *args, **kwargs):
-        self.optim.backward(loss)
-
-    def clip_grad_by_norm(self,
-                          max_norm: Union[float, int],
-                          norm_type: Union[float, int] = 2,
-                          error_if_nonfinite: bool = False,
-                          *args,
-                          **kwargs) -> Tensor:
-        warnings.warn(f'Gemini controls grad clipping by itself, so you should not use clip_grad_by_norm')
-
-    def clip_grad_by_value(self, clip_value: float, *args, **kwargs) -> None:
-        raise NotImplementedError('Gemini does not support clip_grad_by_value')
-
-
 class GeminiPlugin(DPPluginBase):
     """
     Plugin for Gemini.
@@ -277,8 +224,20 @@ class GeminiPlugin(DPPluginBase):
     >>> model, optimizer, train_dataloader, criterion = booster.boost(model, optimizer, train_dataloader, criterion)

     Args:
-        device (torch.device): device to place the model.
-        placement_policy (str, optional): "cpu", "cuda", "auto". Defaults to "cpu".
+        chunk_config_dict (dict, optional): chunk configuration dictionary.
+        chunk_init_device (torch.device, optional): device to initialize the chunk.
+        placement_policy (str, optional): "static" and "auto". Defaults to "static".
+        shard_param_frac (float, optional): fraction of parameters to be sharded. Only for "static" placement.
+            If `shard_param_frac` is 1.0, it's equal to zero-3. If `shard_param_frac` is 0.0, it's equal to zero-2. Defaults to 1.0.
+        offload_optim_frac (float, optional): fraction of optimizer states to be offloaded. Only for "static" placement.
+            If `shard_param_frac` is 1.0 and `offload_optim_frac` is 0.0, it's equal to old "cuda" placement. Defaults to 0.0.
+        offload_param_frac (float, optional): fraction of parameters to be offloaded. Only for "static" placement.
+            For efficiency, this argument is useful only when `shard_param_frac` is 1.0 and `offload_optim_frac` is 1.0.
+            If `shard_param_frac` is 1.0, `offload_optim_frac` is 1.0 and `offload_param_frac` is 1.0, it's equal to old "cpu" placement.
+            When using static placement, we recommend users to tune `shard_param_frac` first and then `offload_optim_frac`.
+            Defaults to 0.0.
+        warmup_non_model_data_ratio (float, optional): ratio of expected non-model data memory during warmup. Only for "auto" placement. Defaults to 0.8.
+        steady_cuda_cap_ratio (float, optional): ratio of allowed cuda capacity for model data during steady state. Only for "auto" placement. Defaults to 0.9.
         precision (str, optional): precision. Support 'fp16' and 'bf16'. Defaults to 'fp16'.
         pin_memory (bool, optional): use pin memory on CPU. Defaults to False.
         force_outputs_fp32 (bool, optional): force outputs are fp32. Defaults to False.
@@ -310,8 +269,14 @@ class GeminiPlugin(DPPluginBase):

     def __init__(
         self,
-        device: Optional[torch.device] = None,
-        placement_policy: str = "cpu",
+        chunk_config_dict: Optional[dict] = None,
+        chunk_init_device: Optional[torch.device] = None,
+        placement_policy: str = "static",
+        shard_param_frac: float = 1.0,  # only for static placement
+        offload_optim_frac: float = 0.0,  # only for static placement
+        offload_param_frac: float = 0.0,  # only for static placement
+        warmup_non_model_data_ratio: float = 0.8,  # only for auto placement
+        steady_cuda_cap_ratio: float = 0.9,  # only for auto placement
         precision: str = "fp16",
         pin_memory: bool = False,
         force_outputs_fp32: bool = False,
@@ -335,8 +300,14 @@ def __init__(
         super().__init__()
         assert precision in SUPPORTED_PRECISION, f'precision {precision} is not supported'
         self.gemini_config = dict(
-            device=(device or get_current_device()),
+            chunk_config_dict=chunk_config_dict,
+            chunk_init_device=(chunk_init_device or get_current_device()),
             placement_policy=placement_policy,
+            shard_param_frac=shard_param_frac,
+            offload_optim_frac=offload_optim_frac,
+            offload_param_frac=offload_param_frac,
+            warmup_non_model_data_ratio=warmup_non_model_data_ratio,
+            steady_cuda_cap_ratio=steady_cuda_cap_ratio,
             pin_memory=pin_memory,
             force_outputs_fp32=force_outputs_fp32,
             strict_ddp_mode=strict_ddp_mode,
@@ -393,12 +364,15 @@ def configure(
             # model = nn.SyncBatchNorm.convert_sync_batchnorm(model, None)

             # wrap the model with Gemini
-            model = GeminiModel(model, self.gemini_config, self.verbose)
+            model = GeminiDDP(model, **self.gemini_config, verbose=self.verbose)

         if optimizer is not None and \
                 not isinstance(optimizer, OptimizerWrapper):
-            optimizer = GeminiOptimizer(model.unwrap(), optimizer, self.zero_optim_config, self.optim_kwargs,
-                                        self.verbose)
+            optimizer = GeminiOptimizer(optimizer,
+                                        model.unwrap(),
+                                        **self.zero_optim_config,
+                                        **self.optim_kwargs,
+                                        verbose=self.verbose)

         return model, optimizer, criterion, dataloader, lr_scheduler
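A minimal usage sketch of the reworked plugin, assuming a distributed environment has already been launched (e.g. via colossalai.launch_from_torch); the model and optimizer here are placeholders:

import torch
import torch.nn as nn

from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin

# Static placement: shard_param_frac=1.0 behaves like ZeRO-3 and 0.0 like ZeRO-2;
# the offload fractions trade GPU memory for CPU memory, recovering the old
# "cuda" and "cpu" placements at their extremes.
plugin = GeminiPlugin(
    placement_policy="static",
    shard_param_frac=1.0,
    offload_optim_frac=0.0,
    offload_param_frac=0.0,
    precision="fp16",
)
booster = Booster(plugin=plugin)

model = nn.Linear(1024, 1024)  # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
model, optimizer, *_ = booster.boost(model, optimizer)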
4 changes: 4 additions & 0 deletions colossalai/cli/launcher/run.py
@@ -265,6 +265,10 @@ def launch_multi_processes(args: Config) -> None:
     # establish remote connection
     runner.connect(host_info_list=active_device_pool, workdir=curr_path, env=env)

+    # overwrite master addr when num_nodes > 1 and not specified
+    if len(active_device_pool) > 1 and args.master_addr == "127.0.0.1":
+        args.master_addr = active_device_pool.hostinfo_list[0].hostname
+
     # execute distributed launching command
     for node_id, hostinfo in enumerate(active_device_pool):
         cmd = get_launch_command(master_addr=args.master_addr,
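The new check follows a common multi-node launcher pattern: the loopback default is only replaced when more than one node participates, since 127.0.0.1 is unreachable from the other hosts. A standalone sketch of that decision (helper name hypothetical):

from typing import List


def resolve_master_addr(master_addr: str, node_hostnames: List[str]) -> str:
    # Keep any explicitly supplied address; only the 127.0.0.1 default is
    # replaced, and only when the launch actually spans multiple nodes.
    if len(node_hostnames) > 1 and master_addr == "127.0.0.1":
        return node_hostnames[0]
    return master_addr


assert resolve_master_addr("127.0.0.1", ["node1", "node2"]) == "node1"
assert resolve_master_addr("10.0.0.5", ["node1", "node2"]) == "10.0.0.5"
assert resolve_master_addr("127.0.0.1", ["node1"]) == "127.0.0.1"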
15 changes: 7 additions & 8 deletions colossalai/kernel/cuda_native/mha/mem_eff_attn.py
@@ -2,7 +2,13 @@

 HAS_MEM_EFF_ATTN = False
 try:
-    from xformers.ops.fmha import memory_efficient_attention
+    from xformers.ops.fmha import MemoryEfficientAttentionCutlassOp, memory_efficient_attention
+    from xformers.ops.fmha.attn_bias import (
+        BlockDiagonalCausalMask,
+        BlockDiagonalMask,
+        LowerTriangularMask,
+        LowerTriangularMaskWithTensorBias,
+    )
     HAS_MEM_EFF_ATTN = True
 except ImportError:
     warnings.warn('please install xformers from https://github.com/facebookresearch/xformers')
@@ -16,13 +22,6 @@
     from typing import Optional

     import torch
-    from xformers.ops.fmha import MemoryEfficientAttentionCutlassOp
-    from xformers.ops.fmha.attn_bias import (
-        BlockDiagonalCausalMask,
-        BlockDiagonalMask,
-        LowerTriangularMask,
-        LowerTriangularMaskWithTensorBias,
-    )

     from .utils import SeqLenInfo
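Moving every xformers import into the single try block means a missing package now downgrades the HAS_MEM_EFF_ATTN feature flag instead of raising at the later module-level imports. A minimal sketch of this guarded-import pattern, with a hypothetical optional dependency standing in for xformers:

import warnings

HAS_FAST_ATTN = False
try:
    # Import every symbol from the optional dependency in one place, so one
    # missing package flips the feature flag instead of crashing the module.
    from fast_attn_lib import fast_attention  # hypothetical optional dependency
    HAS_FAST_ATTN = True
except ImportError:
    warnings.warn('please install fast_attn_lib to enable the fast attention path')


def attention(q, k, v):
    if not HAS_FAST_ATTN:
        raise RuntimeError('fast attention unavailable: fast_attn_lib is not installed')
    return fast_attention(q, k, v)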