Best #153 (Merged)

Commits (19, changes shown from all commits)
9fb0946  Merge pull request #129 from jamesthesnake/best (jamesthesnake, Aug 5, 2023)
b830b6a  Merge pull request #135 from jamesthesnake/best (jamesthesnake, Aug 10, 2023)
cb49f42  Merge pull request #141 from jamesthesnake/best (jamesthesnake, Aug 17, 2023)
2706142  [gemini] improve compatibility and add static placement policy (#4479) (ver217, Aug 24, 2023)
152b0f4  Merge pull request #147 from jamesthesnake/better (jamesthesnake, Aug 24, 2023)
c0efc3e  [format] applied code formatting on changed files in pull request 447… (github-actions[bot], Aug 25, 2023)
839847b  [zero]support zero2 with gradient accumulation (#4511) (Gy-Lu, Aug 25, 2023)
0b00def  [example] add llama2 example (#4527) (ver217, Aug 28, 2023)
12c95a9  fix runtime prepare pass (#4502) (vincentccc, Aug 30, 2023)
8e2e199  [example] update streamlit 0.73.1 to 1.11.1 (#4386) (ChengDaqi2023, Aug 30, 2023)
f1ae8c9  [example] change accelerate version (#4431) (tiansiyuan, Aug 30, 2023)
c7b60f7  [devops] cancel previous runs in the PR (#4546) (ver217, Aug 30, 2023)
cbac782  [zero]fix zero ckptIO with offload (#4529) (Gy-Lu, Sep 1, 2023)
eb952ea  Update Dockerfile (#4499) (data-infra, Sep 1, 2023)
cfa6070  [Fix] Fix compile error (#4357) (HAOCHENYE, Sep 1, 2023)
7298842  Merge pull request #149 from hpcaitech/main (jamesthesnake, Sep 3, 2023)
8592807  Merge pull request #150 from jamesthesnake/jordan (jamesthesnake, Sep 3, 2023)
5a571c3  Merge pull request #151 from jamesthesnake/better (jamesthesnake, Sep 3, 2023)
131e54e  Merge pull request #152 from jamesthesnake/ra (jamesthesnake, Sep 3, 2023)
12 changes: 6 additions & 6 deletions .github/workflows/build_on_pr.yml
@@ -61,8 +61,8 @@ jobs:
       run:
         shell: bash
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - name: Copy testmon cache
         run: | # branch name may contain slash, we need to replace it with space
@@ -87,8 +87,8 @@ jobs:
       anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }}
     runs-on: ubuntu-latest
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - uses: actions/checkout@v2
         with:
@@ -147,8 +147,8 @@ jobs:
       run:
         shell: bash
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - name: Checkout TensorNVMe
         uses: actions/checkout@v2
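The same concurrency fix is applied to every PR workflow below: the group key now combines the workflow name with the PR number (falling back to the git ref for non-PR triggers), and cancel-in-progress: true lets a new push supersede a still-running build. A minimal Python sketch of the key's fallback semantics (the helper name is hypothetical, not part of the repo):

from typing import Optional


def concurrency_group(workflow: str, pr_number: Optional[int], ref: str) -> str:
    # Mirrors ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}:
    # use the PR number when the event carries one, otherwise fall back to the git ref.
    return f"{workflow}-{pr_number if pr_number is not None else ref}"


# Two pushes to the same PR land in the same group, so the older run is cancelled;
# runs from different workflows or different PRs never collide.
assert concurrency_group("Build on PR", 153, "refs/pull/153/merge") == "Build on PR-153"
assert concurrency_group("Build on PR", None, "refs/heads/main") == "Build on PR-refs/heads/main"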
8 changes: 4 additions & 4 deletions .github/workflows/compatiblity_test_on_pr.yml
@@ -13,8 +13,8 @@ jobs:
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - uses: actions/checkout@v3
       - id: set-matrix
@@ -44,8 +44,8 @@ jobs:
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
     timeout-minutes: 120
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - name: Install dependencies
         run: |
8 changes: 4 additions & 4 deletions .github/workflows/doc_check_on_pr.yml
@@ -17,8 +17,8 @@ jobs:
       github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
     runs-on: ubuntu-latest
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - uses: actions/checkout@v2

@@ -35,8 +35,8 @@ jobs:
       github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
     runs-on: ubuntu-latest
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - uses: actions/checkout@v2
         with:
8 changes: 4 additions & 4 deletions .github/workflows/doc_test_on_pr.yml
@@ -20,8 +20,8 @@ jobs:
       any_changed: ${{ steps.changed-files.outputs.any_changed }}
       changed_files: ${{ steps.changed-files.outputs.all_changed_files }}
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     name: Detect changed example files
     steps:
       - uses: actions/checkout@v3
@@ -63,8 +63,8 @@ jobs:
       run:
         shell: bash
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - name: Checkout ColossalAI-Documentation
         uses: actions/checkout@v2
8 changes: 4 additions & 4 deletions .github/workflows/example_check_on_pr.yml
@@ -21,8 +21,8 @@ jobs:
       anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }}
     name: Detect changed example files
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - uses: actions/checkout@v3
         with:
@@ -81,8 +81,8 @@ jobs:
       options: --gpus all --rm -v /data/scratch/examples-data:/data/
     timeout-minutes: 10
     concurrency:
-      group: ${{ github.head_ref }}
-      cancel-in-progress: false
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     steps:
       - uses: actions/checkout@v3
@@ -144,7 +144,7 @@ def size_value_converting_pass(gm: torch.fx.GraphModule, device_mesh: DeviceMesh

     # DeviceMesh information instructs the scaling of the size value
     device_mesh_info = {}
-    for dim, dim_size in enumerate(device_mesh.mesh_shape):
+    for dim, dim_size in enumerate(device_mesh.shape):
         device_mesh_info[dim] = dim_size

     def _extract_target_dim(node):
104 changes: 39 additions & 65 deletions colossalai/booster/plugin/gemini_plugin.py
@@ -1,13 +1,11 @@
 import gc
 import logging
 import os
-import warnings
 from pathlib import Path
-from typing import Callable, Iterator, List, Optional, Tuple, Union
+from typing import Callable, Iterator, List, Optional, Tuple

 import torch
 import torch.nn as nn
-from torch import Tensor
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
 from torch.utils.data import DataLoader
@@ -16,16 +14,14 @@
 from colossalai.checkpoint_io.utils import (
     get_model_base_filenames,
     get_optimizer_base_filenames,
-    get_shard_filename,
     load_shard_state_dict,
-    save_state_dict,
     save_state_dict_shards,
 )
 from colossalai.cluster import DistCoordinator
 from colossalai.interface import ModelWrapper, OptimizerWrapper
 from colossalai.utils import get_current_device
-from colossalai.zero import GeminiDDP, zero_model_wrapper, zero_optim_wrapper
-from colossalai.zero.gemini import ZeroOptimizer
+from colossalai.zero import GeminiDDP, GeminiOptimizer
 from colossalai.zero.gemini.memory_tracer import MemStats

 from .dp_plugin_base import DPPluginBase
@@ -132,11 +128,7 @@ def save_sharded_optimizer(self, optimizer: Optimizer, checkpoint: Path, gather_
         As there is communication when getting state dict, this must be called on all processes.
         """

-        # If optimizer is wrapped, unwrap it.
-        if isinstance(optimizer, OptimizerWrapper):
-            optimizer = optimizer.unwrap()
-
-        assert isinstance(optimizer, ZeroOptimizer)
+        assert isinstance(optimizer, GeminiOptimizer)

         if os.path.isfile(checkpoint):
             logging.error(f"Provided path ({checkpoint}) should be a directory, not a file")
@@ -183,11 +175,7 @@ def load_sharded_optimizer(self, optimizer: Optimizer, checkpoint_index_file: Pa
         if not os.path.isfile(checkpoint_index_file):
             logging.error(f"Provided path ({checkpoint_index_file}) should be a file")

-        # If optimizer is wrapped, unwrap it.
-        if isinstance(optimizer, OptimizerWrapper):
-            optimizer = optimizer.unwrap()
-
-        assert isinstance(optimizer, ZeroOptimizer)
+        assert isinstance(optimizer, GeminiOptimizer)

         # Read checkpoint index file.
         ckpt_index_file = CheckpointIndexFile.from_file(checkpoint_index_file)
@@ -220,47 +208,6 @@ def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
         super().save_lr_scheduler(lr_scheduler, checkpoint)


-class GeminiModel(ModelWrapper):
-
-    def __init__(self, module: nn.Module, gemini_config: dict, verbose: bool = False) -> None:
-        super().__init__(module)
-        self.module = zero_model_wrapper(module, zero_stage=3, gemini_config=gemini_config, verbose=verbose)
-
-    def unwrap(self):
-        # as save/load state dict is coupled with the GeminiDDP, we only return GeminiDDP model
-        return self.module
-
-
-class GeminiOptimizer(OptimizerWrapper):
-
-    def __init__(self,
-                 module: GeminiDDP,
-                 optimizer: Optimizer,
-                 zero_optim_config: dict,
-                 optim_kwargs: dict,
-                 verbose: bool = False) -> None:
-        optimizer = zero_optim_wrapper(module,
-                                       optimizer,
-                                       optim_config=zero_optim_config,
-                                       **optim_kwargs,
-                                       verbose=verbose)
-        super().__init__(optimizer)
-
-    def backward(self, loss: Tensor, *args, **kwargs):
-        self.optim.backward(loss)
-
-    def clip_grad_by_norm(self,
-                          max_norm: Union[float, int],
-                          norm_type: Union[float, int] = 2,
-                          error_if_nonfinite: bool = False,
-                          *args,
-                          **kwargs) -> Tensor:
-        warnings.warn(f'Gemini controls grad clipping by itself, so you should not use clip_grad_by_norm')
-
-    def clip_grad_by_value(self, clip_value: float, *args, **kwargs) -> None:
-        raise NotImplementedError('Gemini does not support clip_grad_by_value')
-
-
 class GeminiPlugin(DPPluginBase):
     """
     Plugin for Gemini.
@@ -277,8 +224,20 @@ class GeminiPlugin(DPPluginBase):
     >>> model, optimizer, train_dataloader, criterion = booster.boost(model, optimizer, train_dataloader, criterion)

     Args:
-        device (torch.device): device to place the model.
-        placement_policy (str, optional): "cpu", "cuda", "auto". Defaults to "cpu".
+        chunk_config_dict (dict, optional): chunk configuration dictionary.
+        chunk_init_device (torch.device, optional): device to initialize the chunk.
+        placement_policy (str, optional): "static" and "auto". Defaults to "static".
+        shard_param_frac (float, optional): fraction of parameters to be sharded. Only for "static" placement.
+            If `shard_param_frac` is 1.0, it's equal to zero-3. If `shard_param_frac` is 0.0, it's equal to zero-2. Defaults to 1.0.
+        offload_optim_frac (float, optional): fraction of optimizer states to be offloaded. Only for "static" placement.
+            If `shard_param_frac` is 1.0 and `offload_optim_frac` is 0.0, it's equal to old "cuda" placement. Defaults to 0.0.
+        offload_param_frac (float, optional): fraction of parameters to be offloaded. Only for "static" placement.
+            For efficiency, this argument is useful only when `shard_param_frac` is 1.0 and `offload_optim_frac` is 1.0.
+            If `shard_param_frac` is 1.0, `offload_optim_frac` is 1.0 and `offload_param_frac` is 1.0, it's equal to old "cpu" placement.
+            When using static placement, we recommend users to tune `shard_param_frac` first and then `offload_optim_frac`.
+            Defaults to 0.0.
+        warmup_non_model_data_ratio (float, optional): ratio of expected non-model data memory during warmup. Only for "auto" placement. Defaults to 0.8.
+        steady_cuda_cap_ratio (float, optional): ratio of allowed cuda capacity for model data during steady state. Only for "auto" placement. Defaults to 0.9.
         precision (str, optional): precision. Support 'fp16' and 'bf16'. Defaults to 'fp16'.
         pin_memory (bool, optional): use pin memory on CPU. Defaults to False.
         force_outputs_fp32 (bool, optional): force outputs are fp32. Defaults to False.
@@ -310,8 +269,14 @@ class GeminiPlugin(DPPluginBase):

     def __init__(
         self,
-        device: Optional[torch.device] = None,
-        placement_policy: str = "cpu",
+        chunk_config_dict: Optional[dict] = None,
+        chunk_init_device: Optional[torch.device] = None,
+        placement_policy: str = "static",
+        shard_param_frac: float = 1.0,  # only for static placement
+        offload_optim_frac: float = 0.0,  # only for static placement
+        offload_param_frac: float = 0.0,  # only for static placement
+        warmup_non_model_data_ratio: float = 0.8,  # only for auto placement
+        steady_cuda_cap_ratio: float = 0.9,  # only for auto placement
         precision: str = "fp16",
         pin_memory: bool = False,
         force_outputs_fp32: bool = False,
@@ -335,8 +300,14 @@ def __init__(
         super().__init__()
         assert precision in SUPPORTED_PRECISION, f'precision {precision} is not supported'
         self.gemini_config = dict(
-            device=(device or get_current_device()),
+            chunk_config_dict=chunk_config_dict,
+            chunk_init_device=(chunk_init_device or get_current_device()),
             placement_policy=placement_policy,
+            shard_param_frac=shard_param_frac,
+            offload_optim_frac=offload_optim_frac,
+            offload_param_frac=offload_param_frac,
+            warmup_non_model_data_ratio=warmup_non_model_data_ratio,
+            steady_cuda_cap_ratio=steady_cuda_cap_ratio,
             pin_memory=pin_memory,
             force_outputs_fp32=force_outputs_fp32,
             strict_ddp_mode=strict_ddp_mode,
@@ -393,12 +364,15 @@ def configure(
             # model = nn.SyncBatchNorm.convert_sync_batchnorm(model, None)

             # wrap the model with Gemini
-            model = GeminiModel(model, self.gemini_config, self.verbose)
+            model = GeminiDDP(model, **self.gemini_config, verbose=self.verbose)

         if optimizer is not None and \
                 not isinstance(optimizer, OptimizerWrapper):
-            optimizer = GeminiOptimizer(model.unwrap(), optimizer, self.zero_optim_config, self.optim_kwargs,
-                                        self.verbose)
+            optimizer = GeminiOptimizer(optimizer,
+                                        model.unwrap(),
+                                        **self.zero_optim_config,
+                                        **self.optim_kwargs,
+                                        verbose=self.verbose)

         return model, optimizer, criterion, dataloader, lr_scheduler
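A minimal usage sketch of the reworked plugin, assuming a distributed environment has already been launched (e.g. via colossalai.launch_from_torch); the model and optimizer here are placeholders:

import torch
import torch.nn as nn

from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin

# Static placement: shard_param_frac=1.0 behaves like ZeRO-3 and 0.0 like ZeRO-2;
# the offload fractions trade GPU memory for CPU memory, recovering the old
# "cuda" and "cpu" placements at their extremes.
plugin = GeminiPlugin(
    placement_policy="static",
    shard_param_frac=1.0,
    offload_optim_frac=0.0,
    offload_param_frac=0.0,
    precision="fp16",
)
booster = Booster(plugin=plugin)

model = nn.Linear(1024, 1024)  # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
model, optimizer, *_ = booster.boost(model, optimizer)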
4 changes: 4 additions & 0 deletions colossalai/cli/launcher/run.py
@@ -265,6 +265,10 @@ def launch_multi_processes(args: Config) -> None:
     # establish remote connection
     runner.connect(host_info_list=active_device_pool, workdir=curr_path, env=env)

+    # overwrite master addr when num_nodes > 1 and not specified
+    if len(active_device_pool) > 1 and args.master_addr == "127.0.0.1":
+        args.master_addr = active_device_pool.hostinfo_list[0].hostname
+
     # execute distributed launching command
     for node_id, hostinfo in enumerate(active_device_pool):
         cmd = get_launch_command(master_addr=args.master_addr,
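The new check follows a common multi-node launcher pattern: the loopback default is only replaced when more than one node participates, since 127.0.0.1 is unreachable from the other hosts. A standalone sketch of that decision (helper name hypothetical):

from typing import List


def resolve_master_addr(master_addr: str, node_hostnames: List[str]) -> str:
    # Keep any explicitly supplied address; only the 127.0.0.1 default is
    # replaced, and only when the launch actually spans multiple nodes.
    if len(node_hostnames) > 1 and master_addr == "127.0.0.1":
        return node_hostnames[0]
    return master_addr


assert resolve_master_addr("127.0.0.1", ["node1", "node2"]) == "node1"
assert resolve_master_addr("10.0.0.5", ["node1", "node2"]) == "10.0.0.5"
assert resolve_master_addr("127.0.0.1", ["node1"]) == "127.0.0.1"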
15 changes: 7 additions & 8 deletions colossalai/kernel/cuda_native/mha/mem_eff_attn.py
@@ -2,7 +2,13 @@

 HAS_MEM_EFF_ATTN = False
 try:
-    from xformers.ops.fmha import memory_efficient_attention
+    from xformers.ops.fmha import MemoryEfficientAttentionCutlassOp, memory_efficient_attention
+    from xformers.ops.fmha.attn_bias import (
+        BlockDiagonalCausalMask,
+        BlockDiagonalMask,
+        LowerTriangularMask,
+        LowerTriangularMaskWithTensorBias,
+    )
     HAS_MEM_EFF_ATTN = True
 except ImportError:
     warnings.warn('please install xformers from https://github.com/facebookresearch/xformers')
@@ -16,13 +22,6 @@
     from typing import Optional

     import torch
-    from xformers.ops.fmha import MemoryEfficientAttentionCutlassOp
-    from xformers.ops.fmha.attn_bias import (
-        BlockDiagonalCausalMask,
-        BlockDiagonalMask,
-        LowerTriangularMask,
-        LowerTriangularMaskWithTensorBias,
-    )

     from .utils import SeqLenInfo
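Moving every xformers import into the single try block means a missing package now downgrades the HAS_MEM_EFF_ATTN feature flag instead of raising at the later module-level imports. A minimal sketch of this guarded-import pattern, with a hypothetical optional dependency standing in for xformers:

import warnings

HAS_FAST_ATTN = False
try:
    # Import every symbol from the optional dependency in one place, so one
    # missing package flips the feature flag instead of crashing the module.
    from fast_attn_lib import fast_attention  # hypothetical optional dependency
    HAS_FAST_ATTN = True
except ImportError:
    warnings.warn('please install fast_attn_lib to enable the fast attention path')


def attention(q, k, v):
    if not HAS_FAST_ATTN:
        raise RuntimeError('fast attention unavailable: fast_attn_lib is not installed')
    return fast_attention(q, k, v)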