Bestf #167 (Merged)
2 changes: 1 addition & 1 deletion .github/workflows/compatiblity_test_on_pr.yml
@@ -44,7 +44,7 @@ jobs:
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
timeout-minutes: 120
concurrency:
-group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-test
+group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-test-${{ matrix.container }}
cancel-in-progress: true
steps:
- name: Install dependencies
2 changes: 1 addition & 1 deletion .github/workflows/example_check_on_pr.yml
@@ -81,7 +81,7 @@ jobs:
options: --gpus all --rm -v /data/scratch/examples-data:/data/
timeout-minutes: 10
concurrency:
-group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-example
+group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-example-${{ matrix.directory }}
cancel-in-progress: true
steps:
- uses: actions/checkout@v3
32 changes: 32 additions & 0 deletions LICENSE
@@ -396,3 +396,35 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved.
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

---------------- LICENSE FOR VLLM TEAM ----------------

from VLLM TEAM:

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/vllm-project/vllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

---------------- LICENSE FOR LIGHTLLM TEAM ----------------

from LIGHTLLM TEAM:

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/ModelTC/lightllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
2 changes: 1 addition & 1 deletion colossalai/auto_parallel/offload/base_offload_module.py
@@ -4,7 +4,7 @@
import torch
import torch.nn as nn

-from colossalai.nn.parallel.data_parallel import _cast_float
+from colossalai.utils import _cast_float
from colossalai.zero.legacy.gemini.tensor_utils import free_storage

from .region_manager import RegionManager
@@ -1,5 +1,4 @@
class Registry:
# TODO: refactor the registry classes used in colossalai.legacy.registry, colossalai.fx and here

def __init__(self, name):
self.name = name
115 changes: 72 additions & 43 deletions colossalai/booster/booster.py
@@ -1,6 +1,6 @@
import warnings
from contextlib import contextmanager
-from typing import Any, Callable, Iterator, List, Optional, Union
+from typing import Any, Callable, Dict, Iterator, List, Optional, Union

import torch
import torch.nn as nn
@@ -24,29 +24,31 @@ class Booster:
Booster is a high-level API for training neural networks. It provides a unified interface for
training with different precision, accelerator, and plugin.

-Examples:
-```python
-colossalai.launch(...)
-plugin = GeminiPlugin(...)
-booster = Booster(precision='fp16', plugin=plugin)
-
-model = GPT2()
-optimizer = HybridAdam(model.parameters())
-dataloader = Dataloader(Dataset)
-lr_scheduler = LinearWarmupScheduler()
-criterion = GPTLMLoss()
-
-model, optimizer, lr_scheduler, dataloader = booster.boost(model, optimizer, lr_scheduler, dataloader)
-
-for epoch in range(max_epochs):
-for input_ids, attention_mask in dataloader:
-outputs = model(input_ids, attention_mask)
-loss = criterion(outputs.logits, input_ids)
-booster.backward(loss, optimizer)
-optimizer.step()
-lr_scheduler.step()
-optimizer.zero_grad()
-```

+```python
+# Following is pseudocode
+
+colossalai.launch(...)
+plugin = GeminiPlugin(...)
+booster = Booster(precision='fp16', plugin=plugin)
+
+model = GPT2()
+optimizer = HybridAdam(model.parameters())
+dataloader = plugin.prepare_dataloader(train_dataset, batch_size=8)
+lr_scheduler = LinearWarmupScheduler()
+criterion = GPTLMLoss()
+
+model, optimizer, criterion, dataloader, lr_scheduler = booster.boost(model, optimizer, criterion, dataloader, lr_scheduler)
+
+for epoch in range(max_epochs):
+for input_ids, attention_mask in dataloader:
+outputs = model(input_ids.cuda(), attention_mask.cuda())
+loss = criterion(outputs.logits, input_ids)
+booster.backward(loss, optimizer)
+optimizer.step()
+lr_scheduler.step()
+optimizer.zero_grad()
+```

Args:
device (str or torch.device): The device to run the training. Default: None.
@@ -60,7 +62,7 @@ class Booster:

def __init__(self,
device: Optional[str] = None,
-mixed_precision: Union[MixedPrecision, str] = None,
+mixed_precision: Optional[Union[MixedPrecision, str]] = None,
plugin: Optional[Plugin] = None) -> None:
if plugin is not None:
assert isinstance(
@@ -110,14 +112,19 @@ def boost(
lr_scheduler: Optional[LRScheduler] = None,
) -> List[Union[nn.Module, Optimizer, LRScheduler, DataLoader]]:
"""
-Boost the model, optimizer, criterion, lr_scheduler, and dataloader.
+Wrap and inject features into the passed-in model, optimizer, criterion, lr_scheduler, and dataloader.

Args:
-model (nn.Module): The model to be boosted.
-optimizer (Optimizer): The optimizer to be boosted.
-criterion (Callable): The criterion to be boosted.
-dataloader (DataLoader): The dataloader to be boosted.
-lr_scheduler (LRScheduler): The lr_scheduler to be boosted.
+model (nn.Module): Convert model into a wrapped model for distributed training.
+The model might be decorated or partitioned by the plugin's strategy after execution of this method.
+optimizer (Optimizer, optional): Convert optimizer into a wrapped optimizer for distributed training.
+The optimizer's param groups or states might be decorated or partitioned by the plugin's strategy after execution of this method. Defaults to None.
+criterion (Callable, optional): The function that calculates loss. Defaults to None.
+dataloader (DataLoader, optional): The prepared dataloader for training. Defaults to None.
+lr_scheduler (LRScheduler, optional): The learning rate scheduler for training. Defaults to None.
+
+Returns:
+List[Union[nn.Module, Optimizer, LRScheduler, DataLoader]]: The list of boosted input arguments.
"""
# TODO(FrankLeeeee): consider multi-model and multi-optimizer case
# TODO(FrankLeeeee): consider multi-dataloader case
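Since everything after `model` is now optional, a minimal usage sketch may help; the launch call and `TorchDDPPlugin` are assumptions for illustration, not part of this diff:

```python
# Minimal sketch of boost() with the optional arguments omitted.
# Assumption: a distributed environment launched via torchrun and the
# TorchDDPPlugin; treat this as illustrative rather than canonical.
import torch.nn as nn
from torch.optim import Adam

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin

colossalai.launch_from_torch(config={})

model = nn.Linear(16, 2)
optimizer = Adam(model.parameters())

booster = Booster(plugin=TorchDDPPlugin())
# criterion, dataloader, and lr_scheduler were not passed in,
# so their slots in the returned tuple stay None.
model, optimizer, criterion, dataloader, lr_scheduler = booster.boost(model, optimizer)
```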
@@ -138,10 +145,10 @@ def boost(
return model, optimizer, criterion, dataloader, lr_scheduler

def backward(self, loss: torch.Tensor, optimizer: Optimizer) -> None:
-"""Backward pass.
+"""Execute the backward pass during a training step.

Args:
-loss (torch.Tensor): The loss to be backpropagated.
+loss (torch.Tensor): The loss for backpropagation.
optimizer (Optimizer): The optimizer to be updated.
"""
# TODO(frank lee): implement this method with plugin
@@ -153,9 +160,31 @@ def execute_pipeline(self,
criterion: Callable[[Any, Any], torch.Tensor],
optimizer: Optional[Optimizer] = None,
return_loss: bool = True,
-return_outputs: bool = False) -> dict:
-# run pipeline forward backward pass
-# return loss or outputs if needed
+return_outputs: bool = False) -> Dict[str, Any]:
+"""
+Execute forward & backward when utilizing pipeline parallelism.
+Return loss or Huggingface-style model outputs if needed.
+
+Warning: This function is tailored for pipeline-parallel scenarios.
+As a result, please don't do the forward/backward pass in the conventional way (model(input)/loss.backward())
+when doing pipeline-parallel training with the booster, as it will cause unexpected errors.
+
+Args:
+data_iter (Iterator): The iterator for getting the next batch of data. Usually there are two ways to obtain this argument:
+1. wrap the dataloader into an iterator: iter(dataloader)
+2. get the next batch from the dataloader, and wrap this batch into an iterator: iter([batch])
+model (nn.Module): The model to execute forward/backward; it should be a model wrapped by a plugin that supports pipeline.
+criterion (Callable[[Any, Any], torch.Tensor]): Criterion to be used. It should take two arguments: model outputs and inputs, and return a loss tensor.
+'lambda y, x: loss_fn(y)' can turn a normal loss function into a valid two-argument criterion here.
+optimizer (Optimizer, optional): The optimizer for the backward pass. Can be None when only doing forward (i.e. evaluation). Defaults to None.
+return_loss (bool, optional): Whether to return loss in the dict returned by this method. Defaults to True.
+return_outputs (bool, optional): Whether to return Huggingface-style model outputs in the dict returned by this method. Defaults to False.
+
+Returns:
+Dict[str, Any]: Output dict in the form of {'loss': ..., 'outputs': ...}.
+ret_dict['loss'] is the loss of the forward pass if return_loss is set to True, else None.
+ret_dict['outputs'] is the Huggingface-style model outputs from the forward pass if return_outputs is set to True, else None.
+"""
assert isinstance(self.plugin,
PipelinePluginBase), f'The plugin {self.plugin.__class__.__name__} does not support pipeline.'
return self.plugin.execute_pipeline(data_iter, model, criterion, optimizer, return_loss, return_outputs)
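To tie the docstring together, here is a hedged sketch of one training step using the single-batch `iter([batch])` pattern and a two-argument criterion; the Huggingface-style batch layout and loss attribute are assumptions for illustration:

```python
# Sketch of one pipeline-parallel training step with execute_pipeline().
# Assumes model, optimizer, and dataloader were boosted with a
# pipeline-capable plugin and each batch is an HF-style dict whose
# forward outputs expose a .loss attribute.
for batch in dataloader:
    batch = {k: v.cuda() for k, v in batch.items()}
    # Way 2 from the docstring: wrap a single batch into an iterator.
    ret = booster.execute_pipeline(iter([batch]),
                                   model,
                                   criterion=lambda outputs, inputs: outputs.loss,
                                   optimizer=optimizer,
                                   return_loss=True)
    loss = ret['loss']    # may be None except on the last stage (assumption)
    optimizer.step()
    optimizer.zero_grad()
```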
@@ -175,7 +204,7 @@ def no_sync(self, model: nn.Module = None, optimizer: OptimizerWrapper = None) -
assert self.plugin.support_no_sync(), f'The plugin {self.plugin.__class__.__name__} does not support no_sync.'
return self.plugin.no_sync(model, optimizer)

-def load_model(self, model: Union[nn.Module, ModelWrapper], checkpoint: str, strict: bool = True):
+def load_model(self, model: Union[nn.Module, ModelWrapper], checkpoint: str, strict: bool = True) -> None:
"""Load model from checkpoint.

Args:
@@ -195,15 +224,15 @@ def save_model(self,
gather_dtensor: bool = True,
prefix: Optional[str] = None,
size_per_shard: int = 1024,
-use_safetensors: bool = False):
+use_safetensors: bool = False) -> None:
"""Save model to checkpoint.

Args:
model (nn.Module or ModelWrapper): A model boosted by Booster.
checkpoint (str): Path to the checkpoint. It must be a local path.
It is a file path if ``shard=False``. Otherwise, it is a directory path.
shard (bool, optional): Whether to save checkpoint a sharded way.
-If true, the checkpoint will be a folder. Otherwise, it will be a single file. Defaults to False.
+If true, the checkpoint will be a folder in the same format as a Huggingface transformers checkpoint. Otherwise, it will be a single file. Defaults to False.
gather_dtensor (bool, optional): whether to gather the distributed tensor to the first device. Default: True.
prefix (str, optional): A prefix added to parameter and buffer
names to compose the keys in state_dict. Defaults to None.
@@ -218,7 +247,7 @@ def save_model(self,
size_per_shard=size_per_shard,
use_safetensors=use_safetensors)

-def load_optimizer(self, optimizer: Optimizer, checkpoint: str):
+def load_optimizer(self, optimizer: Optimizer, checkpoint: str) -> None:
"""Load optimizer from checkpoint.

Args:
@@ -237,7 +266,7 @@ def save_optimizer(self,
shard: bool = False,
gather_dtensor: bool = True,
prefix: Optional[str] = None,
-size_per_shard: int = 1024):
+size_per_shard: int = 1024) -> None:
"""
Save optimizer to checkpoint.

@@ -254,7 +283,7 @@
"""
self.checkpoint_io.save_optimizer(optimizer, checkpoint, shard, gather_dtensor, prefix, size_per_shard)

-def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
+def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str) -> None:
"""Save lr scheduler to checkpoint.

Args:
Expand All @@ -263,7 +292,7 @@ def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
"""
self.checkpoint_io.save_lr_scheduler(lr_scheduler, checkpoint)

-def load_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
+def load_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str) -> None:
"""Load lr scheduler from checkpoint.

Args:
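Putting the checkpoint methods together, a brief round-trip sketch using only the parameters documented above; the paths are placeholders and `model`/`optimizer` are assumed to be boosted:

```python
# Round-trip sketch of the Booster checkpoint API shown above.
# Sharded save: the checkpoint becomes a directory of shards, each
# capped at size_per_shard (assumed to be in MB), stored as safetensors.
booster.save_model(model, checkpoint='./model_ckpt', shard=True,
                   size_per_shard=1024, use_safetensors=True)
booster.save_optimizer(optimizer, checkpoint='./optim_ckpt', shard=True)

# Loading mirrors torch semantics; strict=True requires an exact key match.
booster.load_model(model, checkpoint='./model_ckpt', strict=True)
booster.load_optimizer(optimizer, checkpoint='./optim_ckpt')
```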
7 changes: 5 additions & 2 deletions colossalai/checkpoint_io/utils.py
@@ -11,8 +11,6 @@
import torch
import torch.nn as nn
from torch.optim import Optimizer
-from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype
-from transformers.modeling_utils import unwrap_model as unwrap_huggingface_model

from colossalai.interface import ModelWrapper, OptimizerWrapper
from colossalai.nn.optimizer import ColossalaiOptimizer
@@ -383,6 +381,11 @@ def save_config_file(model: nn.Module, checkpoint_path: str, is_master: bool = T
checkpoint_path (str): Path to the checkpoint directory.
is_master (bool): Whether current rank is main process.
"""
+try:
+from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype
+from transformers.modeling_utils import unwrap_model as unwrap_huggingface_model
+except ImportError:
+return
if not isinstance(model, PreTrainedModel):
return

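The change above defers the `transformers` import so it becomes an optional dependency. In isolation the pattern looks roughly like this generic sketch (hypothetical helper, not this repo's exact code):

```python
# Generic sketch of the deferred-optional-import pattern used above:
# import inside the function and degrade to a no-op when the package
# is absent. save_config_if_possible is a hypothetical helper.
def save_config_if_possible(model, checkpoint_path: str) -> None:
    try:
        from transformers.modeling_utils import PreTrainedModel
    except ImportError:
        return  # transformers not installed: silently skip config saving

    if not isinstance(model, PreTrainedModel):
        return
    # Assumption: HF configs expose save_pretrained(save_directory).
    model.config.save_pretrained(checkpoint_path)
```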
2 changes: 1 addition & 1 deletion colossalai/cli/benchmark/models.py
@@ -1,6 +1,6 @@
import torch

-import colossalai.nn as col_nn
+import colossalai.legacy.nn as col_nn


class MLP(torch.nn.Module):