Merged

Co #75

Changes from all commits · 18 commits
bd1ab98
[gemini] fixed the gemini checkpoint io (#3934)
FrankLeeeee Jun 9, 2023
4110d1f
[workflow] cancel duplicated workflow jobs (#3960)
FrankLeeeee Jun 12, 2023
71fe527
[gemini] fixed the gemini checkpoint io (#3934)
FrankLeeeee Jun 9, 2023
6718a2f
[workflow] cancel duplicated workflow jobs (#3960)
FrankLeeeee Jun 12, 2023
2bf6547
Merge pull request #3967 from ver217/update-develop
FrankLeeeee Jun 12, 2023
8bcad73
[workflow] fixed the directory check in build (#3980)
FrankLeeeee Jun 13, 2023
c9cff7e
[checkpointio] General Checkpointing of Sharded Optimizers (#3984)
Jun 15, 2023
725af3e
[booster] make optimizer argument optional for boost (#3993)
cwher Jun 15, 2023
822c3d4
[checkpointio] sharded optimizer checkpoint for DDP plugin (#4002)
Jun 16, 2023
a5883aa
[test] fixed codefactor format report (#4026)
FrankLeeeee Jun 16, 2023
ca768eb
Merge pull request #4025 from hpcaitech/develop
FrankLeeeee Jun 19, 2023
727c459
[nfc] fix dim not defined and fix typo (#3991)
digger-yu Jun 19, 2023
160c64c
[example] fix bucket size in example of gpt gemini (#4028)
Gy-Lu Jun 19, 2023
a52f620
[format] applied code formatting on changed files in pull request 402…
github-actions[bot] Jun 19, 2023
4a81faa
[devops] fix build on pr ci (#4043)
ver217 Jun 19, 2023
d91ab44
Merge pull request #72 from hpcaitech/main
jamesthesnake Jun 19, 2023
c2402f7
Merge pull request #73 from jamesthesnake/ra
jamesthesnake Jun 19, 2023
98cf97e
Merge pull request #74 from jamesthesnake/l
jamesthesnake Jun 19, 2023
31 changes: 22 additions & 9 deletions .github/workflows/build_on_pr.yml
@@ -41,7 +41,7 @@ jobs:
run: | # branch name may contain slash, we need to replace it with space
export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /")
if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then
[ ! -z "$(ls -A /github/home/testmon_cache/${MAIN_BRANCH})" ] && cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
fi
env:
MAIN_BRANCH: ${{ github.event.master_branch }}
@@ -60,12 +60,15 @@ jobs:
defaults:
run:
shell: bash
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- name: Copy testmon cache
run: | # branch name may contain slash, we need to replace it with space
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
if [ -d "/github/home/testmon_cache/${BASE}" ]; then
[ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ] && mkdir -p /github/home/testmon_cache/_pull && cp -p -r "/github/home/testmon_cache/${BASE}" /github/home/testmon_cache/_pull/${PR_NUMBER}
if [ -d "/github/home/testmon_cache/${BASE}" ] && [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then
mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} && cp -p -r "/github/home/testmon_cache/${BASE}"/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}
fi
env:
PR_NUMBER: ${{ github.event.number }}
@@ -83,6 +86,9 @@ jobs:
changedLibraryFiles: ${{ steps.find-lib-change.outputs.all_changed_files }}
anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }}
runs-on: ubuntu-latest
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v2
with:
@@ -140,6 +146,9 @@ jobs:
defaults:
run:
shell: bash
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- name: Checkout TensorNVMe
uses: actions/checkout@v2
@@ -150,7 +159,9 @@

- name: Restore TensorNVMe Cache
run: |
[ ! -z "$(ls -A /github/home/tensornvme_cache/)" ] && cp -p -r /github/home/tensornvme_cache/* /__w/ColossalAI/ColossalAI/TensorNVMe
if [ -d /github/home/tensornvme_cache ] && [ ! -z "$(ls -A /github/home/tensornvme_cache/)" ]; then
cp -p -r /github/home/tensornvme_cache/* /__w/ColossalAI/ColossalAI/TensorNVMe
fi

- name: Install TensorNVMe
run: |
@@ -173,7 +184,9 @@
if: needs.detect.outputs.anyExtensionFileChanged != 'true'
run: |
# -p flag is required to preserve the file timestamp to avoid ninja rebuild
[ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -p -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
if [ -d /github/home/cuda_ext_cache ] && [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ]; then
cp -p -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
fi

- name: Install Colossal-AI
run: |
@@ -187,8 +200,8 @@

- name: Restore Testmon Cache
run: |
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ]; then
[ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ] && cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/
fi
env:
PR_NUMBER: ${{ github.event.number }}
@@ -264,8 +277,8 @@ jobs:
if: github.event.pull_request.merged == true
run: | # branch name may contain slash, we need to replace it with space
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ]; then
[ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ] && cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
fi
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
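The recurring change in this workflow replaces cache-restore one-liners of the form `[ ! -z "$(ls -A <dir>)" ] && cp ...` with an explicit `if` block. The old one-liner evaluates to a non-zero exit status whenever the cache directory is absent or empty, which can fail the CI step even though skipping the copy is the intended behaviour; the new form only copies when there is something to copy and otherwise exits cleanly. Below is a minimal sketch of the two patterns, assuming a plain bash step; the paths are illustrative stand-ins, not the workflow's real cache locations.

```bash
#!/usr/bin/env bash
# Illustrative paths only; the real workflow uses the /github/home/*_cache directories.
CACHE_DIR=/tmp/example_cache
DEST_DIR=/tmp/example_dest
mkdir -p "$DEST_DIR"

# Old pattern: when $CACHE_DIR is missing or empty, the test fails, so the
# compound command exits non-zero and a CI step ending here is marked failed.
[ ! -z "$(ls -A "$CACHE_DIR" 2>/dev/null)" ] && cp -p -r "$CACHE_DIR"/. "$DEST_DIR"

# New pattern: copy only when the directory exists and is non-empty; the
# if-statement exits 0 either way, so the step succeeds when there is
# nothing to restore. -p preserves timestamps, as the workflow comments note.
if [ -d "$CACHE_DIR" ] && [ ! -z "$(ls -A "$CACHE_DIR")" ]; then
    cp -p -r "$CACHE_DIR"/. "$DEST_DIR"
fi
```

The `concurrency` blocks added throughout these workflow files group runs by `github.head_ref` with `cancel-in-progress: false`, so repeated pushes to the same branch queue behind each other rather than running duplicated jobs in parallel.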
6 changes: 6 additions & 0 deletions .github/workflows/compatiblity_test_on_pr.yml
@@ -12,6 +12,9 @@ jobs:
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v3
- id: set-matrix
@@ -40,6 +43,9 @@ jobs:
image: ${{ matrix.container }}
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
timeout-minutes: 120
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- name: Install dependencies
run: |
6 changes: 6 additions & 0 deletions .github/workflows/doc_check_on_pr.yml
@@ -16,6 +16,9 @@ jobs:
github.event.pull_request.draft == false &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: ubuntu-latest
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v2

@@ -31,6 +34,9 @@ jobs:
github.event.pull_request.draft == false &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: ubuntu-latest
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v2
with:
6 changes: 6 additions & 0 deletions .github/workflows/doc_test_on_pr.yml
@@ -19,6 +19,9 @@ jobs:
outputs:
any_changed: ${{ steps.changed-files.outputs.any_changed }}
changed_files: ${{ steps.changed-files.outputs.all_changed_files }}
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
name: Detect changed example files
steps:
- uses: actions/checkout@v3
@@ -59,6 +62,9 @@ jobs:
defaults:
run:
shell: bash
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- name: Checkout ColossalAI-Documentation
uses: actions/checkout@v2
6 changes: 6 additions & 0 deletions .github/workflows/example_check_on_pr.yml
@@ -20,6 +20,9 @@ jobs:
matrix: ${{ steps.setup-matrix.outputs.matrix }}
anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }}
name: Detect changed example files
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v3
with:
@@ -77,6 +80,9 @@ jobs:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/
timeout-minutes: 10
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v3

6 changes: 3 additions & 3 deletions applications/Chat/evaluate/gpt_evaluate.py
@@ -361,7 +361,7 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
"""
Use chat models(gpt-3.5-turbo or gpt-4) to evaluate one model answer.

Temprature is set to 0 to make the model more deterministic.
Temperature is set to 0 to make the model more deterministic.

Args:
prompt: a dictionary including prompt template, CoT and metrics.
@@ -435,7 +435,7 @@ def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
Use completion model(text-davinci-003) to evaluate one model answer.
Only completion models can return log probabilities.

Temprature is set to 0 to make the model more deterministic.
Temperature is set to 0 to make the model more deterministic.

Args:
prompt: a dictionary including prompt template, CoT and metrics.
@@ -593,7 +593,7 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
"""
Calculate the score from the response returned by gpt-3.5-turbo or gpt-4.
Different from text-davinci-003, this fuction directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
Different from text-davinci-003, this function directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo.

Args:
4 changes: 2 additions & 2 deletions applications/Chat/evaluate/unieval/evaluator.py
@@ -277,7 +277,7 @@ def evaluate(self, data, category):
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]

# Calculate average sentence-level scores for facutal consistency
# Calculate average sentence-level scores for factual consistency
src_list, output_list = [], []
n_sents = [] # the number of sentences in the claim
for i in range(n_data):
@@ -288,7 +288,7 @@ def evaluate(self, data, category):
src_list.append(source)
output_list.append(system_outputs[j])
input_list = add_question(dimension=self.dim, output=output_list, src=src_list, task=self.task)
sent_score = self.scorer.score(input_list, self.task, category, dim)
sent_score = self.scorer.score(input_list, self.task, category, self.dim)

# Get average score for each sample
start_idx = 0
2 changes: 1 addition & 1 deletion applications/Chat/evaluate/unieval/utils.py
@@ -37,7 +37,7 @@ def add_question(dimension, output, src=None, ref=None, context=None, task=None)
src: source input for different NLG tasks. For example, source document for summarization
and dialogue history for dialogue response generation.
output: output text generated by the models
ref: human-annotataed groundtruth
ref: human-annotated groundtruth
context: the context needed to evaluate several specific dimension. For example,
additional factual information when evaluating engagingness and groundedness in dialogues.
"""
4 changes: 2 additions & 2 deletions applications/Chat/tests/test_data.py
@@ -33,7 +33,7 @@ def gather_and_equal(tensor: torch.Tensor) -> bool:


def run_test_data(strategy):
EXPERINCE_BATCH_SIZE = 4
EXPERIENCE_BATCH_SIZE = 4
SAMPLE_BATCH_SIZE = 2

if strategy == 'ddp':
@@ -54,7 +54,7 @@ def run_test_data(strategy):

# experience of all ranks should be the same
for _ in range(2):
data = get_data(EXPERINCE_BATCH_SIZE)
data = get_data(EXPERIENCE_BATCH_SIZE)
assert gather_and_equal(data['input_ids'])
assert gather_and_equal(data['attention_mask'])
experience = experience_maker.make_experience(**data,
57 changes: 40 additions & 17 deletions colossalai/booster/booster.py
@@ -9,6 +9,7 @@
from torch.utils.data import DataLoader

from colossalai.checkpoint_io import GeneralCheckpointIO
from colossalai.interface import ModelWrapper

from .accelerator import Accelerator
from .mixed_precision import MixedPrecision, mixed_precision_factory
@@ -97,10 +98,10 @@ def __init__(self,
def boost(
self,
model: nn.Module,
optimizer: Optimizer,
criterion: Callable = None,
dataloader: DataLoader = None,
lr_scheduler: LRScheduler = None,
optimizer: Optional[Optimizer] = None,
criterion: Optional[Callable] = None,
dataloader: Optional[DataLoader] = None,
lr_scheduler: Optional[LRScheduler] = None,
) -> List[Union[nn.Module, Optimizer, LRScheduler, DataLoader]]:
"""
Boost the model, optimizer, criterion, lr_scheduler, and dataloader.
@@ -165,11 +166,11 @@ def no_sync(self, model: nn.Module) -> contextmanager:
assert self.plugin.support_no_sync, f'The plugin {self.plugin.__class__.__name__} does not support no_sync.'
return self.plugin.no_sync(model)

def load_model(self, model: nn.Module, checkpoint: str, strict: bool = True):
def load_model(self, model: Union[nn.Module, ModelWrapper], checkpoint: str, strict: bool = True):
"""Load model from checkpoint.

Args:
model (nn.Module): A model boosted by Booster.
model (nn.Module or ModelWrapper): A model boosted by Booster.
checkpoint (str): Path to the checkpoint. It must be a local path.
It should be a directory path if the checkpoint is sharded. Otherwise, it should be a file path.
strict (bool, optional): whether to strictly enforce that the keys
@@ -179,24 +180,34 @@ def load_model(self, model: nn.Module, checkpoint: str, strict: bool = True):
self.checkpoint_io.load_model(model, checkpoint, strict)

def save_model(self,
model: nn.Module,
model: Union[nn.Module, ModelWrapper],
checkpoint: str,
prefix: str = None,
shard: bool = False,
size_per_shard: int = 1024):
gather_dtensor: bool = True,
prefix: Optional[str] = None,
size_per_shard: int = 1024,
use_safetensors: bool = False):
"""Save model to checkpoint.

Args:
model (nn.Module): A model boosted by Booster.
model (nn.Module or ModelWrapper): A model boosted by Booster.
checkpoint (str): Path to the checkpoint. It must be a local path.
It is a file path if ``shard=False``. Otherwise, it is a directory path.
prefix (str, optional): A prefix added to parameter and buffer
names to compose the keys in state_dict. Defaults to None.
shard (bool, optional): Whether to save checkpoint a sharded way.
If true, the checkpoint will be a folder. Otherwise, it will be a single file. Defaults to False.
gather_dtensor (bool, optional): whether to gather the distributed tensor to the first device. Default: True.
prefix (str, optional): A prefix added to parameter and buffer
names to compose the keys in state_dict. Defaults to None.
size_per_shard (int, optional): Maximum size of checkpoint shard file in MB. This is useful only when ``shard=True``. Defaults to 1024.
use_safetensors (bool, optional): whether to use safe tensors. Default: False. If set to True, the checkpoint will be saved.
"""
self.checkpoint_io.save_model(model, checkpoint=checkpoint, shard=shard, size_per_shard=size_per_shard)
self.checkpoint_io.save_model(model,
checkpoint=checkpoint,
shard=shard,
gather_dtensor=gather_dtensor,
prefix=prefix,
size_per_shard=size_per_shard,
use_safetensors=use_safetensors)

def load_optimizer(self, optimizer: Optimizer, checkpoint: str):
"""Load optimizer from checkpoint.
@@ -205,22 +216,34 @@ def load_optimizer(self, optimizer: Optimizer, checkpoint: str):
optimizer (Optimizer): An optimizer boosted by Booster.
checkpoint (str): Path to the checkpoint. It must be a local path.
It should be a directory path if the checkpoint is sharded. Otherwise, it should be a file path.
prefix (str, optional): A prefix added to parameter and buffer
names to compose the keys in state_dict. Defaults to None.
size_per_shard (int, optional): Maximum size of checkpoint shard file in MB. This is useful only when ``shard=True``. Defaults to 1024.
"""
self.checkpoint_io.load_optimizer(optimizer, checkpoint)

def save_optimizer(self, optimizer: Optimizer, checkpoint: str, shard: bool = False, size_per_shard: int = 1024):
"""Save optimizer to checkpoint.
Warning: Saving sharded optimizer checkpoint is not supported yet.
def save_optimizer(self,
optimizer: Optimizer,
checkpoint: str,
shard: bool = False,
gather_dtensor: bool = True,
prefix: Optional[str] = None,
size_per_shard: int = 1024):
"""
Save optimizer to checkpoint.

Args:
optimizer (Optimizer): An optimizer boosted by Booster.
checkpoint (str): Path to the checkpoint. It must be a local path.
It is a file path if ``shard=False``. Otherwise, it is a directory path.
shard (bool, optional): Whether to save checkpoint a sharded way.
If true, the checkpoint will be a folder. Otherwise, it will be a single file. Defaults to False.
gather_dtensor (bool): whether to gather the distributed tensor to the first device. Default: True.
prefix (str, optional): A prefix added to parameter and buffer
names to compose the keys in state_dict. Defaults to None.
size_per_shard (int, optional): Maximum size of checkpoint shard file in MB. This is useful only when ``shard=True``. Defaults to 1024.
"""
self.checkpoint_io.save_optimizer(optimizer, checkpoint, shard, size_per_shard)
self.checkpoint_io.save_optimizer(optimizer, checkpoint, shard, gather_dtensor, prefix, size_per_shard)

def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
"""Save lr scheduler to checkpoint.
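Taken together, these `booster.py` changes make `optimizer` (and the other `boost()` arguments) optional, let the checkpoint helpers accept wrapped models (`ModelWrapper`), and thread `gather_dtensor`, `prefix`, and `use_safetensors` through to the checkpoint IO, including the new sharded optimizer checkpoints. A rough usage sketch of the updated interface follows; the plugin, model, and paths are placeholders, and details such as the required distributed launch depend on the plugin actually used.

```python
import colossalai
import torch
import torch.nn as nn
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin

# Assumes the script is launched with torchrun so that DDP can initialize.
colossalai.launch_from_torch(config={})

model = nn.Linear(16, 4)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

booster = Booster(plugin=TorchDDPPlugin())
# `optimizer`, `criterion`, `dataloader` and `lr_scheduler` are now Optional.
model, optimizer, *_ = booster.boost(model, optimizer)

# Sharded model checkpoint: `checkpoint` is a directory when shard=True.
booster.save_model(model,
                   checkpoint="./ckpt/model",
                   shard=True,
                   gather_dtensor=True,
                   prefix=None,
                   size_per_shard=1024,
                   use_safetensors=False)

# Sharded optimizer checkpoint, using the extended save_optimizer signature.
booster.save_optimizer(optimizer,
                       checkpoint="./ckpt/optimizer",
                       shard=True,
                       gather_dtensor=True,
                       prefix=None,
                       size_per_shard=1024)

# Loading mirrors saving; `load_model` also accepts a ModelWrapper.
booster.load_model(model, "./ckpt/model")
booster.load_optimizer(optimizer, "./ckpt/optimizer")
```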
8 changes: 5 additions & 3 deletions colossalai/booster/mixed_precision/fp16_torch.py
@@ -115,10 +115,12 @@ def __init__(self,

def configure(self,
model: nn.Module,
optimizer: Optimizer,
criterion: Callable = None) -> Tuple[nn.Module, OptimizerWrapper, Callable]:
optimizer: Optional[Optimizer] = None,
criterion: Optional[Callable] = None,
) -> Tuple[nn.Module, OptimizerWrapper, Callable]:
model = TorchAMPModule(model)
optimizer = TorchAMPOptimizer(optimizer, **self.torch_amp_kwargs)
if optimizer is not None:
optimizer = TorchAMPOptimizer(optimizer, **self.torch_amp_kwargs)
if criterion is not None:
criterion = TorchAMPModule(criterion)
return model, optimizer, criterion
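With `optimizer` now `Optional` in `configure()`, the FP16 wrapper can be applied to a model alone (for example for inference) and simply passes `None` through instead of trying to build a `TorchAMPOptimizer`. A small sketch is shown below, assuming the `FP16TorchMixedPrecision` class exported from this module; the exact import path is an assumption.

```python
import torch
import torch.nn as nn
from colossalai.booster.mixed_precision import FP16TorchMixedPrecision

mp = FP16TorchMixedPrecision()

# Training-style call: model, optimizer and criterion are all wrapped.
model = nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()
model, optimizer, criterion = mp.configure(model, optimizer, criterion)

# Inference-style call: with the change above, the optimizer can be omitted
# and comes back as None instead of raising inside TorchAMPOptimizer.
eval_model, no_optimizer, _ = mp.configure(nn.Linear(8, 2))
assert no_optimizer is None
```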
7 changes: 4 additions & 3 deletions colossalai/booster/mixed_precision/mixed_precision_base.py
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import Callable, Tuple
from typing import Callable, Optional, Tuple

import torch.nn as nn
from torch.optim import Optimizer
@@ -15,7 +15,8 @@ class MixedPrecision(ABC):
@abstractmethod
def configure(self,
model: nn.Module,
optimizer: Optimizer,
criterion: Callable = None) -> Tuple[nn.Module, OptimizerWrapper, Callable]:
optimizer: Optional[Optimizer] = None,
criterion: Optional[Callable] = None,
) -> Tuple[nn.Module, OptimizerWrapper, Callable]:
# TODO: implement this method
pass