Merged

Co #75

Changes from all commits · 18 commits
bd1ab98
[gemini] fixed the gemini checkpoint io (#3934)
FrankLeeeee Jun 9, 2023
4110d1f
[workflow] cancel duplicated workflow jobs (#3960)
FrankLeeeee Jun 12, 2023
71fe527
[gemini] fixed the gemini checkpoint io (#3934)
FrankLeeeee Jun 9, 2023
6718a2f
[workflow] cancel duplicated workflow jobs (#3960)
FrankLeeeee Jun 12, 2023
2bf6547
Merge pull request #3967 from ver217/update-develop
FrankLeeeee Jun 12, 2023
8bcad73
[workflow] fixed the directory check in build (#3980)
FrankLeeeee Jun 13, 2023
c9cff7e
[checkpointio] General Checkpointing of Sharded Optimizers (#3984)
Jun 15, 2023
725af3e
[booster] make optimizer argument optional for boost (#3993)
cwher Jun 15, 2023
822c3d4
[checkpointio] sharded optimizer checkpoint for DDP plugin (#4002)
Jun 16, 2023
a5883aa
[test] fixed codefactor format report (#4026)
FrankLeeeee Jun 16, 2023
ca768eb
Merge pull request #4025 from hpcaitech/develop
FrankLeeeee Jun 19, 2023
727c459
[nfc] fix dim not defined and fix typo (#3991)
digger-yu Jun 19, 2023
160c64c
[example] fix bucket size in example of gpt gemini (#4028)
Gy-Lu Jun 19, 2023
a52f620
[format] applied code formatting on changed files in pull request 402…
github-actions[bot] Jun 19, 2023
4a81faa
[devops] fix build on pr ci (#4043)
ver217 Jun 19, 2023
d91ab44
Merge pull request #72 from hpcaitech/main
jamesthesnake Jun 19, 2023
c2402f7
Merge pull request #73 from jamesthesnake/ra
jamesthesnake Jun 19, 2023
98cf97e
Merge pull request #74 from jamesthesnake/l
jamesthesnake Jun 19, 2023
31 changes: 22 additions & 9 deletions .github/workflows/build_on_pr.yml
@@ -41,7 +41,7 @@ jobs:
run: | # branch name may contain slash, we need to replace it with space
export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /")
if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then
[ ! -z "$(ls -A /github/home/testmon_cache/${MAIN_BRANCH})" ] && cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
fi
env:
MAIN_BRANCH: ${{ github.event.master_branch }}
@@ -60,12 +60,15 @@ jobs:
defaults:
run:
shell: bash
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- name: Copy testmon cache
run: | # branch name may contain slash, we need to replace it with space
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
if [ -d "/github/home/testmon_cache/${BASE}" ]; then
[ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ] && mkdir -p /github/home/testmon_cache/_pull && cp -p -r "/github/home/testmon_cache/${BASE}" /github/home/testmon_cache/_pull/${PR_NUMBER}
if [ -d "/github/home/testmon_cache/${BASE}" ] && [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then
mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} && cp -p -r "/github/home/testmon_cache/${BASE}"/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}
fi
env:
PR_NUMBER: ${{ github.event.number }}
@@ -83,6 +86,9 @@ jobs:
changedLibraryFiles: ${{ steps.find-lib-change.outputs.all_changed_files }}
anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }}
runs-on: ubuntu-latest
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v2
with:
@@ -140,6 +146,9 @@ jobs:
defaults:
run:
shell: bash
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- name: Checkout TensorNVMe
uses: actions/checkout@v2
@@ -150,7 +159,9 @@

- name: Restore TensorNVMe Cache
run: |
[ ! -z "$(ls -A /github/home/tensornvme_cache/)" ] && cp -p -r /github/home/tensornvme_cache/* /__w/ColossalAI/ColossalAI/TensorNVMe
if [ -d /github/home/tensornvme_cache ] && [ ! -z "$(ls -A /github/home/tensornvme_cache/)" ]; then
cp -p -r /github/home/tensornvme_cache/* /__w/ColossalAI/ColossalAI/TensorNVMe
fi

- name: Install TensorNVMe
run: |
@@ -173,7 +184,9 @@
if: needs.detect.outputs.anyExtensionFileChanged != 'true'
run: |
# -p flag is required to preserve the file timestamp to avoid ninja rebuild
[ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -p -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
if [ -d /github/home/cuda_ext_cache ] && [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ]; then
cp -p -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
fi

- name: Install Colossal-AI
run: |
@@ -187,8 +200,8 @@

- name: Restore Testmon Cache
run: |
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ]; then
[ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ] && cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/
fi
env:
PR_NUMBER: ${{ github.event.number }}
@@ -264,8 +277,8 @@ jobs:
if: github.event.pull_request.merged == true
run: | # branch name may contain slash, we need to replace it with space
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ]; then
[ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ] && cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
fi
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
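The recurring change in this workflow replaces cache-restore one-liners of the form `[ ! -z "$(ls -A <dir>)" ] && cp ...` with an explicit `if` block. The old one-liner evaluates to a non-zero exit status whenever the cache directory is absent or empty, which can fail the CI step even though skipping the copy is the intended behaviour; the new form only copies when there is something to copy and otherwise exits cleanly. Below is a minimal sketch of the two patterns, assuming a plain bash step; the paths are illustrative stand-ins, not the workflow's real cache locations.

```bash
#!/usr/bin/env bash
# Illustrative paths only; the real workflow uses the /github/home/*_cache directories.
CACHE_DIR=/tmp/example_cache
DEST_DIR=/tmp/example_dest
mkdir -p "$DEST_DIR"

# Old pattern: when $CACHE_DIR is missing or empty, the test fails, so the
# compound command exits non-zero and a CI step ending here is marked failed.
[ ! -z "$(ls -A "$CACHE_DIR" 2>/dev/null)" ] && cp -p -r "$CACHE_DIR"/. "$DEST_DIR"

# New pattern: copy only when the directory exists and is non-empty; the
# if-statement exits 0 either way, so the step succeeds when there is
# nothing to restore. -p preserves timestamps, as the workflow comments note.
if [ -d "$CACHE_DIR" ] && [ ! -z "$(ls -A "$CACHE_DIR")" ]; then
    cp -p -r "$CACHE_DIR"/. "$DEST_DIR"
fi
```

The `concurrency` blocks added throughout these workflow files group runs by `github.head_ref` with `cancel-in-progress: false`, so repeated pushes to the same branch queue behind each other rather than running duplicated jobs in parallel.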
6 changes: 6 additions & 0 deletions .github/workflows/compatiblity_test_on_pr.yml
@@ -12,6 +12,9 @@ jobs:
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v3
- id: set-matrix
@@ -40,6 +43,9 @@ jobs:
image: ${{ matrix.container }}
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
timeout-minutes: 120
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- name: Install dependencies
run: |
6 changes: 6 additions & 0 deletions .github/workflows/doc_check_on_pr.yml
@@ -16,6 +16,9 @@ jobs:
github.event.pull_request.draft == false &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: ubuntu-latest
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v2

@@ -31,6 +34,9 @@ jobs:
github.event.pull_request.draft == false &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: ubuntu-latest
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v2
with:
6 changes: 6 additions & 0 deletions .github/workflows/doc_test_on_pr.yml
@@ -19,6 +19,9 @@ jobs:
outputs:
any_changed: ${{ steps.changed-files.outputs.any_changed }}
changed_files: ${{ steps.changed-files.outputs.all_changed_files }}
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
name: Detect changed example files
steps:
- uses: actions/checkout@v3
@@ -59,6 +62,9 @@ jobs:
defaults:
run:
shell: bash
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- name: Checkout ColossalAI-Documentation
uses: actions/checkout@v2
6 changes: 6 additions & 0 deletions .github/workflows/example_check_on_pr.yml
@@ -20,6 +20,9 @@ jobs:
matrix: ${{ steps.setup-matrix.outputs.matrix }}
anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }}
name: Detect changed example files
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v3
with:
@@ -77,6 +80,9 @@ jobs:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/
timeout-minutes: 10
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v3

6 changes: 3 additions & 3 deletions applications/Chat/evaluate/gpt_evaluate.py
@@ -361,7 +361,7 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
"""
Use chat models(gpt-3.5-turbo or gpt-4) to evaluate one model answer.

Temprature is set to 0 to make the model more deterministic.
Temperature is set to 0 to make the model more deterministic.

Args:
prompt: a dictionary including prompt template, CoT and metrics.
@@ -435,7 +435,7 @@ def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
Use completion model(text-davinci-003) to evaluate one model answer.
Only completion models can return log probabilities.

Temprature is set to 0 to make the model more deterministic.
Temperature is set to 0 to make the model more deterministic.

Args:
prompt: a dictionary including prompt template, CoT and metrics.
@@ -593,7 +593,7 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
"""
Calculate the score from the response returned by gpt-3.5-turbo or gpt-4.
Different from text-davinci-003, this fuction directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
Different from text-davinci-003, this function directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo.

Args:
4 changes: 2 additions & 2 deletions applications/Chat/evaluate/unieval/evaluator.py
@@ -277,7 +277,7 @@ def evaluate(self, data, category):
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]

# Calculate average sentence-level scores for facutal consistency
# Calculate average sentence-level scores for factual consistency
src_list, output_list = [], []
n_sents = [] # the number of sentences in the claim
for i in range(n_data):
@@ -288,7 +288,7 @@ def evaluate(self, data, category):
src_list.append(source)
output_list.append(system_outputs[j])
input_list = add_question(dimension=self.dim, output=output_list, src=src_list, task=self.task)
sent_score = self.scorer.score(input_list, self.task, category, dim)
sent_score = self.scorer.score(input_list, self.task, category, self.dim)

# Get average score for each sample
start_idx = 0
2 changes: 1 addition & 1 deletion applications/Chat/evaluate/unieval/utils.py
@@ -37,7 +37,7 @@ def add_question(dimension, output, src=None, ref=None, context=None, task=None)
src: source input for different NLG tasks. For example, source document for summarization
and dialogue history for dialogue response generation.
output: output text generated by the models
ref: human-annotataed groundtruth
ref: human-annotated groundtruth
context: the context needed to evaluate several specific dimension. For example,
additional factual information when evaluating engagingness and groundedness in dialogues.
"""
4 changes: 2 additions & 2 deletions applications/Chat/tests/test_data.py
@@ -33,7 +33,7 @@ def gather_and_equal(tensor: torch.Tensor) -> bool:


def run_test_data(strategy):
EXPERINCE_BATCH_SIZE = 4
EXPERIENCE_BATCH_SIZE = 4
SAMPLE_BATCH_SIZE = 2

if strategy == 'ddp':
@@ -54,7 +54,7 @@ def run_test_data(strategy):

# experience of all ranks should be the same
for _ in range(2):
data = get_data(EXPERINCE_BATCH_SIZE)
data = get_data(EXPERIENCE_BATCH_SIZE)
assert gather_and_equal(data['input_ids'])
assert gather_and_equal(data['attention_mask'])
experience = experience_maker.make_experience(**data,
57 changes: 40 additions & 17 deletions colossalai/booster/booster.py
@@ -9,6 +9,7 @@
from torch.utils.data import DataLoader

from colossalai.checkpoint_io import GeneralCheckpointIO
from colossalai.interface import ModelWrapper

from .accelerator import Accelerator
from .mixed_precision import MixedPrecision, mixed_precision_factory
@@ -97,10 +98,10 @@ def __init__(self,
def boost(
self,
model: nn.Module,
optimizer: Optimizer,
criterion: Callable = None,
dataloader: DataLoader = None,
lr_scheduler: LRScheduler = None,
optimizer: Optional[Optimizer] = None,
criterion: Optional[Callable] = None,
dataloader: Optional[DataLoader] = None,
lr_scheduler: Optional[LRScheduler] = None,
) -> List[Union[nn.Module, Optimizer, LRScheduler, DataLoader]]:
"""
Boost the model, optimizer, criterion, lr_scheduler, and dataloader.
@@ -165,11 +166,11 @@ def no_sync(self, model: nn.Module) -> contextmanager:
assert self.plugin.support_no_sync, f'The plugin {self.plugin.__class__.__name__} does not support no_sync.'
return self.plugin.no_sync(model)

def load_model(self, model: nn.Module, checkpoint: str, strict: bool = True):
def load_model(self, model: Union[nn.Module, ModelWrapper], checkpoint: str, strict: bool = True):
"""Load model from checkpoint.

Args:
model (nn.Module): A model boosted by Booster.
model (nn.Module or ModelWrapper): A model boosted by Booster.
checkpoint (str): Path to the checkpoint. It must be a local path.
It should be a directory path if the checkpoint is sharded. Otherwise, it should be a file path.
strict (bool, optional): whether to strictly enforce that the keys
@@ -179,24 +180,34 @@ def load_model(self, model: nn.Module, checkpoint: str, strict: bool = True):
self.checkpoint_io.load_model(model, checkpoint, strict)

def save_model(self,
model: nn.Module,
model: Union[nn.Module, ModelWrapper],
checkpoint: str,
prefix: str = None,
shard: bool = False,
size_per_shard: int = 1024):
gather_dtensor: bool = True,
prefix: Optional[str] = None,
size_per_shard: int = 1024,
use_safetensors: bool = False):
"""Save model to checkpoint.

Args:
model (nn.Module): A model boosted by Booster.
model (nn.Module or ModelWrapper): A model boosted by Booster.
checkpoint (str): Path to the checkpoint. It must be a local path.
It is a file path if ``shard=False``. Otherwise, it is a directory path.
prefix (str, optional): A prefix added to parameter and buffer
names to compose the keys in state_dict. Defaults to None.
shard (bool, optional): Whether to save checkpoint a sharded way.
If true, the checkpoint will be a folder. Otherwise, it will be a single file. Defaults to False.
gather_dtensor (bool, optional): whether to gather the distributed tensor to the first device. Default: True.
prefix (str, optional): A prefix added to parameter and buffer
names to compose the keys in state_dict. Defaults to None.
size_per_shard (int, optional): Maximum size of checkpoint shard file in MB. This is useful only when ``shard=True``. Defaults to 1024.
use_safetensors (bool, optional): whether to use safe tensors. Default: False. If set to True, the checkpoint will be saved.
"""
self.checkpoint_io.save_model(model, checkpoint=checkpoint, shard=shard, size_per_shard=size_per_shard)
self.checkpoint_io.save_model(model,
checkpoint=checkpoint,
shard=shard,
gather_dtensor=gather_dtensor,
prefix=prefix,
size_per_shard=size_per_shard,
use_safetensors=use_safetensors)

def load_optimizer(self, optimizer: Optimizer, checkpoint: str):
"""Load optimizer from checkpoint.
@@ -205,22 +216,34 @@ def load_optimizer(self, optimizer: Optimizer, checkpoint: str):
optimizer (Optimizer): An optimizer boosted by Booster.
checkpoint (str): Path to the checkpoint. It must be a local path.
It should be a directory path if the checkpoint is sharded. Otherwise, it should be a file path.
prefix (str, optional): A prefix added to parameter and buffer
names to compose the keys in state_dict. Defaults to None.
size_per_shard (int, optional): Maximum size of checkpoint shard file in MB. This is useful only when ``shard=True``. Defaults to 1024.
"""
self.checkpoint_io.load_optimizer(optimizer, checkpoint)

def save_optimizer(self, optimizer: Optimizer, checkpoint: str, shard: bool = False, size_per_shard: int = 1024):
"""Save optimizer to checkpoint.
Warning: Saving sharded optimizer checkpoint is not supported yet.
def save_optimizer(self,
optimizer: Optimizer,
checkpoint: str,
shard: bool = False,
gather_dtensor: bool = True,
prefix: Optional[str] = None,
size_per_shard: int = 1024):
"""
Save optimizer to checkpoint.

Args:
optimizer (Optimizer): An optimizer boosted by Booster.
checkpoint (str): Path to the checkpoint. It must be a local path.
It is a file path if ``shard=False``. Otherwise, it is a directory path.
shard (bool, optional): Whether to save checkpoint a sharded way.
If true, the checkpoint will be a folder. Otherwise, it will be a single file. Defaults to False.
gather_dtensor (bool): whether to gather the distributed tensor to the first device. Default: True.
prefix (str, optional): A prefix added to parameter and buffer
names to compose the keys in state_dict. Defaults to None.
size_per_shard (int, optional): Maximum size of checkpoint shard file in MB. This is useful only when ``shard=True``. Defaults to 1024.
"""
self.checkpoint_io.save_optimizer(optimizer, checkpoint, shard, size_per_shard)
self.checkpoint_io.save_optimizer(optimizer, checkpoint, shard, gather_dtensor, prefix, size_per_shard)

def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
"""Save lr scheduler to checkpoint.
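Taken together, these `booster.py` changes make `optimizer` (and the other `boost()` arguments) optional, let the checkpoint helpers accept wrapped models (`ModelWrapper`), and thread `gather_dtensor`, `prefix`, and `use_safetensors` through to the checkpoint IO, including the new sharded optimizer checkpoints. A rough usage sketch of the updated interface follows; the plugin, model, and paths are placeholders, and details such as the required distributed launch depend on the plugin actually used.

```python
import colossalai
import torch
import torch.nn as nn
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin

# Assumes the script is launched with torchrun so that DDP can initialize.
colossalai.launch_from_torch(config={})

model = nn.Linear(16, 4)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

booster = Booster(plugin=TorchDDPPlugin())
# `optimizer`, `criterion`, `dataloader` and `lr_scheduler` are now Optional.
model, optimizer, *_ = booster.boost(model, optimizer)

# Sharded model checkpoint: `checkpoint` is a directory when shard=True.
booster.save_model(model,
                   checkpoint="./ckpt/model",
                   shard=True,
                   gather_dtensor=True,
                   prefix=None,
                   size_per_shard=1024,
                   use_safetensors=False)

# Sharded optimizer checkpoint, using the extended save_optimizer signature.
booster.save_optimizer(optimizer,
                       checkpoint="./ckpt/optimizer",
                       shard=True,
                       gather_dtensor=True,
                       prefix=None,
                       size_per_shard=1024)

# Loading mirrors saving; `load_model` also accepts a ModelWrapper.
booster.load_model(model, "./ckpt/model")
booster.load_optimizer(optimizer, "./ckpt/optimizer")
```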
8 changes: 5 additions & 3 deletions colossalai/booster/mixed_precision/fp16_torch.py
@@ -115,10 +115,12 @@ def __init__(self,

def configure(self,
model: nn.Module,
optimizer: Optimizer,
criterion: Callable = None) -> Tuple[nn.Module, OptimizerWrapper, Callable]:
optimizer: Optional[Optimizer] = None,
criterion: Optional[Callable] = None,
) -> Tuple[nn.Module, OptimizerWrapper, Callable]:
model = TorchAMPModule(model)
optimizer = TorchAMPOptimizer(optimizer, **self.torch_amp_kwargs)
if optimizer is not None:
optimizer = TorchAMPOptimizer(optimizer, **self.torch_amp_kwargs)
if criterion is not None:
criterion = TorchAMPModule(criterion)
return model, optimizer, criterion
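With `optimizer` now `Optional` in `configure()`, the FP16 wrapper can be applied to a model alone (for example for inference) and simply passes `None` through instead of trying to build a `TorchAMPOptimizer`. A small sketch is shown below, assuming the `FP16TorchMixedPrecision` class exported from this module; the exact import path is an assumption.

```python
import torch
import torch.nn as nn
from colossalai.booster.mixed_precision import FP16TorchMixedPrecision

mp = FP16TorchMixedPrecision()

# Training-style call: model, optimizer and criterion are all wrapped.
model = nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()
model, optimizer, criterion = mp.configure(model, optimizer, criterion)

# Inference-style call: with the change above, the optimizer can be omitted
# and comes back as None instead of raising inside TorchAMPOptimizer.
eval_model, no_optimizer, _ = mp.configure(nn.Linear(8, 2))
assert no_optimizer is None
```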
7 changes: 4 additions & 3 deletions colossalai/booster/mixed_precision/mixed_precision_base.py
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import Callable, Tuple
from typing import Callable, Optional, Tuple

import torch.nn as nn
from torch.optim import Optimizer
@@ -15,7 +15,8 @@ class MixedPrecision(ABC):
@abstractmethod
def configure(self,
model: nn.Module,
optimizer: Optimizer,
criterion: Callable = None) -> Tuple[nn.Module, OptimizerWrapper, Callable]:
optimizer: Optional[Optimizer] = None,
criterion: Optional[Callable] = None,
) -> Tuple[nn.Module, OptimizerWrapper, Callable]:
# TODO: implement this method
pass