Skip to content
Merged

j #161

Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
424629f
[shardformer/sequence parallel] Cherry pick commit to new branch (#4450)
FoolPlayer Aug 16, 2023
6ef33f7
[shardformer] support DDP in HybridPlugin/add tp+dp tests (#4446)
Aug 16, 2023
26e29d5
[devops] add large-scale distributed test marker (#4452)
ver217 Aug 16, 2023
a78daf6
[shardformer] support interleaved pipeline (#4448)
Gy-Lu Aug 16, 2023
7c8be77
[shardformer/sequence parallel] support gpt2 seq parallel with pp/dp/…
FoolPlayer Aug 18, 2023
0ecd71e
[shardformer] bloom support sequence parallel (#4465)
flybird11111 Aug 18, 2023
a27e0bb
[shardformer] bert support sequence parallel. (#4455)
flybird11111 Aug 18, 2023
8739aa7
[shardformer] Pipeline/whisper (#4456)
CjhHa1 Aug 18, 2023
1c7df56
[shardformer] support tp+zero for shardformer (#4472)
Aug 21, 2023
5545114
rename chatglm to chatglm2 (#4484)
CjhHa1 Aug 22, 2023
351351a
[shardformer/sequence parallel] not support opt of seq-parallel, add …
FoolPlayer Aug 22, 2023
59e252e
[shardformer] chatglm support sequence parallel (#4482)
flybird11111 Aug 22, 2023
e04436a
[shardformer] tests for 3d parallel (#4493)
CjhHa1 Aug 23, 2023
3353e55
[shardformer] vit/llama/t5 ignore the sequence parallelism flag and s…
flybird11111 Aug 24, 2023
de8a65b
[shardformer] opt fix. (#4514)
flybird11111 Aug 25, 2023
44eab2b
[shardformer] support sharded checkpoint IO for models of HybridParal…
Aug 25, 2023
376533a
[shardformer] zero1+pp and the corresponding tests (#4517)
CjhHa1 Aug 28, 2023
c554b7f
[shardformer/fix overlap bug] fix overlap bug, add overlap as an opti…
FoolPlayer Aug 28, 2023
0387a47
[shardformer] fix emerged bugs after updating transformers (#4526)
Aug 29, 2023
e241b74
[shardformer] Add overlap support for gpt2 (#4535)
FoolPlayer Aug 29, 2023
d367b88
[shardformer] fix opt test hanging (#4521)
flybird11111 Aug 30, 2023
ec18fc7
[shardformer] support pp+tp+zero1 tests (#4531)
flybird11111 Aug 30, 2023
2c787d7
[shardformer] fix submodule replacement bug when enabling pp (#4544)
Aug 31, 2023
c9625db
[shardformer] support sharded optimizer checkpointIO of HybridParalle…
Aug 31, 2023
38ccb8b
[shardformer] support from_pretrained when loading model with HybridP…
Sep 1, 2023
508ca36
[pipeline] 1f1b schedule receive microbatch size (#4589)
ver217 Sep 1, 2023
24c0768
[shardformer] Pytree fix (#4533)
CjhHa1 Sep 4, 2023
0a94fcd
[shardformer] update bert finetune example with HybridParallelPlugin …
flybird11111 Sep 4, 2023
e79b1e8
[checkpointio] support huggingface from_pretrained for all plugins (#…
Sep 4, 2023
a39a5c6
Merge branch 'main' into feature/shardformer
ver217 Sep 4, 2023
86d2258
[shardformer] Add overlap optional for HybridParallelPlugin (#4615)
FoolPlayer Sep 5, 2023
ec08668
[shardformer] update shardformer readme (#4617)
flybird11111 Sep 5, 2023
e71d245
[test] ignore gpt2 shardformer test (#4619)
ver217 Sep 5, 2023
807e01a
[zero] hotfix master param sync (#4618)
ver217 Sep 5, 2023
bd18678
[test] fix gemini checkpoint and gpt test (#4620)
ver217 Sep 5, 2023
89fe027
[legacy] move trainer to legacy (#4545)
ver217 Aug 31, 2023
8accecd
[legacy] move engine to legacy (#4560)
ver217 Sep 4, 2023
ac178ca
[legacy] move builder and registry to legacy (#4603)
ver217 Sep 4, 2023
fae6c92
Merge branch 'main' into feature/shardformer
ver217 Sep 5, 2023
efba0f4
Merge pull request #4612 from hpcaitech/feature/shardformer
ver217 Sep 5, 2023
9709b8f
[release] update version (#4623)
ver217 Sep 6, 2023
c3d5fa3
[shardformer] Support customized policy for llamav2 based model with …
eric8607242 Sep 7, 2023
660eed9
[pipeline] set optimizer to optional in execute_pipeline (#4630)
Sep 7, 2023
295b38f
[example] update vit example for hybrid parallel plugin (#4641)
Sep 7, 2023
a686f9d
[devops] fix concurrency group and compatibility test (#4665)
ver217 Sep 8, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/build_on_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ jobs:
run:
shell: bash
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-repare-cache
cancel-in-progress: true
steps:
- name: Copy testmon cache
Expand All @@ -87,7 +87,7 @@ jobs:
anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }}
runs-on: ubuntu-latest
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change
cancel-in-progress: true
steps:
- uses: actions/checkout@v2
Expand Down Expand Up @@ -147,7 +147,7 @@ jobs:
run:
shell: bash
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-test
cancel-in-progress: true
steps:
- name: Checkout TensorNVMe
Expand Down Expand Up @@ -208,7 +208,7 @@ jobs:

- name: Execute Unit Testing
run: |
CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest --testmon --testmon-cov=. --durations=10 tests/
CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/
env:
DATA: /data/scratch/cifar-10
NCCL_SHM_DISABLE: 1
Expand Down
7 changes: 3 additions & 4 deletions .github/workflows/compatiblity_test_on_dispatch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ jobs:
name: Test for PyTorch Compatibility
needs: matrix_preparation
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
runs-on: [self-hosted, 8-gpu]
strategy:
fail-fast: false
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
Expand All @@ -64,7 +64,7 @@ jobs:
- name: Install tensornvme
run: |
cd TensorNVMe
conda install cmake
apt update && apt install -y cmake
pip install -r requirements.txt
pip install -v .
- uses: actions/checkout@v2
Expand All @@ -83,8 +83,7 @@ jobs:
fi
- name: Install Colossal-AI
run: |
pip install -r requirements/requirements.txt
pip install -v --no-cache-dir .
CUDA_EXT=1 pip install -v .
pip install -r requirements/requirements-test.txt
- name: Unit Testing
run: |
Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/compatiblity_test_on_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-prepare-matrix
cancel-in-progress: true
steps:
- uses: actions/checkout@v3
Expand All @@ -35,7 +35,7 @@ jobs:
name: Test for PyTorch Compatibility
needs: matrix_preparation
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
runs-on: [self-hosted, 8-gpu]
strategy:
fail-fast: false
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
Expand All @@ -44,7 +44,7 @@ jobs:
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
timeout-minutes: 120
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-test
cancel-in-progress: true
steps:
- name: Install dependencies
Expand All @@ -58,7 +58,7 @@ jobs:
- name: Install tensornvme
run: |
cd TensorNVMe
conda install cmake
apt update && apt install -y cmake
pip install -r requirements.txt
pip install -v .
- uses: actions/checkout@v2
Expand All @@ -78,7 +78,7 @@ jobs:

- name: Install Colossal-AI
run: |
pip install -v --no-cache-dir .
CUDA_EXT=1 pip install -v .
pip install -r requirements/requirements-test.txt
- name: Unit Testing
run: |
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/compatiblity_test_on_schedule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
name: Test for PyTorch Compatibility
needs: matrix_preparation
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
runs-on: [self-hosted, 8-gpu]
strategy:
fail-fast: false
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
Expand All @@ -54,7 +54,7 @@ jobs:
- name: Install tensornvme
run: |
cd TensorNVMe
conda install cmake
apt update && apt install -y cmake
pip install -r requirements.txt
pip install -v .
- uses: actions/checkout@v2
Expand All @@ -75,7 +75,7 @@ jobs:

- name: Install Colossal-AI
run: |
pip install -v --no-cache-dir .
CUDA_EXT=1 pip install -v .
pip install -r requirements/requirements-test.txt

- name: Unit Testing
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/doc_check_on_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: ubuntu-latest
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-check-i18n
cancel-in-progress: true
steps:
- uses: actions/checkout@v2
Expand All @@ -35,7 +35,7 @@ jobs:
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: ubuntu-latest
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-check-doc
cancel-in-progress: true
steps:
- uses: actions/checkout@v2
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/doc_test_on_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
any_changed: ${{ steps.changed-files.outputs.any_changed }}
changed_files: ${{ steps.changed-files.outputs.all_changed_files }}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change
cancel-in-progress: true
name: Detect changed example files
steps:
Expand Down Expand Up @@ -63,7 +63,7 @@ jobs:
run:
shell: bash
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-doctest
cancel-in-progress: true
steps:
- name: Checkout ColossalAI-Documentation
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/example_check_on_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }}
name: Detect changed example files
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change
cancel-in-progress: true
steps:
- uses: actions/checkout@v3
Expand Down Expand Up @@ -81,7 +81,7 @@ jobs:
options: --gpus all --rm -v /data/scratch/examples-data:/data/
timeout-minutes: 10
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-example
cancel-in-progress: true
steps:
- uses: actions/checkout@v3
Expand Down
79 changes: 38 additions & 41 deletions applications/Chat/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,29 +14,43 @@
from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
SFT_DATASET = [
{
"instruction": "Provide a list of the top 10 most popular mobile games in Asia",
"input": "",
"output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
"id": 0
"instruction":
"Provide a list of the top 10 most popular mobile games in Asia",
"input":
"",
"output":
"The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
"id":
0
},
{
"instruction": "Please provide an action plan for reducing carbon footprint on a corporate level",
"input": "",
"output": "An action plan for reducing carbon footprint on a corporate level could include switching to renewable energy sources (e.g., solar or wind powered), investing in energy efficient infrastructure, implementing green production practices (e.g., reduce water and paper usage, dispose of materials responsibly, recycle/reuse materials), reducing or eliminating travel, encouraging employees to be conscious of and reduce their own energy usage, and incentivizing employees to participate in sustainable work practices.",
"id": 1
"instruction":
"Please provide an action plan for reducing carbon footprint on a corporate level",
"input":
"",
"output":
"An action plan for reducing carbon footprint on a corporate level could include switching to renewable energy sources (e.g., solar or wind powered), investing in energy efficient infrastructure, implementing green production practices (e.g., reduce water and paper usage, dispose of materials responsibly, recycle/reuse materials), reducing or eliminating travel, encouraging employees to be conscious of and reduce their own energy usage, and incentivizing employees to participate in sustainable work practices.",
"id":
1
},
{
"instruction": "Write a persuasive email to your boss explaining why you should have a pay raise",
"input": "",
"output": "Dear [Boss], \n\nI hope this message finds you well. I am writing to request a pay raise.\n\nAs you know, I have been a dedicated and hardworking member of the team since I started working here [insert number] of months/years ago. My enthusiasm and passion for my job has remained consistent over the years, and I have always given 100% to my role. \n\nI understand that the current financial situation is challenging, however, I would sincerely appreciate you taking the time to consider my request. I believe that my dedication to the job and the value that I bring to the organization warrants a raise. I work diligently and am confident that I can continue to be an asset to the company. \n\nI hope my request is taken into account and I thank you in advance for your understanding. I look forward to our conversation. \n\nSincerely,\n[Your Name]",
"id": 2
"instruction":
"Write a persuasive email to your boss explaining why you should have a pay raise",
"input":
"",
"output":
"Dear [Boss], \n\nI hope this message finds you well. I am writing to request a pay raise.\n\nAs you know, I have been a dedicated and hardworking member of the team since I started working here [insert number] of months/years ago. My enthusiasm and passion for my job has remained consistent over the years, and I have always given 100% to my role. \n\nI understand that the current financial situation is challenging, however, I would sincerely appreciate you taking the time to consider my request. I believe that my dedication to the job and the value that I bring to the organization warrants a raise. I work diligently and am confident that I can continue to be an asset to the company. \n\nI hope my request is taken into account and I thank you in advance for your understanding. I look forward to our conversation. \n\nSincerely,\n[Your Name]",
"id":
2
},
]

PROMPT_DATASET = [
{
"instruction": "Edit this paragraph to make it more concise: \"Yesterday, I went to the store and bought some things. Then, I came home and put them away. After that, I went for a walk and met some friends.\"",
"id": 0
"instruction":
"Edit this paragraph to make it more concise: \"Yesterday, I went to the store and bought some things. Then, I came home and put them away. After that, I went for a walk and met some friends.\"",
"id":
0
},
{
"instruction": "Write a descriptive paragraph about a memorable vacation you went on",
Expand Down Expand Up @@ -73,9 +87,7 @@ def make_tokenizer(model: str):
return tokenizer


def check_content(input_ids_stripped: torch.Tensor,
tokenizer: PreTrainedTokenizer,
model: str):
def check_content(input_ids_stripped: torch.Tensor, tokenizer: PreTrainedTokenizer, model: str):
if model == "opt":
# NOTE: Contrary to GPT2, OPT adds the EOS token </s> to the beginning of every prompt.
assert input_ids_stripped[0] == tokenizer.eos_token_id
Expand All @@ -98,13 +110,10 @@ def check_content(input_ids_stripped: torch.Tensor,
assert input_ids_stripped != tokenizer.mask_token_id


@pytest.mark.cpu
@pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama"])
@pytest.mark.parametrize("max_length", [32, 1024])
@pytest.mark.parametrize("max_datasets_size", [2])
def test_prompt_dataset(model: str,
max_datasets_size: int,
max_length: int):
def test_prompt_dataset(model: str, max_datasets_size: int, max_length: int):
with tempfile.TemporaryDirectory() as tmp_dir:
dataset_name = "prompt_dataset.json"
with open(os.path.join(tmp_dir, dataset_name), "w") as f:
Expand All @@ -127,19 +136,12 @@ def test_prompt_dataset(model: str,
check_content(input_ids.masked_select(attention_mask), tokenizer, model)


@pytest.mark.cpu
@pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama"])
@pytest.mark.parametrize(["dataset_path", "subset"], [
("Anthropic/hh-rlhf", "harmless-base"),
("Dahoas/rm-static", None)
])
@pytest.mark.parametrize(["dataset_path", "subset"], [("Anthropic/hh-rlhf", "harmless-base"),
("Dahoas/rm-static", None)])
@pytest.mark.parametrize("max_datasets_size", [32])
@pytest.mark.parametrize("max_length", [32, 1024])
def test_reward_dataset(model: str,
dataset_path: str,
subset: Optional[str],
max_datasets_size: int,
max_length: int):
def test_reward_dataset(model: str, dataset_path: str, subset: Optional[str], max_datasets_size: int, max_length: int):
data = load_dataset(dataset_path, data_dir=subset)
assert max_datasets_size <= len(data["train"]) \
and max_datasets_size <= len(data["test"])
Expand Down Expand Up @@ -196,15 +198,12 @@ def test_reward_dataset(model: str,
assert torch.all(r_mask)


@pytest.mark.cpu

@pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama", "chatglm"])
@pytest.mark.parametrize("dataset_path", ["yizhongw/self_instruct", None])
@pytest.mark.parametrize("max_dataset_size", [2])
@pytest.mark.parametrize("max_length", [32, 1024])
def test_sft_dataset(model: str,
dataset_path: Optional[str],
max_dataset_size: int,
max_length: int):
def test_sft_dataset(model: str, dataset_path: Optional[str], max_dataset_size: int, max_length: int):
tokenizer = make_tokenizer(model)
if dataset_path == "yizhongw/self_instruct":
data = load_dataset(dataset_path, "super_natural_instructions")
Expand Down Expand Up @@ -253,10 +252,7 @@ def test_sft_dataset(model: str,


if __name__ == "__main__":
test_sft_dataset(model="bloom",
dataset_path="yizhongw/self_instruct",
max_dataset_size=2,
max_length=256)
test_sft_dataset(model="bloom", dataset_path="yizhongw/self_instruct", max_dataset_size=2, max_length=256)

test_reward_dataset(model="gpt2",
dataset_path="Anthropic/hh-rlhf",
Expand All @@ -266,4 +262,5 @@ def test_sft_dataset(model: str,

test_prompt_dataset(model="opt",
max_datasets_size=2,
max_length=128)
max_length=128)

Loading