jamesthesnake · jamesthesnake · Sep 8, 2023 · Aug 16, 2023 · Aug 16, 2023 · Aug 16, 2023
diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
@@ -61,7 +61,7 @@ jobs:
       run:
         shell: bash
     concurrency:
-      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-prepare-cache
       cancel-in-progress: true
     steps:
       - name: Copy testmon cache
@@ -87,7 +87,7 @@ jobs:
       anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }}
     runs-on: ubuntu-latest
     concurrency:
-      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change
       cancel-in-progress: true
     steps:
       - uses: actions/checkout@v2
@@ -147,7 +147,7 @@ jobs:
       run:
         shell: bash
     concurrency:
-      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-test
       cancel-in-progress: true
     steps:
       - name: Checkout TensorNVMe
@@ -208,7 +208,7 @@ jobs:
 
       - name: Execute Unit Testing
         run: |
-          CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest --testmon --testmon-cov=. --durations=10 tests/
+          CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/
         env:
           DATA: /data/scratch/cifar-10
           NCCL_SHM_DISABLE: 1

diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml
@@ -44,7 +44,7 @@ jobs:
     name: Test for PyTorch Compatibility
     needs: matrix_preparation
     if: github.repository == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
+    runs-on: [self-hosted, 8-gpu]
     strategy:
       fail-fast: false
       matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
@@ -64,7 +64,7 @@ jobs:
       - name: Install tensornvme
         run: |
           cd TensorNVMe
-          conda install cmake
+          apt update && apt install -y cmake
           pip install -r requirements.txt
           pip install -v .
       - uses: actions/checkout@v2
@@ -83,8 +83,7 @@ jobs:
           fi
       - name: Install Colossal-AI
         run: |
-          pip install -r requirements/requirements.txt
-          pip install -v --no-cache-dir .
+          CUDA_EXT=1 pip install -v .
           pip install -r requirements/requirements-test.txt
       - name: Unit Testing
         run: |

diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml
@@ -13,7 +13,7 @@ jobs:
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     concurrency:
-      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-prepare-matrix
       cancel-in-progress: true
     steps:
       - uses: actions/checkout@v3
@@ -35,7 +35,7 @@ jobs:
     name: Test for PyTorch Compatibility
     needs: matrix_preparation
     if: github.repository == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
+    runs-on: [self-hosted, 8-gpu]
     strategy:
       fail-fast: false
       matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
@@ -44,7 +44,7 @@ jobs:
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
     timeout-minutes: 120
     concurrency:
-      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-test
       cancel-in-progress: true
     steps:
       - name: Install dependencies
@@ -58,7 +58,7 @@ jobs:
       - name: Install tensornvme
         run: |
           cd TensorNVMe
-          conda install cmake
+          apt update && apt install -y cmake
           pip install -r requirements.txt
           pip install -v .
       - uses: actions/checkout@v2
@@ -78,7 +78,7 @@ jobs:
 
       - name: Install Colossal-AI
         run: |
-          pip install -v --no-cache-dir .
+          CUDA_EXT=1 pip install -v .
           pip install -r requirements/requirements-test.txt
       - name: Unit Testing
         run: |

diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml
@@ -32,7 +32,7 @@ jobs:
     name: Test for PyTorch Compatibility
     needs: matrix_preparation
     if: github.repository == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
+    runs-on: [self-hosted, 8-gpu]
     strategy:
       fail-fast: false
       matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
@@ -54,7 +54,7 @@ jobs:
       - name: Install tensornvme
         run: |
           cd TensorNVMe
-          conda install cmake
+          apt update && apt install -y cmake
           pip install -r requirements.txt
           pip install -v .
       - uses: actions/checkout@v2
@@ -75,7 +75,7 @@ jobs:
 
       - name: Install Colossal-AI
         run: |
-          pip install -v --no-cache-dir .
+          CUDA_EXT=1 pip install -v .
           pip install -r requirements/requirements-test.txt
 
       - name: Unit Testing

diff --git a/.github/workflows/doc_check_on_pr.yml b/.github/workflows/doc_check_on_pr.yml
@@ -17,7 +17,7 @@ jobs:
       github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
     runs-on: ubuntu-latest
     concurrency:
-      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-check-i18n
       cancel-in-progress: true
     steps:
       - uses: actions/checkout@v2
@@ -35,7 +35,7 @@ jobs:
       github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
     runs-on: ubuntu-latest
     concurrency:
-      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-check-doc
       cancel-in-progress: true
     steps:
       - uses: actions/checkout@v2

diff --git a/.github/workflows/doc_test_on_pr.yml b/.github/workflows/doc_test_on_pr.yml
@@ -20,7 +20,7 @@ jobs:
       any_changed: ${{ steps.changed-files.outputs.any_changed }}
       changed_files: ${{ steps.changed-files.outputs.all_changed_files }}
     concurrency:
-      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change
       cancel-in-progress: true
     name: Detect changed example files
     steps:
@@ -63,7 +63,7 @@ jobs:
       run:
         shell: bash
     concurrency:
-      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-doctest
       cancel-in-progress: true
     steps:
       - name: Checkout ColossalAI-Documentation

diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml
@@ -21,7 +21,7 @@ jobs:
       anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }}
     name: Detect changed example files
     concurrency:
-      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change
       cancel-in-progress: true
     steps:
       - uses: actions/checkout@v3
@@ -81,7 +81,7 @@ jobs:
       options: --gpus all --rm -v /data/scratch/examples-data:/data/
     timeout-minutes: 10
     concurrency:
-      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-example
       cancel-in-progress: true
     steps:
       - uses: actions/checkout@v3

diff --git a/applications/Chat/tests/test_dataset.py b/applications/Chat/tests/test_dataset.py
@@ -14,29 +14,43 @@
 from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
 SFT_DATASET = [
     {
-        "instruction": "Provide a list of the top 10 most popular mobile games in Asia",
-        "input": "",
-        "output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
-        "id": 0
+        "instruction":
+            "Provide a list of the top 10 most popular mobile games in Asia",
+        "input":
+            "",
+        "output":
+            "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
+        "id":
+            0
     },
     {
-        "instruction": "Please provide an action plan for reducing carbon footprint on a corporate level",
-        "input": "",
-        "output": "An action plan for reducing carbon footprint on a corporate level could include switching to renewable energy sources (e.g., solar or wind powered), investing in energy efficient infrastructure, implementing green production practices (e.g., reduce water and paper usage, dispose of materials responsibly, recycle/reuse materials), reducing or eliminating travel, encouraging employees to be conscious of and reduce their own energy usage, and incentivizing employees to participate in sustainable work practices.",
-        "id": 1
+        "instruction":
+            "Please provide an action plan for reducing carbon footprint on a corporate level",
+        "input":
+            "",
+        "output":
+            "An action plan for reducing carbon footprint on a corporate level could include switching to renewable energy sources (e.g., solar or wind powered), investing in energy efficient infrastructure, implementing green production practices (e.g., reduce water and paper usage, dispose of materials responsibly, recycle/reuse materials), reducing or eliminating travel, encouraging employees to be conscious of and reduce their own energy usage, and incentivizing employees to participate in sustainable work practices.",
+        "id":
+            1
     },
     {
-        "instruction": "Write a persuasive email to your boss explaining why you should have a pay raise",
-        "input": "",
-        "output": "Dear [Boss], \n\nI hope this message finds you well. I am writing to request a pay raise.\n\nAs you know, I have been a dedicated and hardworking member of the team since I started working here [insert number] of months/years ago. My enthusiasm and passion for my job has remained consistent over the years, and I have always given 100% to my role. \n\nI understand that the current financial situation is challenging, however, I would sincerely appreciate you taking the time to consider my request. I believe that my dedication to the job and the value that I bring to the organization warrants a raise. I work diligently and am confident that I can continue to be an asset to the company. \n\nI hope my request is taken into account and I thank you in advance for your understanding. I look forward to our conversation. \n\nSincerely,\n[Your Name]",
-        "id": 2
+        "instruction":
+            "Write a persuasive email to your boss explaining why you should have a pay raise",
+        "input":
+            "",
+        "output":
+            "Dear [Boss], \n\nI hope this message finds you well. I am writing to request a pay raise.\n\nAs you know, I have been a dedicated and hardworking member of the team since I started working here [insert number] of months/years ago. My enthusiasm and passion for my job has remained consistent over the years, and I have always given 100% to my role. \n\nI understand that the current financial situation is challenging, however, I would sincerely appreciate you taking the time to consider my request. I believe that my dedication to the job and the value that I bring to the organization warrants a raise. I work diligently and am confident that I can continue to be an asset to the company. \n\nI hope my request is taken into account and I thank you in advance for your understanding. I look forward to our conversation. \n\nSincerely,\n[Your Name]",
+        "id":
+            2
     },
 ]
 
 PROMPT_DATASET = [
     {
-        "instruction": "Edit this paragraph to make it more concise: \"Yesterday, I went to the store and bought some things. Then, I came home and put them away. After that, I went for a walk and met some friends.\"",
-        "id": 0
+        "instruction":
+            "Edit this paragraph to make it more concise: \"Yesterday, I went to the store and bought some things. Then, I came home and put them away. After that, I went for a walk and met some friends.\"",
+        "id":
+            0
     },
     {
         "instruction": "Write a descriptive paragraph about a memorable vacation you went on",
@@ -73,9 +87,7 @@ def make_tokenizer(model: str):
     return tokenizer
 
 
-def check_content(input_ids_stripped: torch.Tensor,
-                  tokenizer: PreTrainedTokenizer,
-                  model: str):
+def check_content(input_ids_stripped: torch.Tensor, tokenizer: PreTrainedTokenizer, model: str):
     if model == "opt":
         # NOTE:  Contrary to GPT2, OPT adds the EOS token </s> to the beginning of every prompt.
         assert input_ids_stripped[0] == tokenizer.eos_token_id
@@ -98,13 +110,10 @@ def check_content(input_ids_stripped: torch.Tensor,
         assert input_ids_stripped != tokenizer.mask_token_id
 
 
-@pytest.mark.cpu
 @pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama"])
 @pytest.mark.parametrize("max_length", [32, 1024])
 @pytest.mark.parametrize("max_datasets_size", [2])
-def test_prompt_dataset(model: str,
-                        max_datasets_size: int,
-                        max_length: int):
+def test_prompt_dataset(model: str, max_datasets_size: int, max_length: int):
     with tempfile.TemporaryDirectory() as tmp_dir:
         dataset_name = "prompt_dataset.json"
         with open(os.path.join(tmp_dir, dataset_name), "w") as f:
@@ -127,19 +136,12 @@ def test_prompt_dataset(model: str,
             check_content(input_ids.masked_select(attention_mask), tokenizer, model)
 
 
-@pytest.mark.cpu
 @pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama"])
-@pytest.mark.parametrize(["dataset_path", "subset"], [
-    ("Anthropic/hh-rlhf", "harmless-base"),
-    ("Dahoas/rm-static", None)
-])
+@pytest.mark.parametrize(["dataset_path", "subset"], [("Anthropic/hh-rlhf", "harmless-base"),
+                                                      ("Dahoas/rm-static", None)])
 @pytest.mark.parametrize("max_datasets_size", [32])
 @pytest.mark.parametrize("max_length", [32, 1024])
-def test_reward_dataset(model: str,
-                        dataset_path: str,
-                        subset: Optional[str],
-                        max_datasets_size: int,
-                        max_length: int):
+def test_reward_dataset(model: str, dataset_path: str, subset: Optional[str], max_datasets_size: int, max_length: int):
     data = load_dataset(dataset_path, data_dir=subset)
     assert max_datasets_size <= len(data["train"]) \
         and max_datasets_size <= len(data["test"])
@@ -196,15 +198,12 @@ def test_reward_dataset(model: str,
             assert torch.all(r_mask)
 
 
-@pytest.mark.cpu
+
 @pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama", "chatglm"])
 @pytest.mark.parametrize("dataset_path", ["yizhongw/self_instruct", None])
 @pytest.mark.parametrize("max_dataset_size", [2])
 @pytest.mark.parametrize("max_length", [32, 1024])
-def test_sft_dataset(model: str,
-                     dataset_path: Optional[str],
-                     max_dataset_size: int,
-                     max_length: int):
+def test_sft_dataset(model: str, dataset_path: Optional[str], max_dataset_size: int, max_length: int):
     tokenizer = make_tokenizer(model)
     if dataset_path == "yizhongw/self_instruct":
         data = load_dataset(dataset_path, "super_natural_instructions")
@@ -253,10 +252,7 @@ def test_sft_dataset(model: str,
 
 
 if __name__ == "__main__":
-    test_sft_dataset(model="bloom",
-                     dataset_path="yizhongw/self_instruct",
-                     max_dataset_size=2,
-                     max_length=256)
+    test_sft_dataset(model="bloom", dataset_path="yizhongw/self_instruct", max_dataset_size=2, max_length=256)
 
     test_reward_dataset(model="gpt2",
                         dataset_path="Anthropic/hh-rlhf",
@@ -266,4 +262,5 @@ def test_sft_dataset(model: str,
 
     test_prompt_dataset(model="opt",
                         max_datasets_size=2,
-                        max_length=128)
+                        max_length=128)
+