Skip to content
Merged

Raf #119

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/compatiblity_test_on_dispatch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ jobs:
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
- name: Download cub for CUDA 10.2
run: |
CUDA_VERSION=$(cat $CUDA_HOME/version.txt | grep "CUDA Version" | awk '{print $NF}' | cut -d. -f1,2)
CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}')

# check if it is CUDA 10.2
# download cub
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/compatiblity_test_on_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ jobs:
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
- name: Download cub for CUDA 10.2
run: |
CUDA_VERSION=$(cat $CUDA_HOME/version.txt | grep "CUDA Version" | awk '{print $NF}' | cut -d. -f1,2)
CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}')

# check if it is CUDA 10.2
# download cub
Expand Down
12 changes: 12 additions & 0 deletions .github/workflows/compatiblity_test_on_schedule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,18 @@ jobs:
with:
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}

- name: Download cub for CUDA 10.2
run: |
CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}')

# check if it is CUDA 10.2
# download cub
if [ "$CUDA_VERSION" = "10.2" ]; then
wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
unzip 1.8.0.zip
cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
fi

- name: Install Colossal-AI
run: |
pip install -v --no-cache-dir .
Expand Down
12 changes: 12 additions & 0 deletions .github/workflows/cuda_ext_check_before_merge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,18 @@ jobs:
- name: Install PyTorch
run: eval ${{ matrix.build.torch_command }}

- name: Download cub for CUDA 10.2
run: |
CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}')

# check if it is CUDA 10.2
# download cub
if [ "$CUDA_VERSION" = "10.2" ]; then
wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
unzip 1.8.0.zip
cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
fi

- name: Build
run: |
CUDA_EXT=1 pip install -v .
4 changes: 3 additions & 1 deletion .github/workflows/run_chatgpt_examples.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ jobs:
run: |
cd applications/Chat
rm -rf ~/.cache/colossalai
./examples/test_ci.sh
./tests/test_inference.sh
./tests/test_benchmarks.sh
./tests/test_train.sh
env:
NCCL_SHM_DISABLE: 1
MAX_JOBS: 8
Expand Down
7 changes: 4 additions & 3 deletions applications/Chat/coati/dataset/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from .prompt_dataset import PromptDataset
from .reward_dataset import HhRlhfDataset, RmStaticDataset
from .sft_dataset import DataCollatorForSupervisedDataset, SFTDataset, SupervisedDataset
from .sft_dataset import SFTDataset, SupervisedDataset
from .utils import is_rank_0

__all__ = [
'RmStaticDataset', 'HhRlhfDataset', 'is_rank_0', 'SFTDataset', 'SupervisedDataset',
'DataCollatorForSupervisedDataset', 'PromptDataset'
'RmStaticDataset', 'HhRlhfDataset',
'SFTDataset', 'SupervisedDataset',
'PromptDataset', 'is_rank_0',
]
87 changes: 87 additions & 0 deletions applications/Chat/coati/dataset/conversation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# Copyright 2023 lm-sys@FastChat
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataclasses
from enum import Enum, auto
from typing import List


class SeparatorStyle(Enum):
    """Enumerates the supported strategies for separating turns in a prompt."""
    ADD_EOS_TOKEN = auto()


@dataclasses.dataclass
class Conversation:
    """State of a multi-turn conversation, renderable as a single prompt string.

    Attributes:
        system: text prepended before any turn.
        roles: the speaker names, e.g. ("Human", "Assistant").
        messages: [role, message] pairs; a falsy message marks a turn the
            model is expected to complete.
        offset: index of the first message shown by ``to_gradio_chatbot``.
        sep_style: strategy used to join turns (only ADD_EOS_TOKEN exists).
        sep: separator token appended after each completed message.
        skip_next: UI bookkeeping flag; not used by any method in this class.
    """
    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.ADD_EOS_TOKEN
    sep: str = "</s>"

    skip_next: bool = False

    def get_prompt(self):
        """Render the conversation into one prompt string.

        Raises:
            ValueError: if ``sep_style`` is not a recognized style.
        """
        if self.sep_style != SeparatorStyle.ADD_EOS_TOKEN:
            raise ValueError(f"Invalid style: {self.sep_style}")
        pieces = [self.system]
        for role, message in self.messages:
            if message:
                pieces.append(role + ": " + message + self.sep)
            else:
                # Empty/None message: emit only the role prefix so the
                # model continues from "Role: ".
                pieces.append(role + ": ")
        return "".join(pieces)

    def append_message(self, role, message):
        """Add a [role, message] turn to the end of the conversation."""
        self.messages.append([role, message])

    def to_gradio_chatbot(self):
        """Convert messages past ``offset`` into gradio-style [user, bot] pairs."""
        pairs = []
        for idx, (_, msg) in enumerate(self.messages[self.offset:]):
            if idx % 2:
                # Odd index: assistant reply completes the most recent pair.
                pairs[-1][-1] = msg
            else:
                # Even index: human message opens a new pair.
                pairs.append([msg, None])
        return pairs

    def copy(self):
        """Return a copy whose message pairs are rebuilt (not shared)."""
        cloned = [[role, msg] for role, msg in self.messages]
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=cloned,
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
        )

    def dict(self):
        """Return a plain-dict snapshot (``sep_style`` is not included)."""
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep,
        }


# Stock conversation template shared by callers that do not build their own.
conv = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
    roles=("Human", "Assistant"),
    # Must be a list, not a tuple: Conversation.append_message calls
    # self.messages.append(...), which raises AttributeError on a tuple.
    messages=[],
    offset=0,
    sep_style=SeparatorStyle.ADD_EOS_TOKEN,
    sep="</s>",
)

default_conversation = conv
18 changes: 6 additions & 12 deletions applications/Chat/coati/dataset/prompt_dataset.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,13 @@
import copy
import random
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Callable, Dict, Sequence
from typing import Dict

import torch
import torch.distributed as dist
import transformers
from torch.utils.data import Dataset
from tqdm import tqdm

from colossalai.logging import get_dist_logger

from .utils import is_rank_0, jload

logger = get_dist_logger()
from .utils import jload


class PromptDataset(Dataset):
Expand All @@ -27,12 +20,13 @@ def __init__(self,
max_length: int = 96):
super(PromptDataset, self).__init__()
self.keyed_prompt = defaultdict(list)
logger.info("Loading data...")
self.logger = get_dist_logger()
self.logger.info("Loading data...")
list_data_dict = jload(data_path)
logger.info(f"Loaded {len(list_data_dict)} examples.")
self.logger.info(f"Loaded {len(list_data_dict)} examples.")

if max_datasets_size is not None:
logger.info(f"Limiting dataset to {max_datasets_size} examples.")
self.logger.info(f"Limiting dataset to {max_datasets_size} examples.")
list_data_dict = list_data_dict[:max_datasets_size]

instructions = [data_dict["instruction"] for data_dict in list_data_dict]
Expand Down
130 changes: 66 additions & 64 deletions applications/Chat/coati/dataset/reward_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,44 +20,44 @@ class RmStaticDataset(Dataset):

def __init__(self, dataset, tokenizer: Callable, max_length: int, special_token=None) -> None:
super().__init__()
self.chosen = []
self.reject = []
if special_token is None:
self.end_token = tokenizer.eos_token
else:
self.end_token = special_token
for data in tqdm(dataset, disable=not is_rank_0()):
prompt = data['prompt']

chosen = prompt + data['chosen'] + self.end_token
chosen_token = tokenizer(chosen,
max_length=max_length,
padding="max_length",
truncation=True,
return_tensors="pt")
self.chosen.append({
"input_ids": chosen_token['input_ids'],
"attention_mask": chosen_token['attention_mask']
})

reject = prompt + data['rejected'] + self.end_token
reject_token = tokenizer(reject,
max_length=max_length,
padding="max_length",
truncation=True,
return_tensors="pt")
self.reject.append({
"input_ids": reject_token['input_ids'],
"attention_mask": reject_token['attention_mask']
})
self.end_token = tokenizer.eos_token \
if special_token is None else special_token

chosen = [
data["prompt"] + data["chosen"] + self.end_token
for data in tqdm(dataset, disable=not is_rank_0())
]
chosen_token = tokenizer(chosen,
max_length=max_length,
padding="max_length",
truncation=True,
return_tensors="pt")
self.chosen = {
"input_ids": chosen_token["input_ids"],
"attention_mask": chosen_token["attention_mask"]
}

reject = [
data["prompt"] + data["rejected"] + self.end_token
for data in tqdm(dataset, disable=not is_rank_0())
]
reject_token = tokenizer(reject,
max_length=max_length,
padding="max_length",
truncation=True,
return_tensors="pt")
self.reject = {
"input_ids": reject_token["input_ids"],
"attention_mask": reject_token["attention_mask"]
}

def __len__(self):
length = len(self.chosen)
length = self.chosen["input_ids"].shape[0]
return length

def __getitem__(self, idx):
return self.chosen[idx]["input_ids"], self.chosen[idx]["attention_mask"], self.reject[idx][
"input_ids"], self.reject[idx]["attention_mask"]
return self.chosen["input_ids"][idx], self.chosen["attention_mask"][idx], \
self.reject["input_ids"][idx], self.reject["attention_mask"][idx]


# Anthropic/hh-rlhf
Expand All @@ -74,39 +74,41 @@ class HhRlhfDataset(Dataset):

def __init__(self, dataset, tokenizer: Callable, max_length: int, special_token=None) -> None:
super().__init__()
self.chosen = []
self.reject = []
if special_token is None:
self.end_token = tokenizer.eos_token
else:
self.end_token = special_token
for data in tqdm(dataset, disable=not is_rank_0()):
chosen = data['chosen'] + self.end_token
chosen_token = tokenizer(chosen,
max_length=max_length,
padding="max_length",
truncation=True,
return_tensors="pt")
self.chosen.append({
"input_ids": chosen_token['input_ids'],
"attention_mask": chosen_token['attention_mask']
})

reject = data['rejected'] + self.end_token
reject_token = tokenizer(reject,
max_length=max_length,
padding="max_length",
truncation=True,
return_tensors="pt")
self.reject.append({
"input_ids": reject_token['input_ids'],
"attention_mask": reject_token['attention_mask']
})
self.end_token = tokenizer.eos_token \
if special_token is None else special_token

chosen = [
data["chosen"] + self.end_token
for data in tqdm(dataset, disable=not is_rank_0())
]
chosen_token = tokenizer(chosen,
max_length=max_length,
padding="max_length",
truncation=True,
return_tensors="pt")
self.chosen = {
"input_ids": chosen_token["input_ids"],
"attention_mask": chosen_token["attention_mask"]
}

reject = [
data["rejected"] + self.end_token
for data in tqdm(dataset, disable=not is_rank_0())
]
reject_token = tokenizer(reject,
max_length=max_length,
padding="max_length",
truncation=True,
return_tensors="pt")
self.reject = {
"input_ids": reject_token["input_ids"],
"attention_mask": reject_token["attention_mask"]
}

def __len__(self):
length = len(self.chosen)
length = self.chosen["input_ids"].shape[0]
return length

def __getitem__(self, idx):
return self.chosen[idx]["input_ids"], self.chosen[idx]["attention_mask"], self.reject[idx][
"input_ids"], self.reject[idx]["attention_mask"]
return self.chosen["input_ids"][idx], self.chosen["attention_mask"][idx], \
self.reject["input_ids"][idx], self.reject["attention_mask"][idx]
Loading