Ra #193
Merged

51 commits
750ec01
Merge pull request #169 from hpcaitech/main
jamesthesnake Sep 17, 2023
1db6727
[Pipeline inference] Combine kvcache with pipeline inference (#4938)
FoolPlayer Oct 27, 2023
4e4a10c
updated c++17 compiler flags (#4983)
kurisusnowdeng Oct 27, 2023
cf579ff
[Inference] Dynamic Batching Inference, online and offline (#4953)
CjhHa1 Oct 30, 2023
459a88c
[Kernels]Updated Triton kernels into 2.1.0 and adding flash-decoding …
tiandiao123 Oct 30, 2023
abe071b
fix ColossalEval (#4992)
chengeharrison Oct 31, 2023
4f0234f
[doc]Update doc for colossal-inference (#4989)
tiandiao123 Oct 31, 2023
be82b5d
[hotfix] Fix the bug where process groups were not being properly rel…
littsk Oct 31, 2023
c040d70
[hotfix] fix the bug of repeatedly storing param group (#4951)
Oct 31, 2023
335cb10
[doc] add supported feature diagram for hybrid parallel plugin (#4996)
ppt0011 Oct 31, 2023
b6696be
[Pipeline Inference] Merge pp with tp (#4993)
FoolPlayer Nov 1, 2023
8993c8a
[release] update version (#4995)
ver217 Nov 1, 2023
dc003c3
[moe] merge moe into main (#4978)
oahzxl Nov 2, 2023
d99b2c9
[hotfix] fix grad accumulation plus clipping for gemini (#5002)
Nov 2, 2023
1a3315e
[hotfix] Add layer norm gradients all-reduce for sequence parallel (#…
littsk Nov 3, 2023
c36e782
[format] applied code formatting on changed files in pull request 492…
github-actions[bot] Nov 6, 2023
ef4c14a
[Inference] Fix bug in ChatGLM2 Tensor Parallelism (#5014)
CjhHa1 Nov 7, 2023
67f5331
[misc] add code owners (#5024)
ver217 Nov 8, 2023
f71e63b
[moe] support optimizer checkpoint (#5015)
oahzxl Nov 8, 2023
239cd92
Support mtbench (#5025)
chengeharrison Nov 9, 2023
7244412
[moe]: fix ep/tp tests, add hierarchical all2all (#4982)
cwher Nov 9, 2023
a448938
[shardformer] Fix serialization error with Tensor Parallel state savi…
imgaojun Nov 9, 2023
576a2f7
[gemini] gemini support tensor parallelism. (#4942)
flybird11111 Nov 10, 2023
70885d7
[hotfix] Suport extra_kwargs in ShardConfig (#5031)
KKZ20 Nov 10, 2023
43ad0d9
fix wrong EOS token in ColossalChat
Orion-Zheng Nov 14, 2023
28052a7
[Kernels]Update triton kernels into 2.1.0 (#5046)
tiandiao123 Nov 16, 2023
b2ad0d9
[pipeline,shardformer] Fix p2p efficiency in pipeline, allow skipping…
zeyugao Nov 16, 2023
3e02154
[gemini] gemini support extra-dp (#5043)
flybird11111 Nov 16, 2023
97cd0cd
[shardformer] fix llama error when transformers upgraded. (#5055)
flybird11111 Nov 16, 2023
3c08f17
[hotfix]: modify create_ep_hierarchical_group and add test (#5032)
cwher Nov 17, 2023
bc09b95
[exampe] fix llama example' loss error when using gemini plugin (#5060)
flybird11111 Nov 18, 2023
fd6482a
[inference] Refactor inference architecture (#5057)
Xu-Kai Nov 19, 2023
bce9197
[Kernels]added flash-decoidng of triton (#5063)
tiandiao123 Nov 20, 2023
8d56c9c
[misc] remove outdated submodule (#5070)
ver217 Nov 20, 2023
e5ce4c8
[npu] add npu support for gemini and zero (#5067)
ver217 Nov 20, 2023
0c7d8be
[hotfix/hybridengine] fix bug when tp*pp size = 1 (#5069)
FoolPlayer Nov 20, 2023
fb103cf
[inference] update examples and engine (#5073)
Xu-Kai Nov 20, 2023
8921a73
[format] applied code formatting on changed files in pull request 506…
github-actions[bot] Nov 20, 2023
4e3959d
[hotfix/hybridengine] Fix init model with random parameters in benchm…
FoolPlayer Nov 20, 2023
1cd7efc
[inference] refactor examples and fix schedule (#5077)
ver217 Nov 21, 2023
dce05da
fix thrust-transform-reduce error (#5078)
imgaojun Nov 21, 2023
fd3567e
[nfc] fix typo in docs/ (#4972)
digger-yu Nov 21, 2023
0d48230
[nfc] fix typo and author name (#5089)
digger-yu Nov 22, 2023
4ccb9de
[gemini]fix gemini optimzer, saving Shardformer in Gemini got list as…
flybird11111 Nov 22, 2023
75af66c
[Hotfix] Fix model policy matching strategy in ShardFormer (#5064)
KKZ20 Nov 22, 2023
aae4966
[shardformer]fix flash attention, when mask is casual, just don't unp…
flybird11111 Nov 22, 2023
3acbf6d
[npu] add npu support for hybrid plugin and llama (#5090)
oahzxl Nov 22, 2023
e53e729
[Feature] Add document retrieval QA (#5020)
YeAnbang Nov 23, 2023
68fcaa2
remove duplicate import (#5100)
oahzxl Nov 23, 2023
2bdf76f
fix typo change lazy_iniy to lazy_init (#5099)
digger-yu Nov 24, 2023
35722c5
Merge pull request #192 from hpcaitech/main
jamesthesnake Nov 26, 2023
Files changed
1 change: 1 addition & 0 deletions .github/CODEOWNERS
@@ -0,0 +1 @@
* @hpcaitech/colossalai-qa
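This one-line rule makes the @hpcaitech/colossalai-qa team the default owner of every path in the repository, so GitHub automatically requests a review from that team on each pull request.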
4 changes: 3 additions & 1 deletion .github/workflows/release_test_pypi_before_merge.yml
@@ -27,7 +27,9 @@ jobs:
echo $new_version > ./version.txt
echo "version=$new_version" >> $GITHUB_OUTPUT

- run: python setup.py sdist build
- run: |
pip install --upgrade pip
python setup.py sdist build

# publish to PyPI if executed on the main branch
- name: Publish package to PyPI
54 changes: 54 additions & 0 deletions .github/workflows/run_colossalqa_unit_tests.yml
@@ -0,0 +1,54 @@
name: Run colossalqa unit tests

on:
pull_request:
types: [synchronize, opened, reopened]
paths:
- 'applications/ColossalQA/colossalqa/**'
- 'applications/ColossalQA/requirements.txt'
- 'applications/ColossalQA/setup.py'
- 'applications/ColossalQA/tests/**'
- 'applications/ColossalQA/pytest.ini'

jobs:
tests:
name: Run colossalqa unit tests
if: |
github.event.pull_request.draft == false &&
github.base_ref == 'main' &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
volumes:
- /data/scratch/test_data_colossalqa:/data/scratch/test_data_colossalqa
- /data/scratch/llama-tiny:/data/scratch/llama-tiny
options: --gpus all --rm
timeout-minutes: 30
defaults:
run:
shell: bash
steps:
- name: Checkout ColossalAI
uses: actions/checkout@v2

- name: Install colossalqa
run: |
cd applications/ColossalQA
pip install -e .

- name: Execute Unit Testing
run: |
cd applications/ColossalQA
pytest tests/
env:
NCCL_SHM_DISABLE: 1
MAX_JOBS: 8
ZH_MODEL_PATH: bigscience/bloom-560m
ZH_MODEL_NAME: bloom
EN_MODEL_PATH: bigscience/bloom-560m
EN_MODEL_NAME: bloom
TEST_DATA_PATH_EN: /data/scratch/test_data_colossalqa/companies.txt
TEST_DATA_PATH_ZH: /data/scratch/test_data_colossalqa/companies_zh.txt
TEST_DOCUMENT_LOADER_DATA_PATH: /data/scratch/test_data_colossalqa/tests/*
SQL_FILE_PATH: /data/scratch/test_data_colossalqa/sql_file_path
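For orientation, here is a minimal sketch of how a ColossalQA test could consume the fixtures and model names exported above; the environment variable names come from the workflow, but the test body itself is an illustrative assumption, not code from this PR:

import os

# Paths and model ids injected by the workflow's env block.
EN_DATA = os.environ["TEST_DATA_PATH_EN"]   # companies.txt fixture
EN_MODEL = os.environ["EN_MODEL_PATH"]      # bigscience/bloom-560m

def test_english_fixture_is_nonempty():
    # The workflow mounts /data/scratch/test_data_colossalqa into the
    # container, so the fixture file is readable at this path.
    with open(EN_DATA, encoding="utf-8") as f:
        lines = [line for line in f if line.strip()]
    assert lines, "expected at least one document in the English fixture"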
4 changes: 0 additions & 4 deletions .gitmodules
@@ -1,7 +1,3 @@
[submodule "inference"]
path = inference
url = https://github.com/hpcaitech/EnergonAI.git
branch = main
[submodule "examples/tutorial/fastfold/FastFold"]
path = examples/tutorial/fastfold/FastFold
url = https://github.com/hpcaitech/FastFold
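The removed entry pointed the inference path at the external EnergonAI repository; with commit 8d56c9c ("[misc] remove outdated submodule (#5070)") and inference now refactored in-tree (#5057), the submodule is no longer needed.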
25 changes: 25 additions & 0 deletions LICENSE
@@ -527,3 +527,28 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved.
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


---------------- LICENSE FOR LangChain TEAM ----------------

The MIT License

Copyright (c) Harrison Chase

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
(file name not captured in this view)
@@ -118,7 +118,7 @@ def main(args):
tokenizer.pad_token = tokenizer.eos_token
elif args.model == "llama":
tokenizer = LlamaTokenizer.from_pretrained(args.pretrain)
tokenizer.eos_token = "<\s>"
tokenizer.eos_token = "</s>"
tokenizer.pad_token = tokenizer.unk_token
else:
raise ValueError(f'Unsupported model "{args.model}"')
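This one-line change, repeated across the Chat example files below, fixes a subtle string bug: "\s" is not a valid Python escape sequence, so the old literal kept the backslash and never matched Llama's real end-of-sequence token. A standalone illustration (not part of the PR's diff):

# "\s" is not a recognized escape, so Python keeps the backslash
# (recent interpreters warn about it), leaving the four characters <\s>.
old = "<\s>"   # the buggy value: '<', '\\', 's', '>'
new = "</s>"   # Llama's actual end-of-sequence token
print(list(old))   # ['<', '\\', 's', '>']
print(old == new)  # False -- generation would never stop on the true EOS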
(file name not captured in this view)
@@ -68,7 +68,7 @@ def train(args):
padding_side="right",
use_fast=False,
)
tokenizer.eos_token = "<\s>"
tokenizer.eos_token = "</s>"
tokenizer.pad_token = tokenizer.unk_token
else:
raise ValueError(f'Unsupported model "{args.model}"')
2 changes: 1 addition & 1 deletion applications/Chat/examples/inference.py
@@ -39,7 +39,7 @@ def eval(args):
tokenizer.pad_token = tokenizer.eos_token
elif args.model == "llama":
tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
tokenizer.eos_token = "<\s>"
tokenizer.eos_token = "</s>"
tokenizer.pad_token = tokenizer.unk_token
else:
raise ValueError(f'Unsupported model "{args.model}"')
2 changes: 1 addition & 1 deletion applications/Chat/examples/train_prompts.py
@@ -125,7 +125,7 @@ def main(args):
tokenizer = LlamaTokenizer.from_pretrained(
"hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer
)
tokenizer.eos_token = "<\s>"
tokenizer.eos_token = "</s>"
tokenizer.pad_token = tokenizer.unk_token
else:
raise ValueError(f'Unsupported model "{args.model}"')
2 changes: 1 addition & 1 deletion applications/Chat/examples/train_reward_model.py
@@ -72,7 +72,7 @@ def train(args):
tokenizer = LlamaTokenizer.from_pretrained(
"hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer
)
tokenizer.eos_token = "<\s>"
tokenizer.eos_token = "</s>"
tokenizer.pad_token = tokenizer.unk_token
else:
raise ValueError(f'Unsupported model "{args.model}"')
2 changes: 1 addition & 1 deletion applications/Chat/examples/train_sft.py
@@ -75,7 +75,7 @@ def train(args):
tokenizer = LlamaTokenizer.from_pretrained(
"hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer
)
tokenizer.eos_token = "<\s>"
tokenizer.eos_token = "</s>"
tokenizer.pad_token = tokenizer.unk_token
elif args.model == "chatglm":
tokenizer = ChatGLMTokenizer.from_pretrained(
2 changes: 2 additions & 0 deletions applications/ColossalEval/colossal_eval/dataset/__init__.py
@@ -6,6 +6,7 @@
from .gaokaobench import GaoKaoBenchDataset
from .longbench import LongBenchDataset
from .mmlu import MMLUDataset
from .mtbench import MTBenchDataset

__all__ = [
"AGIEvalDataset",
@@ -16,4 +17,5 @@
"LongBenchDataset",
"MMLUDataset",
"ColossalDataset",
"MTBenchDataset",
]
72 changes: 72 additions & 0 deletions applications/ColossalEval/colossal_eval/dataset/mtbench.py
@@ -0,0 +1,72 @@
import copy
import json
import os
from collections import defaultdict
from typing import Dict, List

from colossal_eval.utils import get_json_list

from colossalai.logging import DistributedLogger

from .base import BaseDataset

default_inference_kwargs = {
"calculate_loss": False,
"all_classes": None,
"language": "English",
"pretrain": False,
"max_new_tokens": 1024,
"turns": 2,
}


class MTBenchDataset(BaseDataset):
"""
Dataset class for mt_bench dataset.
Data source: https://github.com/lm-sys/FastChat/blob/main/fastchat/llm_judge/data/mt_bench/question.jsonl
This dataset class will convert the original dataset into the inference dataset.
"""

def __init__(self, path, logger, few_shot):
self.multiturn = True
self.dataset = self.load(path, logger, few_shot)

@staticmethod
def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
dataset = {"test": defaultdict(dict)}

file_path = os.path.join(path, "question.jsonl")
ref_path = os.path.join(path, "reference_answer/gpt-4.jsonl")

reference = defaultdict(list)
ref_origin = get_json_list(ref_path)
for ref in ref_origin:
reference[ref["question_id"]] = ref["choices"][0]["turns"]

with open(file_path, "r", encoding="utf-8") as file:
for line in file:
question = json.loads(line)
category = question["category"]
turn_number = len(question["turns"])
data_point = {
"id": question["question_id"],
"dataset": "mtbench",
"split": "test",
"category": category,
"instruction": question["turns"],
"input": "",
"output": [],
"target": [""] * turn_number
if question["question_id"] not in reference
else reference[question["question_id"]],
}

if category in dataset["test"]:
dataset["test"][category]["data"].append(data_point)
else:
dataset["test"][category] = {
"data": [data_point],
"inference_kwargs": copy.deepcopy(default_inference_kwargs),
}

return dataset
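To make the loader concrete: each line of question.jsonl is a JSON object with a question_id, a category, and a list of turns, and reference answers exist only for some question ids, which is why load falls back to empty targets. A sketch of one such line, with field names taken from the code above and values that are merely illustrative:

import json

sample = {
    "question_id": 81,            # keyed against reference_answer/gpt-4.jsonl
    "category": "writing",        # groups questions into per-category splits
    "turns": [
        "Compose an engaging travel blog post about a recent trip to Hawaii.",
        "Rewrite your previous response. Start every sentence with the letter A.",
    ],
}
print(json.dumps(sample))  # one JSON object per line of question.jsonl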
(file name not captured in this view)
@@ -1,12 +1,15 @@
import os
from typing import Dict, List

import colossal_eval.evaluate.dataset_evaluator.metrics as metric_helper
import numpy as np
import tqdm
from colossal_eval.utils import jdump

LabelBasedMetrics = ["first_token_accuracy", "matthews_correlation"]
LossBasedMetrics = ["perplexity", "ppl_score", "ppl_score_over_choices", "per_byte_perplexity", "per_byte_ppl_score"]
CombinedMetrics = ["combined_single_choice_accuracy"]
GPTMetrics = ["mtbench_single_judge"]
OtherMetrics = [
"f1_score",
"f1_zh_score",
@@ -29,8 +32,9 @@ class DatasetEvaluator(object):

"""

def __init__(self):
pass
def __init__(self, config_path: str, save_path: str):
self.config_path = config_path
self.save_path = save_path

def _calculate_label_metrics(self, metric: str, category: str):
"""Calculate label-based metrics."""
@@ -60,6 +64,11 @@ def _calculate_label_metrics(self, metric: str, category: str):
sample["output"], ref, all_classes=self.data[category]["inference_kwargs"]["all_classes"]
),
)

score = max(
score,
metric_helper.accuracy_by_options(sample["input"], sample["output"], ref),
)
softmaxs.append(references[i] if score == 1 else -1)
else:
softmaxs.append(np.argmax(np.array(list(sample["softmax_over_choices"].values()))))
@@ -151,6 +160,24 @@ def _calculate_other_metrics(self, metric: str, category: str):
self.evaluation_results[metric][category] = (total_score, len(self.data[category]["data"]))
self.evaluation_results[metric]["ALL"] += total_score * weight

def _calculate_gpt_metrics(self, metric: str, category: str):
"""Calculate gpt metrics."""
weight = len(self.data[category]["data"]) / self.metric_total_length[metric]

metric_method = eval("gpt_helper." + metric)

judgements, avg_ratings = metric_method(self.data[category]["data"], self.config_path)
self.judgements[category] = judgements

self.evaluation_results[metric][category] = (np.mean(avg_ratings), len(self.data[category]["data"]))
self.evaluation_results[metric]["ALL"] += np.mean(avg_ratings) * weight

for i in range(avg_ratings.shape[0]):
if f"{metric}_{i+1}" not in self.evaluation_results:
self.evaluation_results[f"{metric}_{i+1}"] = {cat: 0 for cat in (["ALL"] + self.categories)}
self.evaluation_results[f"{metric}_{i+1}"][category] = (avg_ratings[i], len(self.data[category]["data"]))
self.evaluation_results[f"{metric}_{i+1}"]["ALL"] += avg_ratings[i] * weight

def _calculate_loss_metrics(self, metric: str, category: str):
"""Calculate perplexity."""
if metric == "perplexity":
@@ -212,10 +239,20 @@ def _evaluate(self):
for category in self.suggested_categories[metric]:
self._calculate_combined_metrics(metric, category)
pbar.update(1)
elif metric in GPTMetrics:
for category in self.suggested_categories[metric]:
self._calculate_gpt_metrics(metric, category)
pbar.update(1)
elif metric in OtherMetrics:
for category in self.suggested_categories[metric]:
self._calculate_other_metrics(metric, category)
pbar.update(1)
else:
raise Exception(f"{metric} not supported.")

if self.judgements:
judgement_path = os.path.join(self.save_path, f"{self.model_name}_judgements.json")
jdump(self.judgements, judgement_path)

return self.evaluation_results

@@ -235,6 +272,7 @@ def get_evaluation_results(self, data: List[Dict], dataset_name: str, model_name
self.model_name = model_name
self.categories = list(data.keys())
self.metrics = metrics
self.judgements = {}

self.evaluation_results = {
metric: {category: 0 for category in (["ALL"] + self.categories)} for metric in self.metrics
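The new _calculate_gpt_metrics aggregates per-category judge ratings into the "ALL" entry by sample-count weighting, mirroring the other metric helpers. A minimal standalone sketch of that aggregation, with illustrative numbers rather than real judge output:

import numpy as np

category_sizes = {"writing": 10, "math": 10}        # samples per category
per_turn_means = {"writing": np.array([9.0, 8.0]),  # mean rating per turn
                  "math": np.array([6.0, 5.0])}
total = sum(category_sizes.values())

overall = 0.0
for cat, size in category_sizes.items():
    overall += np.mean(per_turn_means[cat]) * (size / total)
print(overall)  # 7.0 -- the value accumulated under evaluation_results[metric]["ALL"]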