From 2abf1910c063117e6728e90e3a723314a748730a Mon Sep 17 00:00:00 2001
From: Yuanchen Xu
Date: Tue, 23 May 2023 15:11:04 +0800
Subject: [PATCH] support gpt evaluation

---
 applications/Chat/evaluate/eval.py            |  89 ++++--
 applications/Chat/evaluate/eval.sh            |   9 +
 applications/Chat/evaluate/evaluate.py        | 256 ------------------
 applications/Chat/evaluate/evaluate.sh        |   9 -
 applications/Chat/evaluate/evaluator.py       | 114 ++++++--
 .../Chat/evaluate/generate_answers.py         | 173 ------------
 .../Chat/evaluate/generate_answers.sh         |  25 --
 .../Chat/evaluate/generate_gpt35_answers.py   |  98 -------
 .../Chat/evaluate/generate_gpt35_answers.sh   |   6 -
 applications/Chat/evaluate/gpt_evaluate.py    | 112 +++-----
 applications/Chat/evaluate/merge.py           |  25 --
 applications/Chat/evaluate/requirements.txt   |  10 +
 applications/Chat/evaluate/utils.py           |  17 +-
 applications/Chat/requirements.txt            |   5 -
 14 files changed, 228 insertions(+), 720 deletions(-)
 create mode 100755 applications/Chat/evaluate/eval.sh
 delete mode 100644 applications/Chat/evaluate/evaluate.py
 delete mode 100755 applications/Chat/evaluate/evaluate.sh
 delete mode 100644 applications/Chat/evaluate/generate_answers.py
 delete mode 100755 applications/Chat/evaluate/generate_answers.sh
 delete mode 100644 applications/Chat/evaluate/generate_gpt35_answers.py
 delete mode 100755 applications/Chat/evaluate/generate_gpt35_answers.sh
 delete mode 100644 applications/Chat/evaluate/merge.py
 create mode 100644 applications/Chat/evaluate/requirements.txt

diff --git a/applications/Chat/evaluate/eval.py b/applications/Chat/evaluate/eval.py
index f34437751827..69f2c272a116 100644
--- a/applications/Chat/evaluate/eval.py
+++ b/applications/Chat/evaluate/eval.py
@@ -1,11 +1,17 @@
 import argparse
 import json
+import os
 
+import openai
 from evaluator import Evaluator
 from utils import jload
 
+
 def main(args):
-    # load config
+    assert len(args.answer_file_list) == len(
+        args.model_name_list), "The number of answer files and model names should be equal!"
+
+    # load config
     config = jload(args.config_file)
 
     if config["language"] == "cn":
@@ -16,28 +22,77 @@
             for metric_type, metrics in config["category"][category].items():
                 metrics_all[metric_type] = metrics
             metrics_per_category[category] = metrics_all
-
+
+        battle_prompt = None
+        if args.battle_prompt_file:
+            battle_prompt = jload(args.battle_prompt_file)
+
+        gpt_evaluation_prompt = None
+        if args.gpt_evaluation_prompt_file:
+            gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file)
+
+        if len(args.model_name_list) == 2 and not battle_prompt:
+            raise Exception("No prompt file for battle provided. Please specify the prompt file for battle!")
+
+        if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:
+            raise Exception(
+                "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
+
         # initialize evaluator
-        evaluator = Evaluator(metrics_per_category)
-        if args.answers2_file:
-            answers1 = jload(args.answers1_file)
-            answers2 = jload(args.answers2_file)
+        evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt)
+        if len(args.model_name_list) == 2:
+            answers1 = jload(args.answer_file_list[0])
+            answers2 = jload(args.answer_file_list[1])
+
+            assert len(answers1) == len(answers2), "The number of answers for two models should be equal!"
+
             evaluator.battle(answers1=answers1, answers2=answers2)
-        else:
+            evaluator.save(args.save_path, args.model_name_list)
+        elif len(args.model_name_list) == 1:
             targets = jload(args.target_file)
-            answers = jload(args.answers1_file)
+            answers = jload(args.answer_file_list[0])
+
+            assert len(targets) == len(answers), "The number of target answers and model answers should be equal!"
+
             evaluator.evaluate(answers=answers, targets=targets)
-        evaluator.save(args.save_path)
+            evaluator.save(args.save_path, args.model_name_list)
+        else:
+            raise ValueError("Unsupported number of answer files and model names!")
     else:
-        raise ValueError(f'Unsupported language')
+        raise ValueError(f'Unsupported language {config["language"]}!')
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--config_file', type=str, default=None, help='path to the file of target results')
-    parser.add_argument('--target_file', type=str, default=None, help='path to the file of target results')
-    parser.add_argument('--answers1_file', type=str, default=None, help='path to the file of one model prediction')
-    parser.add_argument('--answers2_file', type=str, default=None, help='path to the file of the other model prediction')
-    parser.add_argument('--save_path', type=str, default="results.csv", help='path to the csv file to save evaluation results')
+    parser = argparse.ArgumentParser(description='ColossalAI LLM evaluation pipeline.')
+    parser.add_argument('--config_file',
+                        type=str,
+                        default=None,
+                        required=True,
+                        help='path to the config file')
+    parser.add_argument('--battle_prompt_file', type=str, default=None, help='path to the prompt file for battle')
+    parser.add_argument('--gpt_evaluation_prompt_file',
+                        type=str,
+                        default=None,
+                        help='path to the prompt file for gpt evaluation')
+    parser.add_argument('--target_file', type=str, default=None, help='path to the target answer (ground truth) file')
+    parser.add_argument('--answer_file_list',
+                        type=str,
+                        nargs='+',
+                        default=[],
+                        required=True,
+                        help='path to the answer files of at most 2 models')
+    parser.add_argument('--model_name_list',
+                        type=str,
+                        nargs='+',
+                        default=[],
+                        required=True,
+                        help='the names of at most 2 models')
+    parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
+    parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
     args = parser.parse_args()
-    main(args)
\ No newline at end of file
+
+    if args.openai_key is not None:
+        os.environ["OPENAI_API_KEY"] = args.openai_key
+    openai.api_key = os.getenv("OPENAI_API_KEY")
+
+    main(args)
diff --git a/applications/Chat/evaluate/eval.sh b/applications/Chat/evaluate/eval.sh
new file mode 100755
index 000000000000..f5729e6ee5c7
--- /dev/null
+++ b/applications/Chat/evaluate/eval.sh
@@ -0,0 +1,9 @@
+python eval.py \
+    --config_file "path to the config file" \
+    --battle_prompt_file "path to the prompt file for battle" \
+    --gpt_evaluation_prompt_file "path to the prompt file for gpt evaluation" \
+    --target_file "path to the target answer file" \
+    --answer_file_list "path to the answer files of at most 2 models" \
+    --model_name_list "the names of at most 2 models" \
+    --save_path "path to save results" \
+    --openai_key "your openai key" \
diff --git a/applications/Chat/evaluate/evaluate.py b/applications/Chat/evaluate/evaluate.py
deleted file mode 100644
index 2f9c9ce8e10d..000000000000
--- a/applications/Chat/evaluate/evaluate.py
+++ /dev/null
@@ -1,256 +0,0 @@
-# Adapted
form https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py -# Copyright 2023 LM-SYS@FastChat - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import argparse -import json -import os -import time -import re -import concurrent.futures - -import openai -import tqdm -import shortuuid -import logging - -from utils import jload, jdump, get_json_list - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -MAX_API_RETRY = 3 - - -def get_eval(sys_prompt, user_prompt: str, answer_id: int, max_tokens: int, model: str): - logging.basicConfig(level=logging.INFO) - for _ in range(MAX_API_RETRY): - try: - response = openai.ChatCompletion.create( - model=model, - messages=[{ - 'role': 'system', - 'content': sys_prompt - }, { - 'role': 'user', - 'content': user_prompt, - }], - temperature=0.2, - max_tokens=max_tokens, - ) - review = response['choices'][0]['message']['content'] - return {"review": review, 'id': answer_id} - except Exception as e: - logger.error(e) - time.sleep(1) - logger.error(f' Review {answer_id} failed after {MAX_API_RETRY} retries.') - return 'error' - - -def parse_score(review): - try: - pattern = re.compile('([0-9]|10) out of 10') - sp = re.findall(pattern, review) - if len(re.findall(pattern, review)) == 2: - return [float(sp[0]), float(sp[1])] - - pattern = re.compile('a score of ([0-9]|10)') - sp = re.findall(pattern, review) - if len(re.findall(pattern, review)) == 2: - return [float(sp[0]), float(sp[1])] - - pattern = re.compile('([0-9]|10)/10') - sp = re.findall(pattern, review) - if len(re.findall(pattern, review)) == 2: - return [float(sp[0]), float(sp[1])] - - score_pair = review.split('\n')[0] - score_pair = score_pair.replace(',', ' ') - sp = score_pair.split(' ') - if len(sp) == 2: - return [float(sp[0]), float(sp[1])] - else: - raise Exception('Invalid score pair.') - except Exception as e: - return [-1, -1] - - -def gen_prompt(reviewer_jsons, prompt_jsons, cat, ques, ans1, ans2): - reviewer_idx = 0 - for idx, reviewer in enumerate(reviewer_jsons): - if reviewer['category'] == cat: - reviewer_idx = idx - break - prompt_id = reviewer_jsons[reviewer_idx]['prompt_id'] - prompt_json = prompt_jsons[prompt_id-1] - assert prompt_json['prompt_id'] == prompt_id - - sys_prompt = prompt_json['system_prompt'] - prompt_template = prompt_json['prompt_template'] - defaults = prompt_json['defaults'] - prompt = prompt_template.format( - question=ques, answer_1=ans1, answer_2=ans2, **defaults) - - return sys_prompt, prompt, reviewer_idx+1 - - -def evaluate(args): - answer1_jsons = jload(args.answer_file_list[0]) - answer2_jsons = jload(args.answer_file_list[1]) - reviewer_jsons = get_json_list(args.reviewer_file) - prompt_jsons = get_json_list(args.prompt_file) - - assert len(answer1_jsons) == len(answer2_jsons) - - handles = [] - review_jsons = [] - - total_len = len(answer1_jsons) - question_idx_list = list(range(total_len)) - - logger.info( - f' Total number of answers: {len(answer2_jsons)}.') - - reviews = [] - with 
concurrent.futures.ThreadPoolExecutor(max_workers=args.num_workers) as executor: - futures = [] - for i in question_idx_list: - assert answer1_jsons[i]['id'] == answer2_jsons[i]['id'] - answer_id = answer1_jsons[i]['id'] - - ques = answer1_jsons[i]['instruction'] if answer1_jsons[i]['input'] == "" else answer1_jsons[i]['instruction'] + \ - " " + answer1_jsons[i]['input'] - cat = answer1_jsons[i]['category'] - ans1 = answer1_jsons[i]['output'] - ans2 = answer2_jsons[i]['output'] - - sys_prompt, prompt, reviewer_id = gen_prompt( - reviewer_jsons, prompt_jsons, cat, ques, ans1, ans2) - - review_id = shortuuid.uuid() - review_jsons.append({ - 'review_id': review_id, - 'id': answer_id, - 'reviewer_id': reviewer_id, - 'metadata': {} - }) - - future = executor.submit( - get_eval, sys_prompt, prompt, answer_id, args.max_tokens, args.model) - futures.append(future) - - for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): - reviews.append(future.result()) - - reviews.sort(key=lambda x: x['id']) - review_jsons.sort(key=lambda x: x['id']) - - ans1_score = 0 - ans2_score = 0 - better_count = 0 - worse_count = 0 - tie_count = 0 - invalid_count = 0 - - better_file = [] - worse_file = [] - tie_file = [] - invalid_file = [] - output_review_file = [] - - for idx, review in enumerate(reviews): - scores = parse_score(review['review']) - review_jsons[idx]['review'] = review['review'] - review_jsons[idx]['score'] = scores - - if scores[0] == -1 and scores[1] == -1: - invalid_count += 1 - invalid_file.append(review_jsons[idx]) - logger.info(f' Invalid score pair: {review_jsons[idx]["id"]}.') - else: - if scores[0] > scores[1]: - worse_count += 1 - worse_file.append(review_jsons[idx]) - elif scores[0] < scores[1]: - better_count += 1 - better_file.append(review_jsons[idx]) - else: - tie_count += 1 - tie_file.append(review_jsons[idx]) - ans1_score += scores[0] - ans2_score += scores[1] - - output_review_file.append(review_jsons[idx]) - - better_file.sort(key=lambda x: x['id']) - worse_file.sort(key=lambda x: x['id']) - tie_file.sort(key=lambda x: x['id']) - invalid_file.sort(key=lambda x: x['id']) - output_review_file.sort(key=lambda x: x['id']) - - name1 = os.path.basename(args.answer_file_list[0]).split("_answers")[0] - name2 = os.path.basename(args.answer_file_list[1]).split("_answers")[0] - prefix = f"{name1}_vs_{name2}" - - jdump(better_file, os.path.join( - args.output_folder, prefix, f"{prefix}_better.json")) - jdump(worse_file, os.path.join( - args.output_folder, prefix, f"{prefix}_worse.json")) - jdump(tie_file, os.path.join( - args.output_folder, prefix, f"{prefix}_tie.json")) - jdump(invalid_file, os.path.join( - args.output_folder, prefix, f"{prefix}_invalid.json")) - jdump(output_review_file, os.path.join( - args.output_folder, prefix, f"{prefix}_review.json")) - - if os.path.exists(os.path.join(args.output_folder, "results.json")): - results = jload(os.path.join(args.output_folder, "results.json")) - else: - results = {} - results[prefix] = {'model': [name1, name2], 'better': better_count, 'worse': worse_count, 'tie': tie_count, 'win_rate': better_count / - (len(reviews)-invalid_count), 'score': [ans1_score/(len(reviews)-invalid_count), ans2_score/(len(reviews)-invalid_count)]} - jdump(results, os.path.join(args.output_folder, "results.json")) - - logger.info(f' Total {invalid_count} invalid score pair(s).') - logger.info(f' Model {name2} has {better_count} better answer(s).') - logger.info(f' Model {name2} has {worse_count} worse answer(s).') - logger.info(f' 
{tie_count} answer(s) play(s) to a tie.') - logger.info( - f' Win rate of model {name2}: {better_count/(len(reviews)-invalid_count):.2f}') - logger.info( - f' Model {name1} average score: {ans1_score/(len(reviews)-invalid_count):.2f}') - logger.info( - f' Model {name2} average score: {ans2_score/(len(reviews)-invalid_count):.2f}') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Model evaluation.') - parser.add_argument('--answer_file_list', nargs='+', default=[]) - parser.add_argument('--prompt_file') - parser.add_argument('--reviewer_file') - parser.add_argument('--output_folder', type=str, default="./output") - parser.add_argument('--openai_key', type=str, default=None) - parser.add_argument('--model', type=str, default="gpt-4") - parser.add_argument('--num_workers', type=int, default=8) - parser.add_argument('--max_tokens', type=int, default=512, - help='maximum number of tokens produced in the output') - args = parser.parse_args() - - if args.openai_key is not None: - os.environ["OPENAI_API_KEY"] = args.openai_key - openai.api_key = os.getenv("OPENAI_API_KEY") - - evaluate(args) diff --git a/applications/Chat/evaluate/evaluate.sh b/applications/Chat/evaluate/evaluate.sh deleted file mode 100755 index c51aa941019e..000000000000 --- a/applications/Chat/evaluate/evaluate.sh +++ /dev/null @@ -1,9 +0,0 @@ -python evaluate.py \ - --answer_file_list "path to answers of model 1" "path to answers of model 2" \ - --prompt_file "path to prompt file" \ - --reviewer_file "path to reviewer file" \ - --output_folder "path to output folder" \ - --openai_key "your openai key" \ - --model "gpt-4" \ - --num_workers 8 \ - --max_tokens 512 \ diff --git a/applications/Chat/evaluate/evaluator.py b/applications/Chat/evaluate/evaluator.py index e9e4a713e1e9..b99509c990a3 100644 --- a/applications/Chat/evaluate/evaluator.py +++ b/applications/Chat/evaluate/evaluator.py @@ -1,32 +1,43 @@ +import os +from typing import Any, Dict, List + +import gpt_evaluate +import metrics import pandas as pd -from typing import Dict, List +from utils import get_data_per_category, jdump -import metrics as metrics class Evaluator(object): """ - A class named Evaluator includes GPT-3.5/GPT-4 evaluation + A class named Evaluator includes GPT-3.5/GPT-4 evaluation and automatic evaluation - + """ - def __init__(self, params: dict) -> None: + + def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, + Any]) -> None: self.params = params - self.stats = dict() - + self.battle_prompt = battle_prompt + self.gpt_evaluation_prompt = gpt_evaluation_prompt + self.automatic_metric_stats = dict() + self.gpt35_evaluation_results = dict() + self.battle_results = [] + def battle(self, answers1: List[Dict], answers2: List[Dict]) -> None: """ Comparison between two models using GPT-4 as the reviewer. """ - pass + + self.battle_results = gpt_evaluate.battle(answers1, answers2, self.battle_prompt) def evaluate(self, answers: List[Dict], targets: List[Dict]) -> None: """ A comprehensive evaluation of the answers from the model. - The function evaluates the model's performance from different perspectives + The function evaluates the model's performance from different perspectives using GPT-3.5, GPT-4, and off-the-shelf evaluation metrics. The metrics will be decided by the config file. 
-
+
         """
 
         def switch(metric):
@@ -47,28 +58,73 @@ def switch(metric):
             else:
                 raise ValueError(f"Unexpected metric")
 
+        answers_per_category = get_data_per_category(answers, list(self.params.keys()))
+        targets_per_category = get_data_per_category(targets, list(self.params.keys()))
+
         # automatic evaluation
         for category in self.params:
             category_metrics = self.params[category]["Metrics"]
-            targets_list = []
-            predicts_list = []
-            self.stats[category] = {}
-
-            for dict in targets:
-                if dict["category"] == category:
-                    if(dict["target"]):
-                        targets_list.append(dict["target"])
-                    else:
-                        targets_list.append(dict["output"])
-
-            for dict in answers:
-                if dict["category"] == category:
-                    predicts_list.append(dict["output"])
+            self.automatic_metric_stats[category] = {}
+
+            targets_list = [
+                target["target"] if target["target"] else target["output"] for target in targets_per_category[category]
+            ]
+            predicts_list = [answer["output"] for answer in answers_per_category[category]]
 
             for metric in category_metrics:
-                self.stats[category].update(switch(metric=metric))
+                self.automatic_metric_stats[category].update(switch(metric=metric))
+
+        # gpt35 evaluation
+        for category in self.params:
+            category_metrics = self.params[category]["GPT-3.5"]
+
+            prompt = self.gpt_evaluation_prompt.get(category, None)
+            if prompt is None:
+                print(f"No prompt for category {category}! Use prompt for category general now.")
+                prompt = self.gpt_evaluation_prompt["general"]
+
+            self.gpt35_evaluation_results[category] = gpt_evaluate.gpt35_evaluate(answers_per_category[category],
+                                                                                  prompt, category_metrics, category)
+
+    def save(self, path: str, model_name_list: List[str]) -> None:
+        """
+        Save evaluation results of GPT-3.5, GPT-4, and off-the-shelf evaluation metrics.
+
+        """
+
+        if len(model_name_list) == 2:
+            save_path = os.path.join(path, "gpt_evaluate", "battle_results")
+            gpt_evaluate.save_battle_results(self.battle_results, model_name_list[0], model_name_list[1], save_path)
+        else:
+            # save evaluation results for automatic metrics
+            automatic_df = pd.DataFrame(self.automatic_metric_stats)
+
+            automatic_results_save_path = os.path.join(path, "automatic_results")
+            if not os.path.exists(automatic_results_save_path):
+                os.makedirs(automatic_results_save_path)
+            automatic_df.to_csv(os.path.join(automatic_results_save_path, f"{model_name_list[0]}.csv"), index=True)
+
+            # Save evaluation results for GPT-3.5 evaluation metrics.
+            all_evaluations = []
+            base_save_path = os.path.join(path, "gpt_evaluate", "gpt35_evaluate_results")
+            evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results")
+
+            for category, evaluations in self.gpt35_evaluation_results.items():
+                jdump(
+                    evaluations,
+                    os.path.join(evaluation_results_save_path, model_name_list[0],
+                                 f"{category}_evaluation_results.json"))
+                all_evaluations.extend(evaluations)
+
+            jdump(all_evaluations,
+                  os.path.join(evaluation_results_save_path, f"{model_name_list[0]}_evaluation_results.json"))
+
+            # Start to calculate scores and save statistics.
+            evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics")
+            gpt_evaluate.save_gpt35_evaluation_statistics(model_name_list[0], all_evaluations,
+                                                          evaluation_statistics_save_path)
-    def save(self, path: str) -> None:
-        # automatic evaluation result
-        automatic_df = pd.DataFrame(self.stats)
-        automatic_df.to_csv(path,index=True)
+            # Save charts and csv.
+ evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses") + gpt_evaluate.analyze_gpt35_evaluation_statistics(evaluation_statistics_save_path, + evaluation_analyses_save_path) diff --git a/applications/Chat/evaluate/generate_answers.py b/applications/Chat/evaluate/generate_answers.py deleted file mode 100644 index fbebf5c5e6f6..000000000000 --- a/applications/Chat/evaluate/generate_answers.py +++ /dev/null @@ -1,173 +0,0 @@ -import argparse -import os -import random -import copy -import math -from tqdm import tqdm - -import torch -import torch.distributed as dist -import transformers - -from coati.models.bloom import BLOOMActor -from coati.models.gpt import GPTActor -from coati.models.opt import OPTActor -from coati.models.roberta import RoBERTaActor -from coati.models.llama import LlamaActor -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy -from transformers import AutoTokenizer, RobertaTokenizer -from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer - -from colossalai.logging import get_dist_logger - -from utils import jload, jdump, is_rank_0 - - -logger = get_dist_logger() - -PROMPT_DICT = { - "prompt_input": - ("Below is an instruction that describes a task, paired with an input that provides further context. " - "Write a response that appropriately completes the request.\n\n" - "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"), - "prompt_no_input": ("Below is an instruction that describes a task. " - "Write a response that appropriately completes the request.\n\n" - "### Instruction:\n{instruction}\n\n### Response:"), -} - - -def generate(args): - # torch.cuda.set_per_process_memory_fraction(0.4) - if args.strategy == 'naive': - strategy = NaiveStrategy() - elif args.strategy == 'ddp': - strategy = DDPStrategy() - elif args.strategy == 'colossalai_gemini': - strategy = ColossalAIStrategy(stage=3, placement_policy='cuda') - elif args.strategy == 'colossalai_zero2': - strategy = ColossalAIStrategy(stage=2, placement_policy='cuda') - elif args.strategy == 'colossalai_zero2_cpu': - strategy = ColossalAIStrategy(stage=2, placement_policy='cpu') - else: - raise ValueError(f'Unsupported strategy "{args.strategy}"') - - world_size = dist.get_world_size() - rank = dist.get_rank() - - with strategy.model_init_context(): - if args.model == 'gpt2': - actor = GPTActor(pretrained=args.model_path).to( - torch.cuda.current_device()) - elif args.model == 'bloom': - actor = BLOOMActor(pretrained=args.model_path).to( - torch.cuda.current_device()) - elif args.model == 'opt': - actor = OPTActor(pretrained=args.model_path).to( - torch.cuda.current_device()) - elif args.model == 'roberta': - actor = RoBERTaActor(pretrained=args.model_path).to( - torch.cuda.current_device()) - elif args.model == 'llama': - actor = LlamaActor(pretrained=args.model_path).to( - torch.float16).to(torch.cuda.current_device()) - else: - raise ValueError(f'Unsupported model "{args.model}"') - - if args.model == 'gpt2': - tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'bloom': - tokenizer = AutoTokenizer.from_pretrained('bigscience/bloom-560m') - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained('facebook/opt-350m') - elif args.model == 'roberta': - tokenizer = RobertaTokenizer.from_pretrained("roberta-base") - elif args.model == 'llama': - tokenizer = AutoTokenizer.from_pretrained(args.model_path, - 
padding_side="right", - use_fast=False, - ) - tokenizer.eos_token = '<\s>' - else: - raise ValueError(f'Unsupported model "{args.model}"') - - questions = [] - if args.max_datasets_size is not None: - questions = random.sample(jload(args.dataset), args.max_datasets_size) - if is_rank_0(): - logger.info( - f"Limiting dataset to {args.max_datasets_size} examples.") - questions = questions[rank:args.max_datasets_size:world_size] - - answers = copy.deepcopy(questions) - - prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"] - sources = [ - prompt_input.format_map(example) if example.get( - "input", "") != "" else prompt_no_input.format_map(example) - for example in questions - ] - - if is_rank_0(): - logger.info("Tokenizing inputs... This may take some time...") - - input_ids_list = [] - - for string in sources: - input_ids = tokenizer.encode(string, return_tensors='pt').squeeze(0) - input_ids_list.append(input_ids) - - bar = tqdm(range(math.ceil(len(input_ids_list)/args.batch_size)), - desc=f'steps', disable=not is_rank_0()) - - actor.eval() - with torch.no_grad(): - for i in range(0, len(input_ids_list), args.batch_size): - batch = input_ids_list[i:i+args.batch_size] - batch = [i.flip(dims=[0]) for i in batch] - batch = torch.nn.utils.rnn.pad_sequence(batch, - batch_first=True, - padding_value=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0).to(torch.cuda.current_device()) - batch = batch.flip(dims=[1]) - attention_mask = batch.ne(tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0) - - outputs = actor.model.generate(batch, attention_mask=attention_mask, - max_length=args.max_length, - do_sample=True, - top_k=50, - top_p=0.95, - num_return_sequences=1) - - outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) - for j in range(batch.size(0)): - answers[i + - j]['output'] = outputs[j].split("### Response:")[1].strip() - - bar.update() - - jdump(answers, os.path.join(args.answer_path, - f'{args.model_name}_answers_rank{rank}.json')) - - if is_rank_0(): - logger.info( - f'Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--strategy', - choices=['naive', 'ddp', 'colossalai_gemini', - 'colossalai_zero2', 'colossalai_zero2_cpu'], - default='naive') - parser.add_argument('--model', default='gpt2', - choices=['gpt2', 'bloom', 'opt', 'roberta', 'llama']) - parser.add_argument('--model_path', type=str, default=None) - parser.add_argument('--model_name', type=str, default='model') - parser.add_argument('--dataset', type=str, default=None) - parser.add_argument('--batch_size', type=int, default=1) - parser.add_argument('--max_datasets_size', type=int, default=None) - parser.add_argument('--answer_path', type=str, default="answer") - parser.add_argument('--max_length', type=int, default=1024) - args = parser.parse_args() - generate(args) diff --git a/applications/Chat/evaluate/generate_answers.sh b/applications/Chat/evaluate/generate_answers.sh deleted file mode 100755 index 36881f5f4f29..000000000000 --- a/applications/Chat/evaluate/generate_answers.sh +++ /dev/null @@ -1,25 +0,0 @@ -device_number=number of your devices -model_name="name of your model" -model_path="path to your model" -dataset="path to the question dataset" -answer_path="path to save the model answers" - -torchrun --standalone --nproc_per_node=$device_number generate_answers.py \ - --model 'llama' \ - --strategy ddp \ - --model_path 
$model_path \ - --model_name $model_name \ - --dataset $dataset \ - --batch_size 8 \ - --max_datasets_size 80 \ - --answer_path $answer_path \ - --max_length 512 - -python merge.py \ - --model_name $model_name \ - --shards $device_number \ - --answer_path $answer_path \ - -for (( i=0; i Dict[str, Any]: +def get_gpt35_evaluation(prompt: Dict[str, Any], + inst: Dict[str, Any], + metrics: List[str], + max_tokens: int = 2048) -> Dict[str, Any]: """ Use GPT-3.5 to evaluate one model answer. Args: prompt: a dictionary including prompt template, CoT and metrics. inst: the instruction that is needed to be evaluated. + metrics: the metrics for evaluation. max_tokens: the maximum number of tokens to generate in the completion. Returns: @@ -251,10 +254,13 @@ def get_gpt35_evaluation(prompt: Dict[str, Any], inst: Dict[str, Any], max_token question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + " " + inst["input"]) answer = inst["output"] - metrics = prompt["metrics"] inst["evaluation"] = {} - for metric in metrics.keys(): + for metric in metrics: + if prompt["metrics"].get(metric, None) is None: + raise Exception( + f"Unsupported metric {metric} for category {inst['category']}! You should add this metric in the prompt file!" + ) for i in range(MAX_API_RETRY): try: response = openai.Completion.create( @@ -282,88 +288,48 @@ def get_gpt35_evaluation(prompt: Dict[str, Any], inst: Dict[str, Any], max_token def gpt35_evaluate( answers: List[Dict], - prompts: List[Dict], - model_name: str, - save_path: str, + prompt: Dict[str, Any], + metrics: List[str], + category: str, ) -> List[Dict]: """ Use GPT-3.5 to evaluate model answers and save evaluation results. Args: answers: model answers. - prompts: prompts for all categories. - model_name: name of the model. - save_path: path to save GPT-3.5 evaluations. + prompt: prompt for GPT-3.5 evaluation. + metrics: metrics for GPT-3.5 evaluation. + category: the category of the model answers for evaluation. Returns: - All the evaluations of the given answers. + Evaluations of the given answers. """ - prompt_per_category = {prompt["category"]: prompt for prompt in prompts} - - data_per_category = {} - for answer in answers: - category = answer["category"] - - if answer["category"] in data_per_category.keys(): - data_per_category[category].append(answer) - else: - data_per_category[category] = [answer] - - categories_str = ", ".join(x for x in list(data_per_category.keys())) - print(f"The evaluated categories are {categories_str}.") - - if not os.path.exists(save_path): - os.makedirs(save_path) - - all_evaluations = [] - - for category, data in data_per_category.items(): - if prompt_per_category.get(category) == None: - print(f"No metrics for category {category}! Use category general now.") - prompt_per_category[category] = prompt_per_category["general"] - - if os.path.exists(os.path.join(save_path, model_name, f"{category}_evaluation_results.json")): - print( - f"The evaluation file for category {category} already exists. You are now re-evaluating category {category}!" 
- ) - - print(f"The number of instances of category {category}'s is {len(data)}.") - - evaluations = [] - - metrics_str = ", ".join(x for x in list(prompt_per_category[category]["metrics"].keys())) - print(f"Category {category}'s metrics are {metrics_str}.") + print(f"The number of instances of category {category}'s is {len(answers)}.") - with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: - futures = [] - for inst in data: - future = executor.submit(get_gpt35_evaluation, prompt_per_category[category], inst, 1) - futures.append(future) + evaluations = [] - for future in tqdm.tqdm( - concurrent.futures.as_completed(futures), - desc=f"{category}: ", - total=len(futures), - ): - evaluations.append(future.result()) + metrics_str = ", ".join(x for x in metrics) + print(f"Category {category}'s metrics are {metrics_str}.") - evaluations.sort(key=lambda x: x["id"]) + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + futures = [] + for inst in answers: + future = executor.submit(get_gpt35_evaluation, prompt, inst, metrics, 1) + futures.append(future) - jdump( - evaluations, - os.path.join(save_path, model_name, f"{category}_evaluation_results.json"), - ) - print(f"{category} done.") + for future in tqdm.tqdm( + concurrent.futures.as_completed(futures), + desc=f"{category}: ", + total=len(futures), + ): + evaluations.append(future.result()) - all_evaluations.extend(evaluations) + evaluations.sort(key=lambda x: x["id"]) - jdump( - all_evaluations, - os.path.join(save_path, f"{model_name}_evaluation_results.json"), - ) + print(f"{category} done.") - return all_evaluations + return evaluations def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float: @@ -458,7 +424,7 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> """ if not os.path.exists(statistics_path): - raise Exception("The given directory doesn't exist! No statistics found!") + raise Exception(f'The given directory "{statistics_path}" doesn\'t exist! 
No statistics found!') all_statistics = {} @@ -468,7 +434,7 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> all_statistics[model_name] = jload(os.path.join(statistics_path, file_name)) if len(list(all_statistics.keys())) == 0: - raise Exception("There are no statistics in the given directory!") + raise Exception(f'There are no statistics in the given directory "{statistics_path}"!') frame_all = { "model": [], diff --git a/applications/Chat/evaluate/merge.py b/applications/Chat/evaluate/merge.py deleted file mode 100644 index 295dd7fa7cb3..000000000000 --- a/applications/Chat/evaluate/merge.py +++ /dev/null @@ -1,25 +0,0 @@ -import argparse -import os - -from utils import jload, jdump - - -def generate(args): - dataset = [] - for i in range(args.shards): - shard = jload(os.path.join(args.answer_path, - f'{args.model_name}_answers_rank{i}.json')) - dataset.extend(shard) - - dataset.sort(key=lambda x: x['id']) - jdump(dataset, os.path.join(args.answer_path, - f'{args.model_name}_answers.json')) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--model_name', type=str, default='model') - parser.add_argument('--shards', type=int, default=4) - parser.add_argument('--answer_path', type=str, default="answer") - args = parser.parse_args() - generate(args) diff --git a/applications/Chat/evaluate/requirements.txt b/applications/Chat/evaluate/requirements.txt new file mode 100644 index 000000000000..b0301c2f17f8 --- /dev/null +++ b/applications/Chat/evaluate/requirements.txt @@ -0,0 +1,10 @@ +jieba +bert-score +rouge_chinese +scikit-metrics +nltk +openai +seaborn +pandas +matplotlib +numpy diff --git a/applications/Chat/evaluate/utils.py b/applications/Chat/evaluate/utils.py index 692ee007c080..e855cd45221c 100644 --- a/applications/Chat/evaluate/utils.py +++ b/applications/Chat/evaluate/utils.py @@ -2,10 +2,6 @@ import json import os -import torch.distributed as dist - -def is_rank_0() -> bool: - return not dist.is_initialized() or dist.get_rank() == 0 def _make_w_io_base(f, mode: str): if not isinstance(f, io.IOBase): @@ -15,11 +11,13 @@ def _make_w_io_base(f, mode: str): f = open(f, mode=mode) return f + def _make_r_io_base(f, mode: str): if not isinstance(f, io.IOBase): f = open(f, mode=mode) return f + def jdump(obj, f, mode="w", indent=4, default=str): """Dump a str or dictionary to a file in json format. Args: @@ -38,6 +36,7 @@ def jdump(obj, f, mode="w", indent=4, default=str): raise ValueError(f"Unexpected type: {type(obj)}") f.close() + def jload(f, mode="r"): """Load a .json file into a dictionary.""" f = _make_r_io_base(f, mode) @@ -45,9 +44,19 @@ def jload(f, mode="r"): f.close() return jdict + def get_json_list(file_path): with open(file_path, 'r') as f: json_list = [] for line in f: json_list.append(json.loads(line)) return json_list + + +def get_data_per_category(data, categories): + data_per_category = {category: [] for category in categories} + for item in data: + category = item["category"] + data_per_category[category].append(item) + + return data_per_category diff --git a/applications/Chat/requirements.txt b/applications/Chat/requirements.txt index 3dd41310ef4d..af7ff67861eb 100644 --- a/applications/Chat/requirements.txt +++ b/applications/Chat/requirements.txt @@ -11,8 +11,3 @@ sse_starlette wandb sentencepiece gpustat -jieba -bert-score -rouge_chinese -scikit-metrics -nltk
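
Note on the config file consumed by eval.py: the code above reads a top-level "language" field (only "cn" is handled) and a "category" mapping whose entries provide the metric types used by Evaluator ("Metrics" for automatic metrics, "GPT-3.5" for GPT-3.5 evaluation). The patch does not ship a sample config, so the sketch below is illustrative only: the category names and metric names are placeholders and must match what metrics.py and the GPT-3.5 prompt file actually support.

    # write_sample_config.py - minimal sketch of a config accepted by eval.py
    # (category and metric names below are placeholders, not part of this patch)
    import json

    sample_config = {
        "language": "cn",    # eval.py only enters the evaluation branch for "cn"
        "category": {
            "brainstorming": {
                "Metrics": ["BLEU", "ROUGE"],              # automatic metrics handled by metrics.py (assumed names)
                "GPT-3.5": ["relevance", "creativity"],    # metrics defined in the GPT-3.5 prompt file (assumed names)
            },
            "general": {
                "Metrics": ["BLEU", "ROUGE"],
                "GPT-3.5": ["relevance"],
            },
        },
    }

    # write the config so it can be passed to eval.py via --config_file
    with open("config_cn.json", "w", encoding="utf-8") as f:
        json.dump(sample_config, f, ensure_ascii=False, indent=4)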