From 2abf1910c063117e6728e90e3a723314a748730a Mon Sep 17 00:00:00 2001
From: Yuanchen Xu
Date: Tue, 23 May 2023 15:11:04 +0800
Subject: [PATCH] support gpt evaluation

---
 applications/Chat/evaluate/eval.py            |  89 ++++--
 applications/Chat/evaluate/eval.sh            |   9 +
 applications/Chat/evaluate/evaluate.py        | 256 ------------------
 applications/Chat/evaluate/evaluate.sh        |   9 -
 applications/Chat/evaluate/evaluator.py       | 114 ++++++--
 .../Chat/evaluate/generate_answers.py         | 173 ------------
 .../Chat/evaluate/generate_answers.sh         |  25 --
 .../Chat/evaluate/generate_gpt35_answers.py   |  98 -------
 .../Chat/evaluate/generate_gpt35_answers.sh   |   6 -
 applications/Chat/evaluate/gpt_evaluate.py    | 112 +++-----
 applications/Chat/evaluate/merge.py           |  25 --
 applications/Chat/evaluate/requirements.txt   |  10 +
 applications/Chat/evaluate/utils.py           |  17 +-
 applications/Chat/requirements.txt            |   5 -
 14 files changed, 228 insertions(+), 720 deletions(-)
 create mode 100755 applications/Chat/evaluate/eval.sh
 delete mode 100644 applications/Chat/evaluate/evaluate.py
 delete mode 100755 applications/Chat/evaluate/evaluate.sh
 delete mode 100644 applications/Chat/evaluate/generate_answers.py
 delete mode 100755 applications/Chat/evaluate/generate_answers.sh
 delete mode 100644 applications/Chat/evaluate/generate_gpt35_answers.py
 delete mode 100755 applications/Chat/evaluate/generate_gpt35_answers.sh
 delete mode 100644 applications/Chat/evaluate/merge.py
 create mode 100644 applications/Chat/evaluate/requirements.txt

diff --git a/applications/Chat/evaluate/eval.py b/applications/Chat/evaluate/eval.py
index f34437751827..69f2c272a116 100644
--- a/applications/Chat/evaluate/eval.py
+++ b/applications/Chat/evaluate/eval.py
@@ -1,11 +1,17 @@
 import argparse
 import json
+import os
 
+import openai
 from evaluator import Evaluator
 from utils import jload
 
+
 def main(args):
-    # load config
+    assert len(args.answer_file_list) == len(
+        args.model_name_list), "The number of answer files and model names should be equal!"
+
+    # load config
     config = jload(args.config_file)
 
     if config["language"] == "cn":
@@ -16,28 +22,77 @@
             for metric_type, metrics in config["category"][category].items():
                 metrics_all[metric_type] = metrics
             metrics_per_category[category] = metrics_all
-
+
+        battle_prompt = None
+        if args.battle_prompt_file:
+            battle_prompt = jload(args.battle_prompt_file)
+
+        gpt_evaluation_prompt = None
+        if args.gpt_evaluation_prompt_file:
+            gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file)
+
+        if len(args.model_name_list) == 2 and not battle_prompt:
+            raise Exception("No prompt file for battle provided. Please specify the prompt file for battle!")
+
+        if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:
+            raise Exception(
+                "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
+
         # initialize evaluator
-        evaluator = Evaluator(metrics_per_category)
-        if args.answers2_file:
-            answers1 = jload(args.answers1_file)
-            answers2 = jload(args.answers2_file)
+        evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt)
+        if len(args.model_name_list) == 2:
+            answers1 = jload(args.answer_file_list[0])
+            answers2 = jload(args.answer_file_list[1])
+
+            assert len(answers1) == len(answers2), "The number of answers for two models should be equal!"
+
             evaluator.battle(answers1=answers1, answers2=answers2)
-        else:
+            evaluator.save(args.save_path, args.model_name_list)
+        elif len(args.model_name_list) == 1:
             targets = jload(args.target_file)
-            answers = jload(args.answers1_file)
+            answers = jload(args.answer_file_list[0])
+
+            assert len(targets) == len(answers), "The number of target answers and model answers should be equal!"
+
             evaluator.evaluate(answers=answers, targets=targets)
-        evaluator.save(args.save_path)
+            evaluator.save(args.save_path, args.model_name_list)
+        else:
+            raise ValueError("Unsupported number of answer files and model names!")
     else:
-        raise ValueError(f'Unsupported language')
+        raise ValueError(f'Unsupported language {config["language"]}!')
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--config_file', type=str, default=None, help='path to the file of target results')
-    parser.add_argument('--target_file', type=str, default=None, help='path to the file of target results')
-    parser.add_argument('--answers1_file', type=str, default=None, help='path to the file of one model prediction')
-    parser.add_argument('--answers2_file', type=str, default=None, help='path to the file of the other model prediction')
-    parser.add_argument('--save_path', type=str, default="results.csv", help='path to the csv file to save evaluation results')
+    parser = argparse.ArgumentParser(description='ColossalAI LLM evaluation pipeline.')
+    parser.add_argument('--config_file',
+                        type=str,
+                        default=None,
+                        required=True,
+                        help='path to the config file')
+    parser.add_argument('--battle_prompt_file', type=str, default=None, help='path to the prompt file for battle')
+    parser.add_argument('--gpt_evaluation_prompt_file',
+                        type=str,
+                        default=None,
+                        help='path to the prompt file for gpt evaluation')
+    parser.add_argument('--target_file', type=str, default=None, help='path to the target answer (ground truth) file')
+    parser.add_argument('--answer_file_list',
+                        type=str,
+                        nargs='+',
+                        default=[],
+                        required=True,
+                        help='path to the answer files of at most 2 models')
+    parser.add_argument('--model_name_list',
+                        type=str,
+                        nargs='+',
+                        default=[],
+                        required=True,
+                        help='the names of at most 2 models')
+    parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
+    parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
     args = parser.parse_args()
-    main(args)
\ No newline at end of file
+
+    if args.openai_key is not None:
+        os.environ["OPENAI_API_KEY"] = args.openai_key
+    openai.api_key = os.getenv("OPENAI_API_KEY")
+
+    main(args)
diff --git a/applications/Chat/evaluate/eval.sh b/applications/Chat/evaluate/eval.sh
new file mode 100755
index 000000000000..f5729e6ee5c7
--- /dev/null
+++ b/applications/Chat/evaluate/eval.sh
@@ -0,0 +1,9 @@
+python eval.py \
+    --config_file "path to the config file" \
+    --battle_prompt_file "path to the prompt file for battle" \
+    --gpt_evaluation_prompt_file "path to the prompt file for gpt evaluation" \
+    --target_file "path to the target answer file" \
+    --answer_file_list "path to the answer files of at most 2 models" \
+    --model_name_list "the names of at most 2 models" \
+    --save_path "path to save results" \
+    --openai_key "your openai key" \
diff --git a/applications/Chat/evaluate/evaluate.py b/applications/Chat/evaluate/evaluate.py
deleted file mode 100644
index 2f9c9ce8e10d..000000000000
--- a/applications/Chat/evaluate/evaluate.py
+++ /dev/null
@@ -1,256 +0,0 @@
-# Adapted
form https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py -# Copyright 2023 LM-SYS@FastChat - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import argparse -import json -import os -import time -import re -import concurrent.futures - -import openai -import tqdm -import shortuuid -import logging - -from utils import jload, jdump, get_json_list - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -MAX_API_RETRY = 3 - - -def get_eval(sys_prompt, user_prompt: str, answer_id: int, max_tokens: int, model: str): - logging.basicConfig(level=logging.INFO) - for _ in range(MAX_API_RETRY): - try: - response = openai.ChatCompletion.create( - model=model, - messages=[{ - 'role': 'system', - 'content': sys_prompt - }, { - 'role': 'user', - 'content': user_prompt, - }], - temperature=0.2, - max_tokens=max_tokens, - ) - review = response['choices'][0]['message']['content'] - return {"review": review, 'id': answer_id} - except Exception as e: - logger.error(e) - time.sleep(1) - logger.error(f' Review {answer_id} failed after {MAX_API_RETRY} retries.') - return 'error' - - -def parse_score(review): - try: - pattern = re.compile('([0-9]|10) out of 10') - sp = re.findall(pattern, review) - if len(re.findall(pattern, review)) == 2: - return [float(sp[0]), float(sp[1])] - - pattern = re.compile('a score of ([0-9]|10)') - sp = re.findall(pattern, review) - if len(re.findall(pattern, review)) == 2: - return [float(sp[0]), float(sp[1])] - - pattern = re.compile('([0-9]|10)/10') - sp = re.findall(pattern, review) - if len(re.findall(pattern, review)) == 2: - return [float(sp[0]), float(sp[1])] - - score_pair = review.split('\n')[0] - score_pair = score_pair.replace(',', ' ') - sp = score_pair.split(' ') - if len(sp) == 2: - return [float(sp[0]), float(sp[1])] - else: - raise Exception('Invalid score pair.') - except Exception as e: - return [-1, -1] - - -def gen_prompt(reviewer_jsons, prompt_jsons, cat, ques, ans1, ans2): - reviewer_idx = 0 - for idx, reviewer in enumerate(reviewer_jsons): - if reviewer['category'] == cat: - reviewer_idx = idx - break - prompt_id = reviewer_jsons[reviewer_idx]['prompt_id'] - prompt_json = prompt_jsons[prompt_id-1] - assert prompt_json['prompt_id'] == prompt_id - - sys_prompt = prompt_json['system_prompt'] - prompt_template = prompt_json['prompt_template'] - defaults = prompt_json['defaults'] - prompt = prompt_template.format( - question=ques, answer_1=ans1, answer_2=ans2, **defaults) - - return sys_prompt, prompt, reviewer_idx+1 - - -def evaluate(args): - answer1_jsons = jload(args.answer_file_list[0]) - answer2_jsons = jload(args.answer_file_list[1]) - reviewer_jsons = get_json_list(args.reviewer_file) - prompt_jsons = get_json_list(args.prompt_file) - - assert len(answer1_jsons) == len(answer2_jsons) - - handles = [] - review_jsons = [] - - total_len = len(answer1_jsons) - question_idx_list = list(range(total_len)) - - logger.info( - f' Total number of answers: {len(answer2_jsons)}.') - - reviews = [] - with 
concurrent.futures.ThreadPoolExecutor(max_workers=args.num_workers) as executor: - futures = [] - for i in question_idx_list: - assert answer1_jsons[i]['id'] == answer2_jsons[i]['id'] - answer_id = answer1_jsons[i]['id'] - - ques = answer1_jsons[i]['instruction'] if answer1_jsons[i]['input'] == "" else answer1_jsons[i]['instruction'] + \ - " " + answer1_jsons[i]['input'] - cat = answer1_jsons[i]['category'] - ans1 = answer1_jsons[i]['output'] - ans2 = answer2_jsons[i]['output'] - - sys_prompt, prompt, reviewer_id = gen_prompt( - reviewer_jsons, prompt_jsons, cat, ques, ans1, ans2) - - review_id = shortuuid.uuid() - review_jsons.append({ - 'review_id': review_id, - 'id': answer_id, - 'reviewer_id': reviewer_id, - 'metadata': {} - }) - - future = executor.submit( - get_eval, sys_prompt, prompt, answer_id, args.max_tokens, args.model) - futures.append(future) - - for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): - reviews.append(future.result()) - - reviews.sort(key=lambda x: x['id']) - review_jsons.sort(key=lambda x: x['id']) - - ans1_score = 0 - ans2_score = 0 - better_count = 0 - worse_count = 0 - tie_count = 0 - invalid_count = 0 - - better_file = [] - worse_file = [] - tie_file = [] - invalid_file = [] - output_review_file = [] - - for idx, review in enumerate(reviews): - scores = parse_score(review['review']) - review_jsons[idx]['review'] = review['review'] - review_jsons[idx]['score'] = scores - - if scores[0] == -1 and scores[1] == -1: - invalid_count += 1 - invalid_file.append(review_jsons[idx]) - logger.info(f' Invalid score pair: {review_jsons[idx]["id"]}.') - else: - if scores[0] > scores[1]: - worse_count += 1 - worse_file.append(review_jsons[idx]) - elif scores[0] < scores[1]: - better_count += 1 - better_file.append(review_jsons[idx]) - else: - tie_count += 1 - tie_file.append(review_jsons[idx]) - ans1_score += scores[0] - ans2_score += scores[1] - - output_review_file.append(review_jsons[idx]) - - better_file.sort(key=lambda x: x['id']) - worse_file.sort(key=lambda x: x['id']) - tie_file.sort(key=lambda x: x['id']) - invalid_file.sort(key=lambda x: x['id']) - output_review_file.sort(key=lambda x: x['id']) - - name1 = os.path.basename(args.answer_file_list[0]).split("_answers")[0] - name2 = os.path.basename(args.answer_file_list[1]).split("_answers")[0] - prefix = f"{name1}_vs_{name2}" - - jdump(better_file, os.path.join( - args.output_folder, prefix, f"{prefix}_better.json")) - jdump(worse_file, os.path.join( - args.output_folder, prefix, f"{prefix}_worse.json")) - jdump(tie_file, os.path.join( - args.output_folder, prefix, f"{prefix}_tie.json")) - jdump(invalid_file, os.path.join( - args.output_folder, prefix, f"{prefix}_invalid.json")) - jdump(output_review_file, os.path.join( - args.output_folder, prefix, f"{prefix}_review.json")) - - if os.path.exists(os.path.join(args.output_folder, "results.json")): - results = jload(os.path.join(args.output_folder, "results.json")) - else: - results = {} - results[prefix] = {'model': [name1, name2], 'better': better_count, 'worse': worse_count, 'tie': tie_count, 'win_rate': better_count / - (len(reviews)-invalid_count), 'score': [ans1_score/(len(reviews)-invalid_count), ans2_score/(len(reviews)-invalid_count)]} - jdump(results, os.path.join(args.output_folder, "results.json")) - - logger.info(f' Total {invalid_count} invalid score pair(s).') - logger.info(f' Model {name2} has {better_count} better answer(s).') - logger.info(f' Model {name2} has {worse_count} worse answer(s).') - logger.info(f' 
{tie_count} answer(s) play(s) to a tie.') - logger.info( - f' Win rate of model {name2}: {better_count/(len(reviews)-invalid_count):.2f}') - logger.info( - f' Model {name1} average score: {ans1_score/(len(reviews)-invalid_count):.2f}') - logger.info( - f' Model {name2} average score: {ans2_score/(len(reviews)-invalid_count):.2f}') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Model evaluation.') - parser.add_argument('--answer_file_list', nargs='+', default=[]) - parser.add_argument('--prompt_file') - parser.add_argument('--reviewer_file') - parser.add_argument('--output_folder', type=str, default="./output") - parser.add_argument('--openai_key', type=str, default=None) - parser.add_argument('--model', type=str, default="gpt-4") - parser.add_argument('--num_workers', type=int, default=8) - parser.add_argument('--max_tokens', type=int, default=512, - help='maximum number of tokens produced in the output') - args = parser.parse_args() - - if args.openai_key is not None: - os.environ["OPENAI_API_KEY"] = args.openai_key - openai.api_key = os.getenv("OPENAI_API_KEY") - - evaluate(args) diff --git a/applications/Chat/evaluate/evaluate.sh b/applications/Chat/evaluate/evaluate.sh deleted file mode 100755 index c51aa941019e..000000000000 --- a/applications/Chat/evaluate/evaluate.sh +++ /dev/null @@ -1,9 +0,0 @@ -python evaluate.py \ - --answer_file_list "path to answers of model 1" "path to answers of model 2" \ - --prompt_file "path to prompt file" \ - --reviewer_file "path to reviewer file" \ - --output_folder "path to output folder" \ - --openai_key "your openai key" \ - --model "gpt-4" \ - --num_workers 8 \ - --max_tokens 512 \ diff --git a/applications/Chat/evaluate/evaluator.py b/applications/Chat/evaluate/evaluator.py index e9e4a713e1e9..b99509c990a3 100644 --- a/applications/Chat/evaluate/evaluator.py +++ b/applications/Chat/evaluate/evaluator.py @@ -1,32 +1,43 @@ +import os +from typing import Any, Dict, List + +import gpt_evaluate +import metrics import pandas as pd -from typing import Dict, List +from utils import get_data_per_category, jdump -import metrics as metrics class Evaluator(object): """ - A class named Evaluator includes GPT-3.5/GPT-4 evaluation + A class named Evaluator includes GPT-3.5/GPT-4 evaluation and automatic evaluation - + """ - def __init__(self, params: dict) -> None: + + def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, + Any]) -> None: self.params = params - self.stats = dict() - + self.battle_prompt = battle_prompt + self.gpt_evaluation_prompt = gpt_evaluation_prompt + self.automatic_metric_stats = dict() + self.gpt35_evaluation_results = dict() + self.battle_results = [] + def battle(self, answers1: List[Dict], answers2: List[Dict]) -> None: """ Comparison between two models using GPT-4 as the reviewer. """ - pass + + self.battle_results = gpt_evaluate.battle(answers1, answers2, self.battle_prompt) def evaluate(self, answers: List[Dict], targets: List[Dict]) -> None: """ A comprehensive evaluation of the answers from the model. - The function evaluates the model's performance from different perspectives + The function evaluates the model's performance from different perspectives using GPT-3.5, GPT-4, and off-the-shelf evaluation metrics. The metrics will be decided by the config file. 
-
+
         """
 
         def switch(metric):
@@ -47,28 +58,73 @@ def switch(metric):
             else:
                 raise ValueError(f"Unexpected metric")
 
+        answers_per_category = get_data_per_category(answers, list(self.params.keys()))
+        targets_per_category = get_data_per_category(targets, list(self.params.keys()))
+
         # automatic evaluation
         for category in self.params:
             category_metrics = self.params[category]["Metrics"]
-            targets_list = []
-            predicts_list = []
-            self.stats[category] = {}
-
-            for dict in targets:
-                if dict["category"] == category:
-                    if(dict["target"]):
-                        targets_list.append(dict["target"])
-                    else:
-                        targets_list.append(dict["output"])
-
-            for dict in answers:
-                if dict["category"] == category:
-                    predicts_list.append(dict["output"])
+            self.automatic_metric_stats[category] = {}
+
+            targets_list = [
+                target["target"] if target["target"] else target["output"] for target in targets_per_category[category]
+            ]
+            predicts_list = [answer["output"] for answer in answers_per_category[category]]
 
             for metric in category_metrics:
-                self.stats[category].update(switch(metric=metric))
+                self.automatic_metric_stats[category].update(switch(metric=metric))
+
+        # gpt35 evaluation
+        for category in self.params:
+            category_metrics = self.params[category]["GPT-3.5"]
+
+            prompt = self.gpt_evaluation_prompt.get(category, None)
+            if prompt is None:
+                print(f"No prompt for category {category}! Use prompt for category general now.")
+                prompt = self.gpt_evaluation_prompt["general"]
+
+            self.gpt35_evaluation_results[category] = gpt_evaluate.gpt35_evaluate(answers_per_category[category],
+                                                                                  prompt, category_metrics, category)
+
+    def save(self, path: str, model_name_list: List[str]) -> None:
+        """
+        Save evaluation results of GPT-3.5, GPT-4, and off-the-shelf evaluation metrics.
+
+        """
+
+        if len(model_name_list) == 2:
+            save_path = os.path.join(path, "gpt_evaluate", "battle_results")
+            gpt_evaluate.save_battle_results(self.battle_results, model_name_list[0], model_name_list[1], save_path)
+        else:
+            # save evaluation results for automatic metrics
+            automatic_df = pd.DataFrame(self.automatic_metric_stats)
+
+            automatic_results_save_path = os.path.join(path, "automatic_results")
+            if not os.path.exists(automatic_results_save_path):
+                os.makedirs(automatic_results_save_path)
+            automatic_df.to_csv(os.path.join(automatic_results_save_path, f"{model_name_list[0]}.csv"), index=True)
+
+            # Save evaluation results for GPT-3.5 evaluation metrics.
+            all_evaluations = []
+            base_save_path = os.path.join(path, "gpt_evaluate", "gpt35_evaluate_results")
+            evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results")
+
+            for category, evaluations in self.gpt35_evaluation_results.items():
+                jdump(
+                    evaluations,
+                    os.path.join(evaluation_results_save_path, model_name_list[0],
+                                 f"{category}_evaluation_results.json"))
+                all_evaluations.extend(evaluations)
+
+            jdump(all_evaluations,
+                  os.path.join(evaluation_results_save_path, f"{model_name_list[0]}_evaluation_results.json"))
+
+            # Start to calculate scores and save statistics.
+            evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics")
+            gpt_evaluate.save_gpt35_evaluation_statistics(model_name_list[0], all_evaluations,
+                                                          evaluation_statistics_save_path)
-    def save(self, path: str) -> None:
-        # automatic evaluation result
-        automatic_df = pd.DataFrame(self.stats)
-        automatic_df.to_csv(path,index=True)
+            # Save charts and csv.
+ evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses") + gpt_evaluate.analyze_gpt35_evaluation_statistics(evaluation_statistics_save_path, + evaluation_analyses_save_path) diff --git a/applications/Chat/evaluate/generate_answers.py b/applications/Chat/evaluate/generate_answers.py deleted file mode 100644 index fbebf5c5e6f6..000000000000 --- a/applications/Chat/evaluate/generate_answers.py +++ /dev/null @@ -1,173 +0,0 @@ -import argparse -import os -import random -import copy -import math -from tqdm import tqdm - -import torch -import torch.distributed as dist -import transformers - -from coati.models.bloom import BLOOMActor -from coati.models.gpt import GPTActor -from coati.models.opt import OPTActor -from coati.models.roberta import RoBERTaActor -from coati.models.llama import LlamaActor -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy -from transformers import AutoTokenizer, RobertaTokenizer -from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer - -from colossalai.logging import get_dist_logger - -from utils import jload, jdump, is_rank_0 - - -logger = get_dist_logger() - -PROMPT_DICT = { - "prompt_input": - ("Below is an instruction that describes a task, paired with an input that provides further context. " - "Write a response that appropriately completes the request.\n\n" - "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"), - "prompt_no_input": ("Below is an instruction that describes a task. " - "Write a response that appropriately completes the request.\n\n" - "### Instruction:\n{instruction}\n\n### Response:"), -} - - -def generate(args): - # torch.cuda.set_per_process_memory_fraction(0.4) - if args.strategy == 'naive': - strategy = NaiveStrategy() - elif args.strategy == 'ddp': - strategy = DDPStrategy() - elif args.strategy == 'colossalai_gemini': - strategy = ColossalAIStrategy(stage=3, placement_policy='cuda') - elif args.strategy == 'colossalai_zero2': - strategy = ColossalAIStrategy(stage=2, placement_policy='cuda') - elif args.strategy == 'colossalai_zero2_cpu': - strategy = ColossalAIStrategy(stage=2, placement_policy='cpu') - else: - raise ValueError(f'Unsupported strategy "{args.strategy}"') - - world_size = dist.get_world_size() - rank = dist.get_rank() - - with strategy.model_init_context(): - if args.model == 'gpt2': - actor = GPTActor(pretrained=args.model_path).to( - torch.cuda.current_device()) - elif args.model == 'bloom': - actor = BLOOMActor(pretrained=args.model_path).to( - torch.cuda.current_device()) - elif args.model == 'opt': - actor = OPTActor(pretrained=args.model_path).to( - torch.cuda.current_device()) - elif args.model == 'roberta': - actor = RoBERTaActor(pretrained=args.model_path).to( - torch.cuda.current_device()) - elif args.model == 'llama': - actor = LlamaActor(pretrained=args.model_path).to( - torch.float16).to(torch.cuda.current_device()) - else: - raise ValueError(f'Unsupported model "{args.model}"') - - if args.model == 'gpt2': - tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'bloom': - tokenizer = AutoTokenizer.from_pretrained('bigscience/bloom-560m') - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained('facebook/opt-350m') - elif args.model == 'roberta': - tokenizer = RobertaTokenizer.from_pretrained("roberta-base") - elif args.model == 'llama': - tokenizer = AutoTokenizer.from_pretrained(args.model_path, - 
padding_side="right", - use_fast=False, - ) - tokenizer.eos_token = '<\s>' - else: - raise ValueError(f'Unsupported model "{args.model}"') - - questions = [] - if args.max_datasets_size is not None: - questions = random.sample(jload(args.dataset), args.max_datasets_size) - if is_rank_0(): - logger.info( - f"Limiting dataset to {args.max_datasets_size} examples.") - questions = questions[rank:args.max_datasets_size:world_size] - - answers = copy.deepcopy(questions) - - prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"] - sources = [ - prompt_input.format_map(example) if example.get( - "input", "") != "" else prompt_no_input.format_map(example) - for example in questions - ] - - if is_rank_0(): - logger.info("Tokenizing inputs... This may take some time...") - - input_ids_list = [] - - for string in sources: - input_ids = tokenizer.encode(string, return_tensors='pt').squeeze(0) - input_ids_list.append(input_ids) - - bar = tqdm(range(math.ceil(len(input_ids_list)/args.batch_size)), - desc=f'steps', disable=not is_rank_0()) - - actor.eval() - with torch.no_grad(): - for i in range(0, len(input_ids_list), args.batch_size): - batch = input_ids_list[i:i+args.batch_size] - batch = [i.flip(dims=[0]) for i in batch] - batch = torch.nn.utils.rnn.pad_sequence(batch, - batch_first=True, - padding_value=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0).to(torch.cuda.current_device()) - batch = batch.flip(dims=[1]) - attention_mask = batch.ne(tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0) - - outputs = actor.model.generate(batch, attention_mask=attention_mask, - max_length=args.max_length, - do_sample=True, - top_k=50, - top_p=0.95, - num_return_sequences=1) - - outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) - for j in range(batch.size(0)): - answers[i + - j]['output'] = outputs[j].split("### Response:")[1].strip() - - bar.update() - - jdump(answers, os.path.join(args.answer_path, - f'{args.model_name}_answers_rank{rank}.json')) - - if is_rank_0(): - logger.info( - f'Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--strategy', - choices=['naive', 'ddp', 'colossalai_gemini', - 'colossalai_zero2', 'colossalai_zero2_cpu'], - default='naive') - parser.add_argument('--model', default='gpt2', - choices=['gpt2', 'bloom', 'opt', 'roberta', 'llama']) - parser.add_argument('--model_path', type=str, default=None) - parser.add_argument('--model_name', type=str, default='model') - parser.add_argument('--dataset', type=str, default=None) - parser.add_argument('--batch_size', type=int, default=1) - parser.add_argument('--max_datasets_size', type=int, default=None) - parser.add_argument('--answer_path', type=str, default="answer") - parser.add_argument('--max_length', type=int, default=1024) - args = parser.parse_args() - generate(args) diff --git a/applications/Chat/evaluate/generate_answers.sh b/applications/Chat/evaluate/generate_answers.sh deleted file mode 100755 index 36881f5f4f29..000000000000 --- a/applications/Chat/evaluate/generate_answers.sh +++ /dev/null @@ -1,25 +0,0 @@ -device_number=number of your devices -model_name="name of your model" -model_path="path to your model" -dataset="path to the question dataset" -answer_path="path to save the model answers" - -torchrun --standalone --nproc_per_node=$device_number generate_answers.py \ - --model 'llama' \ - --strategy ddp \ - --model_path 
$model_path \ - --model_name $model_name \ - --dataset $dataset \ - --batch_size 8 \ - --max_datasets_size 80 \ - --answer_path $answer_path \ - --max_length 512 - -python merge.py \ - --model_name $model_name \ - --shards $device_number \ - --answer_path $answer_path \ - -for (( i=0; i Dict[str, Any]: +def get_gpt35_evaluation(prompt: Dict[str, Any], + inst: Dict[str, Any], + metrics: List[str], + max_tokens: int = 2048) -> Dict[str, Any]: """ Use GPT-3.5 to evaluate one model answer. Args: prompt: a dictionary including prompt template, CoT and metrics. inst: the instruction that is needed to be evaluated. + metrics: the metrics for evaluation. max_tokens: the maximum number of tokens to generate in the completion. Returns: @@ -251,10 +254,13 @@ def get_gpt35_evaluation(prompt: Dict[str, Any], inst: Dict[str, Any], max_token question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + " " + inst["input"]) answer = inst["output"] - metrics = prompt["metrics"] inst["evaluation"] = {} - for metric in metrics.keys(): + for metric in metrics: + if prompt["metrics"].get(metric, None) is None: + raise Exception( + f"Unsupported metric {metric} for category {inst['category']}! You should add this metric in the prompt file!" + ) for i in range(MAX_API_RETRY): try: response = openai.Completion.create( @@ -282,88 +288,48 @@ def get_gpt35_evaluation(prompt: Dict[str, Any], inst: Dict[str, Any], max_token def gpt35_evaluate( answers: List[Dict], - prompts: List[Dict], - model_name: str, - save_path: str, + prompt: Dict[str, Any], + metrics: List[str], + category: str, ) -> List[Dict]: """ Use GPT-3.5 to evaluate model answers and save evaluation results. Args: answers: model answers. - prompts: prompts for all categories. - model_name: name of the model. - save_path: path to save GPT-3.5 evaluations. + prompt: prompt for GPT-3.5 evaluation. + metrics: metrics for GPT-3.5 evaluation. + category: the category of the model answers for evaluation. Returns: - All the evaluations of the given answers. + Evaluations of the given answers. """ - prompt_per_category = {prompt["category"]: prompt for prompt in prompts} - - data_per_category = {} - for answer in answers: - category = answer["category"] - - if answer["category"] in data_per_category.keys(): - data_per_category[category].append(answer) - else: - data_per_category[category] = [answer] - - categories_str = ", ".join(x for x in list(data_per_category.keys())) - print(f"The evaluated categories are {categories_str}.") - - if not os.path.exists(save_path): - os.makedirs(save_path) - - all_evaluations = [] - - for category, data in data_per_category.items(): - if prompt_per_category.get(category) == None: - print(f"No metrics for category {category}! Use category general now.") - prompt_per_category[category] = prompt_per_category["general"] - - if os.path.exists(os.path.join(save_path, model_name, f"{category}_evaluation_results.json")): - print( - f"The evaluation file for category {category} already exists. You are now re-evaluating category {category}!" 
- ) - - print(f"The number of instances of category {category}'s is {len(data)}.") - - evaluations = [] - - metrics_str = ", ".join(x for x in list(prompt_per_category[category]["metrics"].keys())) - print(f"Category {category}'s metrics are {metrics_str}.") + print(f"The number of instances of category {category}'s is {len(answers)}.") - with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: - futures = [] - for inst in data: - future = executor.submit(get_gpt35_evaluation, prompt_per_category[category], inst, 1) - futures.append(future) + evaluations = [] - for future in tqdm.tqdm( - concurrent.futures.as_completed(futures), - desc=f"{category}: ", - total=len(futures), - ): - evaluations.append(future.result()) + metrics_str = ", ".join(x for x in metrics) + print(f"Category {category}'s metrics are {metrics_str}.") - evaluations.sort(key=lambda x: x["id"]) + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + futures = [] + for inst in answers: + future = executor.submit(get_gpt35_evaluation, prompt, inst, metrics, 1) + futures.append(future) - jdump( - evaluations, - os.path.join(save_path, model_name, f"{category}_evaluation_results.json"), - ) - print(f"{category} done.") + for future in tqdm.tqdm( + concurrent.futures.as_completed(futures), + desc=f"{category}: ", + total=len(futures), + ): + evaluations.append(future.result()) - all_evaluations.extend(evaluations) + evaluations.sort(key=lambda x: x["id"]) - jdump( - all_evaluations, - os.path.join(save_path, f"{model_name}_evaluation_results.json"), - ) + print(f"{category} done.") - return all_evaluations + return evaluations def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float: @@ -458,7 +424,7 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> """ if not os.path.exists(statistics_path): - raise Exception("The given directory doesn't exist! No statistics found!") + raise Exception(f'The given directory "{statistics_path}" doesn\'t exist! 
No statistics found!') all_statistics = {} @@ -468,7 +434,7 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> all_statistics[model_name] = jload(os.path.join(statistics_path, file_name)) if len(list(all_statistics.keys())) == 0: - raise Exception("There are no statistics in the given directory!") + raise Exception(f'There are no statistics in the given directory "{statistics_path}"!') frame_all = { "model": [], diff --git a/applications/Chat/evaluate/merge.py b/applications/Chat/evaluate/merge.py deleted file mode 100644 index 295dd7fa7cb3..000000000000 --- a/applications/Chat/evaluate/merge.py +++ /dev/null @@ -1,25 +0,0 @@ -import argparse -import os - -from utils import jload, jdump - - -def generate(args): - dataset = [] - for i in range(args.shards): - shard = jload(os.path.join(args.answer_path, - f'{args.model_name}_answers_rank{i}.json')) - dataset.extend(shard) - - dataset.sort(key=lambda x: x['id']) - jdump(dataset, os.path.join(args.answer_path, - f'{args.model_name}_answers.json')) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--model_name', type=str, default='model') - parser.add_argument('--shards', type=int, default=4) - parser.add_argument('--answer_path', type=str, default="answer") - args = parser.parse_args() - generate(args) diff --git a/applications/Chat/evaluate/requirements.txt b/applications/Chat/evaluate/requirements.txt new file mode 100644 index 000000000000..b0301c2f17f8 --- /dev/null +++ b/applications/Chat/evaluate/requirements.txt @@ -0,0 +1,10 @@ +jieba +bert-score +rouge_chinese +scikit-metrics +nltk +openai +seaborn +pandas +matplotlib +numpy diff --git a/applications/Chat/evaluate/utils.py b/applications/Chat/evaluate/utils.py index 692ee007c080..e855cd45221c 100644 --- a/applications/Chat/evaluate/utils.py +++ b/applications/Chat/evaluate/utils.py @@ -2,10 +2,6 @@ import json import os -import torch.distributed as dist - -def is_rank_0() -> bool: - return not dist.is_initialized() or dist.get_rank() == 0 def _make_w_io_base(f, mode: str): if not isinstance(f, io.IOBase): @@ -15,11 +11,13 @@ def _make_w_io_base(f, mode: str): f = open(f, mode=mode) return f + def _make_r_io_base(f, mode: str): if not isinstance(f, io.IOBase): f = open(f, mode=mode) return f + def jdump(obj, f, mode="w", indent=4, default=str): """Dump a str or dictionary to a file in json format. Args: @@ -38,6 +36,7 @@ def jdump(obj, f, mode="w", indent=4, default=str): raise ValueError(f"Unexpected type: {type(obj)}") f.close() + def jload(f, mode="r"): """Load a .json file into a dictionary.""" f = _make_r_io_base(f, mode) @@ -45,9 +44,19 @@ def jload(f, mode="r"): f.close() return jdict + def get_json_list(file_path): with open(file_path, 'r') as f: json_list = [] for line in f: json_list.append(json.loads(line)) return json_list + + +def get_data_per_category(data, categories): + data_per_category = {category: [] for category in categories} + for item in data: + category = item["category"] + data_per_category[category].append(item) + + return data_per_category diff --git a/applications/Chat/requirements.txt b/applications/Chat/requirements.txt index 3dd41310ef4d..af7ff67861eb 100644 --- a/applications/Chat/requirements.txt +++ b/applications/Chat/requirements.txt @@ -11,8 +11,3 @@ sse_starlette wandb sentencepiece gpustat -jieba -bert-score -rouge_chinese -scikit-metrics -nltk
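
Note on the config file consumed by eval.py: the code above reads a top-level "language" field (only "cn" is handled) and a "category" mapping whose entries provide the metric types used by Evaluator ("Metrics" for automatic metrics, "GPT-3.5" for GPT-3.5 evaluation). The patch does not ship a sample config, so the sketch below is illustrative only: the category names and metric names are placeholders and must match what metrics.py and the GPT-3.5 prompt file actually support.

    # write_sample_config.py - minimal sketch of a config accepted by eval.py
    # (category and metric names below are placeholders, not part of this patch)
    import json

    sample_config = {
        "language": "cn",    # eval.py only enters the evaluation branch for "cn"
        "category": {
            "brainstorming": {
                "Metrics": ["BLEU", "ROUGE"],              # automatic metrics handled by metrics.py (assumed names)
                "GPT-3.5": ["relevance", "creativity"],    # metrics defined in the GPT-3.5 prompt file (assumed names)
            },
            "general": {
                "Metrics": ["BLEU", "ROUGE"],
                "GPT-3.5": ["relevance"],
            },
        },
    }

    # write the config so it can be passed to eval.py via --config_file
    with open("config_cn.json", "w", encoding="utf-8") as f:
        json.dump(sample_config, f, ensure_ascii=False, indent=4)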