# Reconstructed from the diff that adds applications/Chat/evaluate/gpt_evaluate.py.
import concurrent.futures
import os
import random
import re
import time
from copy import deepcopy
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import seaborn as sns
import tqdm
from utils import jdump, jload

# Patterns tried in order when extracting a score pair from a GPT-4 evaluation.
# "10" must come before "[0-9]" in the alternation: when nothing follows the group
# (as in "a score of 10"), a leading "[0-9]" would match only the "1".
_BATTLE_SCORE_PATTERNS = (
    re.compile(r"(10|[0-9]) out of 10"),
    re.compile(r"a score of (10|[0-9])"),
    re.compile(r"(10|[0-9])/10"),
)


def _build_question(sample: Dict[str, Any]) -> str:
    """Return the instruction, followed by the input when the input is not empty."""

    if sample["input"] == "":
        return sample["instruction"]
    return sample["instruction"] + " " + sample["input"]


def _safe_ratio(numerator: float, denominator: float) -> float:
    """Return numerator / denominator, or 0.0 when the denominator is zero."""

    return numerator / denominator if denominator else 0.0


def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: int = 2048) -> Dict[str, Any]:
    """
    Get evaluation from GPT-4.

    Args:
        sys_prompt: prompt for the system.
        user_prompt: prompt for the user.
        id: id of the answers for comparison.
        max_tokens: the maximum number of tokens to generate in the chat completion.

    Returns:
        A dictionary with the keys "evaluation" (the text returned by the model, or an
        empty string when every attempt failed) and "id".
    """

    MAX_API_RETRY = 3
    for _ in range(MAX_API_RETRY):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {
                        "role": "system",
                        "content": sys_prompt
                    },
                    {
                        "role": "user",
                        "content": user_prompt,
                    },
                ],
                temperature=0.2,
                max_tokens=max_tokens,
            )
            evaluation = response["choices"][0]["message"]["content"]
            return {"evaluation": evaluation, "id": id}
        except Exception as e:
            # Best effort: report the error, wait briefly and try again.
            print(e)
            time.sleep(1)
    print(f" Evaluation {id} failed after {MAX_API_RETRY} retries.")
    return {"evaluation": "", "id": id}


def parse_battle_score(evaluation: str) -> List[float]:
    """
    Parse evaluation from GPT-4 and get the scores of model 1 and 2.

    The phrasings "X out of 10", "a score of X" and "X/10" are tried in that order; a
    phrasing is accepted only when it occurs exactly twice. Otherwise the first line is
    expected to hold the two scores separated by commas and/or whitespace.

    Args:
        evaluation: evaluation from GPT-4.

    Returns:
        A score pair of two different model answers, or [-1, -1] when no pair can be parsed.
    """

    invalid_pair = [-1, -1]
    if not isinstance(evaluation, str):
        return invalid_pair

    for pattern in _BATTLE_SCORE_PATTERNS:
        found = pattern.findall(evaluation)
        if len(found) == 2:
            return [float(found[0]), float(found[1])]

    # split() without an argument collapses runs of whitespace, so "8, 9" yields two fields.
    fields = evaluation.split("\n")[0].replace(",", " ").split()
    if len(fields) != 2:
        return invalid_pair
    try:
        return [float(fields[0]), float(fields[1])]
    except ValueError:
        return invalid_pair


def battle(answer1: List[Dict], answer2: List[Dict], prompt_dict: Dict[str, Any]) -> List[Dict]:
    """
    Use GPT-4 to compare answers of two different models.

    Args:
        answer1: answers of model 1.
        answer2: answers of model 2.
        prompt_dict: prompt for battle. The keys "system_prompt", "prompt_template" and "prompt" are read.

    Returns:
        Evaluations of all comparison pairs, sorted by id.
    """

    assert len(answer1) == len(answer2)

    print(f" Total number of answers: {len(answer1)}.")

    sys_prompt = prompt_dict["system_prompt"]
    prompt_template = prompt_dict["prompt_template"]

    evaluations = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = []
        for sample1, sample2 in zip(answer1, answer2):
            assert sample1["id"] == sample2["id"]

            prompt = prompt_template.format(
                question=_build_question(sample1),
                answer_1=sample1["output"],
                answer_2=sample2["output"],
                prompt=prompt_dict["prompt"],
            )

            futures.append(executor.submit(get_battle_result, sys_prompt, prompt, sample1["id"], 2048))

        for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            evaluations.append(future.result())

    # Futures complete in arbitrary order; restore a stable order.
    evaluations.sort(key=lambda x: x["id"])

    return evaluations


def save_battle_results(evaluations: List[Dict], name1: str, name2: str, save_path: str) -> None:
    """
    Save evaluation results (model 1 vs model 2) from GPT-4.

    "better", "worse" and the win rate are reported from the point of view of model 2.
    Averages and the win rate are computed over valid score pairs only and are 0.0 when
    there is no valid pair.

    Args:
        evaluations: evaluation results from GPT-4.
        name1: model 1 's name.
        name2: model 2 's name.
        save_path: path to save battle results.
    """

    evaluation_file = deepcopy(evaluations)

    ans1_score = 0
    ans2_score = 0

    better_file = []
    worse_file = []
    tie_file = []
    invalid_file = []

    for record in evaluation_file:
        scores = parse_battle_score(record["evaluation"])
        record["score"] = scores

        if scores[0] == -1 and scores[1] == -1:
            invalid_file.append(record)
            print(f'Invalid score pair: {record["id"]}.')
            continue

        if scores[0] > scores[1]:
            worse_file.append(record)
        elif scores[0] < scores[1]:
            better_file.append(record)
        else:
            tie_file.append(record)
        ans1_score += scores[0]
        ans2_score += scores[1]

    better_count = len(better_file)
    worse_count = len(worse_file)
    tie_count = len(tie_file)
    invalid_count = len(invalid_file)

    prefix = f"{name1}_vs_{name2}"
    battle_dir = os.path.join(save_path, prefix)

    # Creates save_path as well; do not rely on jdump to create the sub-directory.
    os.makedirs(battle_dir, exist_ok=True)

    jdump(better_file, os.path.join(battle_dir, f"{name2}_better.json"))
    jdump(worse_file, os.path.join(battle_dir, f"{name2}_worse.json"))
    jdump(tie_file, os.path.join(battle_dir, f"{prefix}_tie.json"))
    jdump(invalid_file, os.path.join(battle_dir, f"{prefix}_invalid.json"))
    jdump(evaluation_file, os.path.join(battle_dir, f"{prefix}_evaluations.json"))

    summary_path = os.path.join(save_path, "battle_results.json")
    results = jload(summary_path) if os.path.exists(summary_path) else {}

    # Guard against a division by zero when every score pair is invalid.
    valid_count = len(evaluations) - invalid_count
    win_rate = _safe_ratio(better_count, valid_count)
    ans1_avg = _safe_ratio(ans1_score, valid_count)
    ans2_avg = _safe_ratio(ans2_score, valid_count)

    results[prefix] = {
        "model": [name1, name2],
        "better": better_count,
        "worse": worse_count,
        "tie": tie_count,
        "win_rate": win_rate,
        "score": [ans1_avg, ans2_avg],
    }
    jdump(results, summary_path)

    print(f"Total {invalid_count} invalid score pair(s).")
    print(f"Model {name2} has {better_count} better answer(s).")
    print(f"Model {name2} has {worse_count} worse answer(s).")
    print(f"{tie_count} answer(s) play(s) to a tie.")
    print(f"Win rate of model {name2}: {win_rate:.2f}")
    print(f"Model {name1} average score: {ans1_avg:.2f}")
    print(f"Model {name2} average score: {ans2_avg:.2f}")


def get_gpt35_evaluation(prompt: Dict[str, Any], inst: Dict[str, Any], max_tokens: int = 2048) -> Dict[str, Any]:
    """
    Use GPT-3.5 to evaluate one model answer.

    Args:
        prompt: a dictionary including prompt template, CoT and metrics.
        inst: the instruction that is needed to be evaluated. It is modified in place:
            its "evaluation" entry is replaced by the per-metric results.
        max_tokens: the maximum number of tokens to generate in the completion.

    Returns:
        The same `inst` dictionary. A metric whose request failed on every attempt has
        no entry in inst["evaluation"].
    """

    MAX_API_RETRY = 3

    question = _build_question(inst)
    answer = inst["output"]
    inst["evaluation"] = {}

    for metric in prompt["metrics"]:
        for _ in range(MAX_API_RETRY):
            try:
                response = openai.Completion.create(
                    model="text-davinci-003",
                    prompt=prompt["prompt"].format(
                        question=question,
                        answer=answer,
                        metric=prompt["metrics"][metric],
                        steps=prompt["CoT"][metric],
                    ),
                    logprobs=5,
                    temperature=0,
                    max_tokens=max_tokens,
                )
                inst["evaluation"][metric] = {
                    "response": response["choices"][0]["text"],
                    "logprobs": response["choices"][0]["logprobs"]["top_logprobs"],
                }
                break
            except Exception as e:
                # Best effort: report the error, wait briefly and try again.
                print(e)
                time.sleep(1)
        else:
            # Reached only when no attempt succeeded (the loop never hit `break`).
            print(f" Evaluation {inst.get('id')} of metric {metric} failed after {MAX_API_RETRY} retries.")
    return inst


def gpt35_evaluate(
    answers: List[Dict],
    prompts: List[Dict],
    model_name: str,
    save_path: str,
) -> List[Dict]:
    """
    Use GPT-3.5 to evaluate model answers and save evaluation results.

    Args:
        answers: model answers.
        prompts: prompts for all categories. A category without its own prompt falls back to
            the prompt of category "general".
        model_name: name of the model.
        save_path: path to save GPT-3.5 evaluations.

    Returns:
        All the evaluations of the given answers.
    """

    prompt_per_category = {prompt["category"]: prompt for prompt in prompts}

    data_per_category = {}
    for answer in answers:
        data_per_category.setdefault(answer["category"], []).append(answer)

    categories_str = ", ".join(data_per_category)
    print(f"The evaluated categories are {categories_str}.")

    os.makedirs(save_path, exist_ok=True)

    all_evaluations = []

    for category, data in data_per_category.items():
        if prompt_per_category.get(category) is None:
            print(f"No metrics for category {category}! Use category general now.")
            prompt_per_category[category] = prompt_per_category["general"]

        category_file = os.path.join(save_path, model_name, f"{category}_evaluation_results.json")

        if os.path.exists(category_file):
            print(
                f"The evaluation file for category {category} already exists. You are now re-evaluating category {category}!"
            )

        print(f"The number of instances of category {category}'s is {len(data)}.")

        evaluations = []

        metrics_str = ", ".join(prompt_per_category[category]["metrics"])
        print(f"Category {category}'s metrics are {metrics_str}.")

        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            # max_tokens is 1: only the first generated token's logprobs are used for scoring.
            futures = [
                executor.submit(get_gpt35_evaluation, prompt_per_category[category], inst, 1) for inst in data
            ]

            for future in tqdm.tqdm(
                    concurrent.futures.as_completed(futures),
                    desc=f"{category}: ",
                    total=len(futures),
            ):
                evaluations.append(future.result())

        # Futures complete in arbitrary order; restore a stable order.
        evaluations.sort(key=lambda x: x["id"])

        jdump(evaluations, category_file)
        print(f"{category} done.")

        all_evaluations.extend(evaluations)

    jdump(
        all_evaluations,
        os.path.join(save_path, f"{model_name}_evaluation_results.json"),
    )

    return all_evaluations


def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
    """
    Calculate score from log probabilities returned by text-davinci-003.
    Only openai.Completion can return logprobs.

    Calculation formula:
        score = sum(score_i * exp(value)) where score_i is the score which corresponds to the key(predicted token) and value is its log probability.

    Ref: https://arxiv.org/abs/2303.16634
    This paper proposes NLG evaluation methods using GPT-3.5(logprobs returned by openai api) and GPT-4(logprobs obtained by sampling).

    Args:
        logprobs: logprobs returned by openai.Completion.

    Returns:
        Score of one answer.
    """

    # GPT-3.5 only returns score of 1 to 5.
    prob = np.zeros(5)

    for key, value in logprobs.items():
        # Sometimes the key will be one byte of a unicode character which takes the form of "bytes:\\xe7".
        # It is meaningless and thus we don't calculate probability.
        if "bytes" in key:
            continue
        # digits[0] is the score which corresponds to the key(predicted token).
        # For example, key "5" corresponds to score 5.
        digits = re.findall(r"\d", key)
        if len(digits) != 1:
            continue
        score = int(digits[0])
        # Ignore digits outside 1..5: "0" would wrap around to the slot of score 5 and
        # "6".."9" would index past the end of `prob`.
        if 1 <= score <= 5:
            prob[score - 1] += np.exp(value)

    return float(np.dot(np.arange(1, 6), prob))


def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
    """
    Generate statistics for one model.

    For every category and metric the average score and the ids of the three best and
    three worst answers are saved. Answers without a usable result for a metric (for
    example because the request failed) are left out of that metric's statistics.

    Args:
        model_name: name of the model for saving statistics.
        evaluations: evaluations for all of the model answers.
        save_path: path to save GPT-3.5 evaluation statistics.
    """

    os.makedirs(save_path, exist_ok=True)

    data_per_category = {}
    for evaluation in evaluations:
        data_per_category.setdefault(evaluation["category"], []).append(evaluation)

    all_statistics = {}
    for category, data in data_per_category.items():
        # Collect the metrics of all answers (first-seen order), not only those of the
        # first answer, whose evaluation may be incomplete.
        metrics = []
        for evaluation in data:
            for metric in evaluation["evaluation"]:
                if metric not in metrics:
                    metrics.append(metric)

        statistics = {}
        for metric in metrics:
            ids = []
            scores = []
            for evaluation in data:
                result = evaluation["evaluation"].get(metric)
                if not result or not result["logprobs"]:
                    continue
                ids.append(evaluation["id"])
                # Only the logprobs of the first generated token are used.
                scores.append(calculate_scores_form_logprobs(result["logprobs"][0]))

            if not scores:
                print(f"No valid evaluation of metric {metric} for category {category}.")
                continue

            arg_sort = np.argsort(scores)
            statistics[metric] = {
                "avg_score": sum(scores) / len(scores),
                "best_3": {ids[i]: scores[i] for i in arg_sort[-3:][::-1]},
                "worst_3": {ids[i]: scores[i] for i in arg_sort[:3]},
            }

        all_statistics[category] = statistics

    jdump(
        all_statistics,
        os.path.join(save_path, f"{model_name}_evaluation_statistics.json"),
    )


def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> None:
    """
    Analyze and visualize all GPT-3.5 evaluation statistics in the given directory.

    Writes "gpt35_evaluation_statistics.csv" and one bar chart "<category>.png" per category.

    Args:
        statistics_path: path to all the models' statistics.
        save_path: path to save table and visualization results.

    Raises:
        FileNotFoundError: if the directory does not exist or holds no statistics file.
    """

    if not os.path.exists(statistics_path):
        raise FileNotFoundError("The given directory doesn't exist! No statistics found!")

    suffix = "_evaluation_statistics.json"
    all_statistics = {}

    # Sorted so that the row order of the table does not depend on the file system.
    for file_name in sorted(os.listdir(statistics_path)):
        if file_name.endswith(suffix):
            model_name = file_name[:-len(suffix)]
            all_statistics[model_name] = jload(os.path.join(statistics_path, file_name))

    if not all_statistics:
        raise FileNotFoundError("There are no statistics in the given directory!")

    statistic_columns = ("avg_score", "best_3", "worst_3")

    frame_all = {column: [] for column in ("model", "category", "metric") + statistic_columns}
    frame_per_category = {}
    for model_name, model_statistics in all_statistics.items():
        for category, category_statistics in model_statistics.items():
            frame_category = frame_per_category.setdefault(
                category, {column: [] for column in ("model", "metric") + statistic_columns})

            for metric, metric_statistics in category_statistics.items():
                frame_all["model"].append(model_name)
                frame_all["category"].append(category)
                frame_all["metric"].append(metric)

                frame_category["model"].append(model_name)
                frame_category["metric"].append(metric)

                for column in statistic_columns:
                    frame_all[column].append(metric_statistics[column])
                    frame_category[column].append(metric_statistics[column])

    os.makedirs(save_path, exist_ok=True)

    frame_all = pd.DataFrame(frame_all)
    frame_all.to_csv(os.path.join(save_path, "gpt35_evaluation_statistics.csv"))

    for category in tqdm.tqdm(
            frame_per_category.keys(),
            desc="category: ",
            total=len(frame_per_category),
    ):
        data = pd.DataFrame(frame_per_category[category])

        sns.set()
        plt.figure(figsize=(16, 10))
        plt.ylim((0, 5))

        axes = sns.barplot(x="metric", y="avg_score", hue="model", data=data, dodge=True)
        axes.set_title(f"Comparison between Different Models for Category {category.title()}")
        plt.xlabel("Evaluation Metric")
        plt.ylabel("Average Score")

        figure = axes.get_figure()
        figure.savefig(os.path.join(save_path, f"{category}.png"), dpi=400)
        # Release the figure; otherwise one open figure per category accumulates.
        plt.close(figure)