Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
434 changes: 252 additions & 182 deletions applications/Chat/evaluate/README.md

Large diffs are not rendered by default.

123 changes: 123 additions & 0 deletions applications/Chat/evaluate/config/config_cn.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
{
"language": "cn",
"category": {
"brainstorming": {
"GPT-3.5": [
"language organization",
"relevance",
"creativity",
"practicality",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"chat": {
"GPT-3.5": [
"language organization",
"relevance",
"naturalness",
"engagingness",
"reasonableness"
],
"Metrics": [
"Distinct"
]
},
"classification": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"closed_qa": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"extraction": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"generation": {
"GPT-3.5": [
"language organization",
"relevance",
"diversity"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"open_qa": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"rewriting": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"roleplay": {
"GPT-3.5": [
"language organization",
"relevance",
"fidelity",
"creativity"
],
"Metrics": [
"Distinct"
]
},
"summarization": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness",
"conciseness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
}
}
}
98 changes: 98 additions & 0 deletions applications/Chat/evaluate/eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import argparse
import json
import os

import openai
from evaluator import Evaluator
from utils import jload


def main(args):
    """Run the ColossalAI LLM evaluation pipeline.

    With two models, runs a pairwise "battle" between their answer files.
    With one model, evaluates its answers against ground-truth targets
    using GPT-based scoring and/or automatic metrics.

    Args:
        args: parsed command-line namespace; see the argparse setup in
            ``__main__`` for the expected attributes.

    Raises:
        ValueError: if the answer/model counts are inconsistent or the
            configured language is unsupported.
        Exception: if the prompt file required for the chosen mode is
            missing.
    """
    # Each answer file must correspond to exactly one model name.
    # NOTE: was an `assert`, which is silently stripped under `python -O`;
    # raise explicitly so the validation always runs.
    if len(args.answer_file_list) != len(args.model_name_list):
        raise ValueError("The number of answer files and model names should be equal!")

    # load config
    config = jload(args.config_file)

    if config["language"] == "cn":
        # Get metric settings for all categories. Copy each category's
        # settings dict so later mutation cannot alter the loaded config.
        metrics_per_category = {
            category: dict(settings)
            for category, settings in config["category"].items()
        }

        battle_prompt = None
        if args.battle_prompt_file:
            battle_prompt = jload(args.battle_prompt_file)

        gpt_evaluation_prompt = None
        if args.gpt_evaluation_prompt_file:
            gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file)

        # Two-model mode needs a battle prompt; one-model mode needs a
        # GPT-evaluation prompt. Fail fast before any heavy work.
        if len(args.model_name_list) == 2 and not battle_prompt:
            raise Exception("No prompt file for battle provided. Please specify the prompt file for battle!")

        if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:
            raise Exception(
                "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")

        # initialize evaluator
        evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt)
        if len(args.model_name_list) == 2:
            # Pairwise battle between the two models' answers.
            answers1 = jload(args.answer_file_list[0])
            answers2 = jload(args.answer_file_list[1])

            if len(answers1) != len(answers2):
                raise ValueError("The number of answers for two models should be equal!")

            evaluator.battle(answers1=answers1, answers2=answers2)
            evaluator.save(args.save_path, args.model_name_list)
        elif len(args.model_name_list) == 1:
            # Single-model evaluation against ground-truth targets.
            targets = jload(args.target_file)
            answers = jload(args.answer_file_list[0])

            if len(targets) != len(answers):
                raise ValueError("The number of target answers and model answers should be equal!")

            evaluator.evaluate(answers=answers, targets=targets)
            evaluator.save(args.save_path, args.model_name_list)
        else:
            raise ValueError("Unsupported number of answer files and model names!")
    else:
        raise ValueError(f'Unsupported language {config["language"]}!')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ColossalAI LLM evaluation pipeline.')
    # FIX: help text previously said "path to the file of target results",
    # which describes --target_file, not the evaluation config.
    parser.add_argument('--config_file',
                        type=str,
                        default=None,
                        required=True,
                        help='path to the evaluation config file')
    parser.add_argument('--battle_prompt_file', type=str, default=None, help='path to the prompt file for battle')
    parser.add_argument('--gpt_evaluation_prompt_file',
                        type=str,
                        default=None,
                        help='path to the prompt file for gpt evaluation')
    parser.add_argument('--target_file', type=str, default=None, help='path to the target answer (ground truth) file')
    parser.add_argument('--answer_file_list',
                        type=str,
                        nargs='+',
                        default=[],
                        required=True,
                        help='path to the answer files of at most 2 models')
    parser.add_argument('--model_name_list',
                        type=str,
                        nargs='+',
                        default=[],
                        required=True,
                        help='the names of at most 2 models')
    parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
    parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
    args = parser.parse_args()

    # --openai_key is required, so it is always present here; the previous
    # `if args.openai_key is not None` guard was dead code. Export it via
    # the environment so downstream libraries can also pick it up.
    os.environ["OPENAI_API_KEY"] = args.openai_key
    openai.api_key = os.getenv("OPENAI_API_KEY")

    main(args)
9 changes: 9 additions & 0 deletions applications/Chat/evaluate/eval.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/usr/bin/env bash
# Run the ColossalAI LLM evaluation pipeline.
# Replace the quoted placeholders with real paths/values before running.
# FIX: the last option line previously ended with a trailing backslash,
# which made the shell continue the command onto the next line (hanging
# interactively or swallowing a following line in a script).
python eval.py \
    --config_file "path to the config file" \
    --battle_prompt_file "path to the prompt file for battle" \
    --gpt_evaluation_prompt_file "path to the prompt file for gpt evaluation" \
    --target_file "path to the target answer file" \
    --answer_file_list "path to the answer files of at most 2 models" \
    --model_name_list "the names of at most 2 models" \
    --save_path "path to save results" \
    --openai_key "your openai key"
Loading