ff #49 (Merged)

245 changes: 161 additions & 84 deletions applications/Chat/evaluate/README.md

Large diffs are not rendered by default.

20 changes: 10 additions & 10 deletions applications/Chat/evaluate/config/config_cn.json
@@ -2,7 +2,7 @@
"language": "cn",
"category": {
"brainstorming": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"creativity",
@@ -14,7 +14,7 @@
]
},
"chat": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"naturalness",
@@ -26,7 +26,7 @@
]
},
"classification": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
@@ -38,7 +38,7 @@
]
},
"closed_qa": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
@@ -50,7 +50,7 @@
]
},
"extraction": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
@@ -62,7 +62,7 @@
]
},
"generation": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"diversity"
@@ -74,7 +74,7 @@
]
},
"open_qa": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
@@ -84,7 +84,7 @@
]
},
"rewriting": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
@@ -96,7 +96,7 @@
]
},
"roleplay": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"fidelity",
@@ -107,7 +107,7 @@
]
},
"summarization": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness",
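
After this rename, each category's GPT-evaluation metrics live under the "GPT" key instead of "GPT-3.5". A minimal sketch of reading the renamed key, assuming the repository layout above and a working directory of applications/Chat/evaluate:

import json

# Load the Chinese evaluation config (path assumed for illustration).
with open("config/config_cn.json", encoding="utf-8") as f:
    config = json.load(f)

# Each category now lists its GPT-evaluation metrics under "GPT" (formerly "GPT-3.5").
for category, settings in config["category"].items():
    print(category, settings.get("GPT", []))
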
7 changes: 6 additions & 1 deletion applications/Chat/evaluate/eval.py
@@ -39,7 +39,8 @@ def main(args):
"No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")

# initialize evaluator
- evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt)
+ evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model,
+ config["language"])
if len(args.model_name_list) == 2:
answers1 = jload(args.answer_file_list[0])
answers2 = jload(args.answer_file_list[1])
@@ -87,6 +88,10 @@ def main(args):
default=[],
required=True,
help='the names of at most 2 models')
+ parser.add_argument('--gpt_model',
+ default="gpt-3.5-turbo",
+ choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
+ help='which GPT model to use for evaluation')
parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
args = parser.parse_args()
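
For orientation, a minimal sketch of how the new --gpt_model option reaches the evaluator. The dictionaries below are placeholders for illustration, not the real structures built in eval.py:

from evaluator import Evaluator

# Placeholder inputs for illustration; in eval.py these come from the config
# file and the prompt files passed on the command line.
metrics_per_category = {"open_qa": {"Metrics": ["ROUGE"], "GPT": ["relevance", "correctness"]}}
battle_prompt = {}
gpt_evaluation_prompt = {"general": {}}

# After this change the constructor also receives the chosen GPT model
# (from --gpt_model) and the language read from the config.
evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt,
                      "gpt-3.5-turbo", "cn")
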
36 changes: 23 additions & 13 deletions applications/Chat/evaluate/evaluator.py
@@ -14,13 +14,15 @@ class Evaluator(object):

"""

- def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str,
- Any]) -> None:
+ def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, Any],
+ gpt_model: str, language: str) -> None:
self.params = params
self.battle_prompt = battle_prompt
self.gpt_evaluation_prompt = gpt_evaluation_prompt
+ self.gpt_model = gpt_model
+ self.language = language
self.automatic_metric_stats = dict()
- self.gpt35_evaluation_results = dict()
+ self.gpt_evaluation_results = dict()
self.battle_results = []

def battle(self, answers1: List[Dict], answers2: List[Dict]) -> None:
Expand Down Expand Up @@ -63,6 +65,10 @@ def switch(metric):

# automatic evaluation
for category in self.params:
+ if len(answers_per_category[category]) == 0:
+     print(f"Category {category} specified in your config doesn't have corresponding answers!")
+     continue

category_metrics = self.params[category]["Metrics"]
self.automatic_metric_stats[category] = {}

@@ -74,17 +80,21 @@
for metric in category_metrics:
self.automatic_metric_stats[category].update(switch(metric=metric))

- # gpt35 evaluation
+ # gpt evaluation
for category in self.params:
- category_metrics = self.params[category]["GPT-3.5"]
+ if len(answers_per_category[category]) == 0:
+     print(f"Category {category} specified in your config doesn't have corresponding answers!")
+     continue

+ category_metrics = self.params[category]["GPT"]

prompt = self.gpt_evaluation_prompt.get(category, None)
if prompt is None:
print(f"No prompt for category {category}! Use prompt for category general now.")
prompt = self.gpt_evaluation_prompt["general"]

- self.gpt35_evaluation_results[category] = gpt_evaluate.gpt35_evaluate(answers_per_category[category],
- prompt, category_metrics, category)
+ self.gpt_evaluation_results[category] = gpt_evaluate.evaluate(answers_per_category[category], prompt,
+ category_metrics, category, self.gpt_model)

def save(self, path: str, model_name_list: List[str]) -> None:
"""
@@ -106,10 +116,10 @@ def save(self, path: str, model_name_list: List[str]) -> None:

# Save evaluation results for GPT-3.5 evaluation metrics.
all_evaluations = []
- base_save_path = os.path.join(path, "gpt_evaluate", "gpt35_evaluate_results")
+ base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results")
evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results")

- for category, evaluations in self.gpt35_evaluation_results.items():
+ for category, evaluations in self.gpt_evaluation_results.items():
jdump(
evaluations,
os.path.join(evaluation_results_save_path, model_name_list[0],
@@ -121,10 +131,10 @@ def save(self, path: str, model_name_list: List[str]) -> None:

# Start to calculate scores and save statistics.
evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics")
- gpt_evaluate.save_gpt35_evaluation_statistics(model_name_list[0], all_evaluations,
- evaluation_statistics_save_path)
+ gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations,
+ evaluation_statistics_save_path)

# Save charts and csv.
evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses")
- gpt_evaluate.analyze_gpt35_evaluation_statistics(evaluation_statistics_save_path,
- evaluation_analyses_save_path)
+ gpt_evaluate.analyze_gpt_evaluation_statistics(evaluation_statistics_save_path,
+ evaluation_analyses_save_path)
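
The gpt_evaluate helpers are renamed to match: gpt35_evaluate becomes evaluate (now taking the model name), and the save/analyze statistics functions drop the "35". A rough usage sketch with placeholder inputs, assuming the default "results" save path:

import gpt_evaluate

# Placeholder inputs for illustration only.
answers = []                          # answer dicts for one category
prompt = {}                           # GPT evaluation prompt for that category
metrics = ["language organization", "relevance", "correctness"]

# evaluate() replaces gpt35_evaluate() and takes the GPT model name explicitly.
evaluations = gpt_evaluate.evaluate(answers, prompt, metrics, "open_qa", "gpt-4")

# The statistics helpers were renamed the same way, and results now land
# under gpt_evaluate_results instead of gpt35_evaluate_results.
stats_path = "results/gpt_evaluate/gpt_evaluate_results/evaluation_statistics"
analyses_path = "results/gpt_evaluate/gpt_evaluate_results/evaluation_analyses"
gpt_evaluate.save_gpt_evaluation_statistics("model_a", evaluations, stats_path)
gpt_evaluate.analyze_gpt_evaluation_statistics(stats_path, analyses_path)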