Merged

f #56
49 changes: 25 additions & 24 deletions applications/Chat/evaluate/README.md

Large diffs are not rendered by default.

123 changes: 123 additions & 0 deletions applications/Chat/evaluate/config/config_en.json
@@ -0,0 +1,123 @@
{
"language": "en",
"category": {
"brainstorming": {
"GPT": [
"language organization",
"relevance",
"creativity",
"practicality",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"chat": {
"GPT": [
"language organization",
"relevance",
"naturalness",
"engagingness",
"reasonableness"
],
"Metrics": [
"Distinct"
]
},
"classification": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"closed_qa": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"extraction": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"generation": {
"GPT": [
"language organization",
"relevance",
"diversity"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"open_qa": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"rewriting": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"roleplay": {
"GPT": [
"language organization",
"relevance",
"fidelity",
"creativity"
],
"Metrics": [
"Distinct"
]
},
"summarization": {
"GPT": [
"language organization",
"relevance",
"correctness",
"conciseness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
}
}
}
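The new config_en.json mirrors the structure eval.py expects: a top-level language tag plus, for each category, the GPT evaluation criteria and the automatic metrics to compute. Below is a minimal sketch of how such a config is consumed, using plain `json.load` in place of the repo's `jload` helper and an assumed file path:

```python
import json

# Load the evaluation config (path assumed for illustration).
with open("applications/Chat/evaluate/config/config_en.json") as f:
    config = json.load(f)

# eval.py accepts both supported languages with this check (see the diff below).
assert config["language"] in ["cn", "en"]

# Collect per-category settings: GPT criteria and automatic metrics.
metrics_per_category = {}
gpt_criteria_per_category = {}
for category, settings in config["category"].items():
    metrics_per_category[category] = settings["Metrics"]
    gpt_criteria_per_category[category] = settings["GPT"]

print(metrics_per_category["summarization"])  # ['BLEU', 'ROUGE', 'BERTScore']
print(gpt_criteria_per_category["roleplay"])  # includes 'fidelity' and 'creativity'
```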
2 changes: 1 addition & 1 deletion applications/Chat/evaluate/eval.py
@@ -14,7 +14,7 @@ def main(args):
     # load config
     config = jload(args.config_file)

-    if config["language"] == "cn":
+    if config["language"] in ["cn", "en"]:
         # get metric settings for all categories
         metrics_per_category = {}
         for category in config["category"].keys():
60 changes: 27 additions & 33 deletions applications/Chat/evaluate/evaluator.py
@@ -4,7 +4,7 @@
 import gpt_evaluate
 import metrics
 import pandas as pd
-from utils import get_data_per_category, jdump
+from utils import analyze_automatic_results, get_data_per_category, save_automatic_results


 class Evaluator(object):
@@ -42,21 +42,21 @@ def evaluate(self, answers: List[Dict], targets: List[Dict]) -> None:

        """

-        def switch(metric):
+        def switch(metric, language):
             if metric == "BLEU":
-                return metrics.bleu_score(preds=predicts_list, targets=targets_list)
+                return metrics.bleu_score(preds=predicts_list, targets=targets_list, language=language)
             elif metric == "ROUGE":
-                return metrics.rouge_cn_score(preds=predicts_list, targets=targets_list)
+                return metrics.rouge_score(preds=predicts_list, targets=targets_list, language=language)
             elif (metric == "Distinct"):
-                return metrics.distinct_score(preds=predicts_list)
+                return metrics.distinct_score(preds=predicts_list, language=language)
             elif (metric == "BERTScore"):
-                return metrics.bert_score(preds=predicts_list, targets=targets_list)
+                return metrics.bert_score(preds=predicts_list, targets=targets_list, language=language)
             elif (metric == "Precision"):
-                return metrics.precision(preds=predicts_list, targets=targets_list)
+                return metrics.precision(preds=predicts_list, targets=targets_list, language=language)
             elif (metric == "Recall"):
-                return metrics.recall(preds=predicts_list, targets=targets_list)
+                return metrics.recall(preds=predicts_list, targets=targets_list, language=language)
             elif (metric == "F1 score"):
-                return metrics.F1_score(preds=predicts_list, targets=targets_list)
+                return metrics.F1_score(preds=predicts_list, targets=targets_list, language=language)
             else:
                 raise ValueError(f"Unexpected metric")
@@ -78,7 +78,7 @@ def switch(metric):
             predicts_list = [answer["output"] for answer in answers_per_category[category]]

             for metric in category_metrics:
-                self.automatic_metric_stats[category].update(switch(metric=metric))
+                self.automatic_metric_stats[category].update(switch(metric=metric, language=self.language))

         # gpt evaluation
         for category in self.params:
@@ -106,35 +106,29 @@ def save(self, path: str, model_name_list: List[str]) -> None:
             save_path = os.path.join(path, "gpt_evaluate", "battle_results")
             gpt_evaluate.save_battle_results(self.battle_results, model_name_list[0], model_name_list[1], save_path)
         else:
-            # save evaluation results for automatic metrics
-            automatic_df = pd.DataFrame(self.automatic_metric_stats)
+            # Save evaluation results for automatic metrics
+            automatic_base_save_path = os.path.join(path, "automatic_results")
+            automatic_results_save_path = os.path.join(automatic_base_save_path, "evaluation_results")

-            automatic_results_save_path = os.path.join(path, "automatic_results")
-            if not os.path.exists(automatic_results_save_path):
-                os.makedirs(automatic_results_save_path)
-            automatic_df.to_csv(os.path.join(automatic_results_save_path, f"{model_name_list[0]}.csv"), index=True)
+            save_automatic_results(model_name_list[0], self.automatic_metric_stats, automatic_results_save_path)

-            # Save evaluation results for GPT-3.5 evaluation metrics.
-            all_evaluations = []
-            base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results")
-            evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results")
+            # Save charts and csv.
+            automatic_analyses_save_path = os.path.join(automatic_base_save_path, "evaluation_analyses")
+            analyze_automatic_results(automatic_results_save_path, automatic_analyses_save_path)

-            for category, evaluations in self.gpt_evaluation_results.items():
-                jdump(
-                    evaluations,
-                    os.path.join(evaluation_results_save_path, model_name_list[0],
-                                 f"{category}_evaluation_results.json"))
-                all_evaluations.extend(evaluations)
+            # Save evaluation results for GPT evaluation metrics.
+            gpt_base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results")
+            gpt_evaluation_results_save_path = os.path.join(gpt_base_save_path, "evaluation_results")

-            jdump(all_evaluations,
-                  os.path.join(evaluation_results_save_path, f"{model_name_list[0]}_evaluation_results.json"))
+            all_evaluations = gpt_evaluate.save_gpt_evaluation_results(model_name_list[0], self.gpt_evaluation_results,
+                                                                       gpt_evaluation_results_save_path)

             # Start to calculate scores and save statistics.
-            evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics")
+            gpt_evaluation_statistics_save_path = os.path.join(gpt_base_save_path, "evaluation_statistics")
             gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations,
-                                                        evaluation_statistics_save_path)
+                                                        gpt_evaluation_statistics_save_path)

             # Save charts and csv.
-            evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses")
-            gpt_evaluate.analyze_gpt_evaluation_statistics(evaluation_statistics_save_path,
-                                                           evaluation_analyses_save_path)
+            gpt_evaluation_analyses_save_path = os.path.join(gpt_base_save_path, "evaluation_analyses")
+            gpt_evaluate.analyze_gpt_evaluation_statistics(gpt_evaluation_statistics_save_path,
+                                                           gpt_evaluation_analyses_save_path)
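The central change in evaluator.py is that `switch` now threads a `language` argument into every `metrics.*` call, so one code path serves both the `cn` and `en` configs, and the Chinese-only `rouge_cn_score` gives way to a language-aware `rouge_score`. Here is a standalone sketch of that dispatch pattern, reduced to one metric; the whitespace-versus-character tokenization is an illustrative assumption, not the actual metrics.py implementation:

```python
from typing import Callable, Dict, List


def tokenize(text: str, language: str) -> List[str]:
    # Illustrative assumption: whitespace tokens for English,
    # character tokens for Chinese (not the repo's metrics.py code).
    return text.split() if language == "en" else list(text)


def distinct_score(preds: List[str], language: str) -> Dict[str, float]:
    # Distinct-1: fraction of unique unigrams, averaged over predictions.
    scores = []
    for pred in preds:
        tokens = tokenize(pred, language)
        scores.append(len(set(tokens)) / max(len(tokens), 1))
    return {"distinct": sum(scores) / max(len(scores), 1)}


def switch(metric: str, language: str, preds: List[str]) -> Dict[str, float]:
    # Mirror of the dispatch in Evaluator.evaluate, reduced to one metric.
    dispatch: Dict[str, Callable[..., Dict[str, float]]] = {"Distinct": distinct_score}
    if metric not in dispatch:
        raise ValueError(f"Unexpected metric {metric}")
    return dispatch[metric](preds=preds, language=language)


print(switch("Distinct", "en", preds=["the cat sat on the mat"]))  # {'distinct': 0.833...}
```

The save-path reorganization in this hunk is parallel rather than behavioral: automatic results now get their own `evaluation_results` and `evaluation_analyses` subdirectories under `automatic_results`, matching the layout already used under `gpt_evaluate/gpt_evaluate_results`.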
27 changes: 25 additions & 2 deletions applications/Chat/evaluate/gpt_evaluate.py
@@ -461,14 +461,35 @@ def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
         return 0


+def save_gpt_evaluation_results(model_name: str, gpt_evaluation_results: Dict[str, Any],
+                                save_path: str) -> Dict[str, Any]:
+    """
+    Save evaluation results for different categories for one model.
+
+    Args:
+        model_name: name of the model for saving evaluation results.
+        gpt_evaluation_results: evaluation results for all of the model answers.
+        save_path: path to save GPT evaluation results.
+    """
+
+    all_evaluations = []
+    for category, evaluations in gpt_evaluation_results.items():
+        jdump(evaluations, os.path.join(save_path, model_name, f"{category}_evaluation_results.json"))
+        all_evaluations.extend(evaluations)
+
+    jdump(all_evaluations, os.path.join(save_path, f"{model_name}_evaluation_results.json"))
+
+    return all_evaluations
+
+
 def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
     """
     Generate statistics for one model.

     Args:
         model_name: name of the model for saving statistics.
         evaluations: evaluations for all of the model answers.
-        save_path: path to save GPT-3.5 evaluation statistics.
+        save_path: path to save GPT evaluation statistics.
     """

     if not os.path.exists(save_path):
@@ -516,7 +537,7 @@ def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:

 def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> None:
     """
-    Analyze and visualize all GPT-3.5 evaluation statistics in the given directory.
+    Analyze and visualize all GPT evaluation statistics in the given directory.

     Args:
         statistics_path: path to all the models' statistics.
@@ -594,3 +615,5 @@ def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> None:

         figure = fig.get_figure()
         figure.savefig(os.path.join(save_path, f"{category}.png"), dpi=400)
+
+    plt.close()
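The new `save_gpt_evaluation_results` helper consolidates the per-category `jdump` calls that previously lived inline in `Evaluator.save`. A hedged usage sketch with made-up records; the real entries are the per-answer GPT evaluation dicts produced earlier in the pipeline, and the exact record schema here is an assumption:

```python
# Illustrative records only, not the pipeline's actual schema.
gpt_evaluation_results = {
    "brainstorming": [{"id": 1, "category": "brainstorming", "evaluation": "..."}],
    "chat": [{"id": 2, "category": "chat", "evaluation": "..."}],
}

all_evaluations = save_gpt_evaluation_results(
    model_name="my_model",
    gpt_evaluation_results=gpt_evaluation_results,
    save_path="results/gpt_evaluate/gpt_evaluate_results/evaluation_results",
)

# Expected files (assuming jdump creates missing parent directories):
#   .../evaluation_results/my_model/brainstorming_evaluation_results.json
#   .../evaluation_results/my_model/chat_evaluation_results.json
#   .../evaluation_results/my_model_evaluation_results.json
# all_evaluations is the flat concatenation of both category lists.
```

Separately, the trailing `plt.close()` matters because `analyze_gpt_evaluation_statistics` creates one figure per category; without closing them, matplotlib keeps every figure alive and memory grows with the number of categories analyzed.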