89 changes: 72 additions & 17 deletions applications/Chat/evaluate/eval.py
@@ -1,11 +1,17 @@
 import argparse
 import json
+import os
 
+import openai
 from evaluator import Evaluator
 from utils import jload
 
 
 def main(args):
-    # load config
+    assert len(args.answer_file_list) == len(
+        args.model_name_list), "The number of answer files and model names should be equal!"
+
+    # load config
     config = jload(args.config_file)
+
     if config["language"] == "cn":
@@ -16,28 +22,77 @@ def main(args):
             for metric_type, metrics in config["category"][category].items():
                 metrics_all[metric_type] = metrics
             metrics_per_category[category] = metrics_all
 
+        battle_prompt = None
+        if args.battle_prompt_file:
+            battle_prompt = jload(args.battle_prompt_file)
+
+        gpt_evaluation_prompt = None
+        if args.gpt_evaluation_prompt_file:
+            gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file)
+
+        if len(args.model_name_list) == 2 and not battle_prompt:
+            raise Exception("No prompt file for battle provided. Please specify the prompt file for battle!")
+
+        if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:
+            raise Exception(
+                "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
+
         # initialize evaluator
-        evaluator = Evaluator(metrics_per_category)
-        if args.answers2_file:
-            answers1 = jload(args.answers1_file)
-            answers2 = jload(args.answers2_file)
+        evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt)
+        if len(args.model_name_list) == 2:
+            answers1 = jload(args.answer_file_list[0])
+            answers2 = jload(args.answer_file_list[1])
 
             assert len(answers1) == len(answers2), "The number of answers for two models should be equal!"
 
             evaluator.battle(answers1=answers1, answers2=answers2)
-        else:
+            evaluator.save(args.save_path, args.model_name_list)
+        elif len(args.model_name_list) == 1:
             targets = jload(args.target_file)
-            answers = jload(args.answers1_file)
+            answers = jload(args.answer_file_list[0])
 
             assert len(targets) == len(answers), "The number of target answers and model answers should be equal!"
 
             evaluator.evaluate(answers=answers, targets=targets)
-        evaluator.save(args.save_path)
+            evaluator.save(args.save_path, args.model_name_list)
+        else:
+            raise ValueError("Unsupported number of answer files and model names!")
     else:
-        raise ValueError(f'Unsupported language')
+        raise ValueError(f'Unsupported language {config["language"]}!')


 if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--config_file', type=str, default=None, help='path to the file of target results')
-    parser.add_argument('--target_file', type=str, default=None, help='path to the file of target results')
-    parser.add_argument('--answers1_file', type=str, default=None, help='path to the file of one model prediction')
-    parser.add_argument('--answers2_file', type=str, default=None, help='path to the file of the other model prediction')
-    parser.add_argument('--save_path', type=str, default="results.csv", help='path to the csv file to save evaluation results')
+    parser = argparse.ArgumentParser(description='ColossalAI LLM evaluation pipeline.')
+    parser.add_argument('--config_file',
+                        type=str,
+                        default=None,
+                        required=True,
+                        help='path to the config file')
+    parser.add_argument('--battle_prompt_file', type=str, default=None, help='path to the prompt file for battle')
+    parser.add_argument('--gpt_evaluation_prompt_file',
+                        type=str,
+                        default=None,
+                        help='path to the prompt file for gpt evaluation')
+    parser.add_argument('--target_file', type=str, default=None, help='path to the target answer (ground truth) file')
+    parser.add_argument('--answer_file_list',
+                        type=str,
+                        nargs='+',
+                        default=[],
+                        required=True,
+                        help='path to the answer files of at most 2 models')
+    parser.add_argument('--model_name_list',
+                        type=str,
+                        nargs='+',
+                        default=[],
+                        required=True,
+                        help='the names of at most 2 models')
+    parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
+    parser.add_argument('--openai_key', type=str, default=None, required=True, help='your OpenAI API key')
     args = parser.parse_args()
-    main(args)
+
+    if args.openai_key is not None:
+        os.environ["OPENAI_API_KEY"] = args.openai_key
+    openai.api_key = os.getenv("OPENAI_API_KEY")
+
+    main(args)
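
A note on the config file read above: as far as this diff shows, main() reads config["language"] (which must be "cn" in this version) and config["category"], where each category maps a metric type to its list of metrics. The snippet below is a minimal, hypothetical sketch of such a file; the category, metric-type, and metric names are placeholders rather than values taken from this PR.

import json

# Hypothetical config sketch: only the keys that eval.py is shown to read appear here.
# "example_category", "example_metric_type", and the metric names are placeholders.
example_config = {
    "language": "cn",
    "category": {
        "example_category": {
            "example_metric_type": ["example_metric_1", "example_metric_2"],
        },
    },
}

with open("example_config.json", "w", encoding="utf-8") as f:
    json.dump(example_config, f, ensure_ascii=False, indent=4)
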
9 changes: 9 additions & 0 deletions applications/Chat/evaluate/eval.sh
@@ -0,0 +1,9 @@
+python eval.py \
+    --config_file "path to the config file" \
+    --battle_prompt_file "path to the prompt file for battle" \
+    --gpt_evaluation_prompt_file "path to the prompt file for gpt evaluation" \
+    --target_file "path to the target answer file" \
+    --answer_file_list "path to the answer files of at most 2 models" \
+    --model_name_list "the names of at most 2 models" \
+    --save_path "path to save results" \
+    --openai_key "your openai key" \
256 changes: 0 additions & 256 deletions applications/Chat/evaluate/evaluate.py

This file was deleted.

9 changes: 0 additions & 9 deletions applications/Chat/evaluate/evaluate.sh

This file was deleted.
