Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from typing import Dict, List
from typing import Dict, List, Union

import colossal_eval.evaluate.dataset_evaluator.metrics as metric_helper
import numpy as np
Expand Down Expand Up @@ -279,7 +279,9 @@ def _evaluate(self):

return self.evaluation_results

def get_evaluation_results(self, data: List[Dict], dataset_name: str, model_name: str, metrics: List[str]):
def get_evaluation_results(
self, data: Dict[str, Union[str, Dict]], dataset_name: str, model_name: str, metrics: List[str]
):
"""
Evaluate inference data on the given metrics.

Expand All @@ -290,10 +292,11 @@ def get_evaluation_results(self, data: List[Dict], dataset_name: str, model_name
metrics: Metrics used to evaluate.

"""
self.data = data
self.data = data["inference_results"]
self.dataset_name = dataset_name
self.dataset_class = data["dataset_class"]
self.model_name = model_name
self.categories = list(data.keys())
self.categories = list(self.data.keys())
self.metrics = metrics
self.judgements = {}

Expand All @@ -313,9 +316,7 @@ def get_evaluation_results(self, data: List[Dict], dataset_name: str, model_name

for metric in self.metrics:
# The train and reference splits use the same metrics as the test split.
self.suggested_categories[metric] = metric_helper.metrics4subcategory[self.dataset_name.split("_")[0]][
metric
]
self.suggested_categories[metric] = metric_helper.metrics4subcategory[self.dataset_class][metric]
if "ALL" in self.suggested_categories[metric]:
self.suggested_categories[metric] = self.categories
self.metric_total_length[metric] = self.total_length
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"per_byte_ppl_score": ["ALL"],
},
# The commented-out entries are not 4-choice questions.
"agieval": {
"AGIEvalDataset": {
"combined_single_choice_accuracy": [
# "lsat-ar",
# "lsat-lr",
Expand Down Expand Up @@ -103,14 +103,14 @@
],
"ppl_score": ["ALL"],
},
"cmmlu": {
"CMMLUDataset": {
"first_token_accuracy": ["ALL"],
"single_choice_accuracy": ["ALL"],
"perplexity": ["ALL"],
"ppl_score_over_choices": ["ALL"],
"ppl_score": ["ALL"],
},
"gaokaobench": {
"GaoKaoBenchDataset": {
"combined_single_choice_accuracy": [
"English MCQs",
"Biology MCQs",
Expand Down Expand Up @@ -170,7 +170,7 @@
"ppl_score_over_choices": ["ALL"],
"ppl_score": ["ALL"],
},
"longbench": {
"LongBenchDataset": {
"f1_score": ["hotpotqa", "2wikimqa", "musique", "narrativeqa", "qasper", "multifieldqa_en", "triviaqa"],
"f1_zh_score": ["multifieldqa_zh"],
"rouge_score": ["gov_report", "qmsum", "multi_news", "samsum"],
Expand All @@ -183,19 +183,19 @@
"perplexity": ["ALL"],
"ppl_score": ["ALL"],
},
"mmlu": {
"MMLUDataset": {
"first_token_accuracy": ["ALL"],
"single_choice_accuracy": ["ALL"],
"accuracy": ["ALL"],
"perplexity": ["ALL"],
"ppl_score_over_choices": ["ALL"],
"ppl_score": ["ALL"],
},
"mtbench": {"mtbench_single_judge": ["ALL"]},
"cvalues": {"first_token_accuracy": ["ALL"]},
"safetybench_zh": {"first_token_accuracy": ["ALL"]},
"safetybench_en": {"first_token_accuracy": ["ALL"]},
"gsm": {
"MTBenchDataset": {"mtbench_single_judge": ["ALL"]},
"CValuesDataset": {"first_token_accuracy": ["ALL"]},
"SafetyBenchZHDataset": {"first_token_accuracy": ["ALL"]},
"SafetyBenchENDataset": {"first_token_accuracy": ["ALL"]},
"GSMDataset": {
"loss_over_all_tokens": ["ALL"],
"gsm_accuracy": ["ALL"],
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,13 @@
logger = get_dist_logger()


def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None:
def rm_and_merge(
dp_size: int,
save_path: str,
model_names: List[str],
dataset_names: Dict[str, List],
dataset_classes: Dict[str, List],
) -> None:
"""
Remove inference result per rank and merge them into one file.

Expand All @@ -24,11 +30,15 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n
save_path: The folder for storing inference results.
model_names: Names of models for inference.
dataset_names: Names of dataset for inference.
dataset_classes: Name of the dataset class for each dataset's inference results. It is saved alongside the results so that the evaluation step can tell which dataset class produced them.

"""

for model_name in model_names:
for dataset_name, categories in dataset_names.items():
all_answers_with_dataset_class = {}
all_answers_with_dataset_class["dataset_class"] = dataset_classes[dataset_name]

all_answers = {}
for category in categories:
all_answers[category] = {"data": []}
Expand Down Expand Up @@ -58,8 +68,13 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n

all_answers[category] = answers

all_answers_with_dataset_class["inference_results"] = all_answers

logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.")
utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"))
utils.jdump(
all_answers_with_dataset_class,
os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"),
)

logger.info(f"Save inference results of model {model_name} for all dataset.")
logger.info(f"Save inference results of all models for all dataset.")
Expand Down Expand Up @@ -98,6 +113,7 @@ def main(args):
)

inference_data = {}
dataset_classes = {}
debug_args = {}
few_shot_args = {}
multiturn_args = {}
Expand Down Expand Up @@ -128,6 +144,7 @@ def main(args):

continue

dataset_classes[dataset_name] = dataset_parameter["dataset_class"]
dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}")
if not issubclass(dataset_class, dataset.BaseDataset):
raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.")
Expand All @@ -149,12 +166,14 @@ def main(args):
debug_args[new_dataset_name] = dataset_parameter["debug"]
few_shot_args[new_dataset_name] = dataset_parameter["few_shot"]
inference_data[new_dataset_name] = dataset_.dataset["train"]
dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"]

if load_reference and "reference" in dataset_.dataset:
new_dataset_name = f"{dataset_name}_reference"
debug_args[new_dataset_name] = dataset_parameter["debug"]
few_shot_args[new_dataset_name] = dataset_parameter["few_shot"]
inference_data[new_dataset_name] = dataset_.dataset["reference"]
dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"]

if rank == 0:
logger.info(f"Dataset for inference are: {list(inference_data.keys())}")
Expand Down Expand Up @@ -225,7 +244,7 @@ def main(args):
if rank == 0:
model_names = [model_parameter["name"] for model_parameter in model_parameters]
dataset_names = {key: list(inference_data[key].keys()) for key in inference_data}
rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names)
rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names, dataset_classes)


if __name__ == "__main__":
Expand Down
25 changes: 22 additions & 3 deletions applications/ColossalEval/examples/gpt_evaluation/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,13 @@
logger = get_dist_logger()


def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None:
def rm_and_merge(
dp_size: int,
save_path: str,
model_names: List[str],
dataset_names: Dict[str, List],
dataset_classes: Dict[str, List],
) -> None:
"""
Remove inference result per rank and merge them into one file.

Expand All @@ -24,11 +30,15 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n
save_path: The folder for storing inference results.
model_names: Names of models for inference.
dataset_names: Names of dataset for inference.
dataset_classes: Name of the dataset class for each dataset's inference results. It is saved alongside the results so that the evaluation step can tell which dataset class produced them.

"""

for model_name in model_names:
for dataset_name, categories in dataset_names.items():
all_answers_with_dataset_class = {}
all_answers_with_dataset_class["dataset_class"] = dataset_classes[dataset_name]

all_answers = {}
for category in categories:
all_answers[category] = {"data": []}
Expand Down Expand Up @@ -58,8 +68,13 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n

all_answers[category] = answers

all_answers_with_dataset_class["inference_results"] = all_answers

logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.")
utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"))
utils.jdump(
all_answers_with_dataset_class,
os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"),
)

logger.info(f"Save inference results of model {model_name} for all dataset.")
logger.info(f"Save inference results of all models for all dataset.")
Expand Down Expand Up @@ -98,6 +113,7 @@ def main(args):
)

inference_data = {}
dataset_classes = {}
debug_args = {}
few_shot_args = {}
multiturn_args = {}
Expand Down Expand Up @@ -128,6 +144,7 @@ def main(args):

continue

dataset_classes[dataset_name] = dataset_parameter["dataset_class"]
dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}")
if not issubclass(dataset_class, dataset.BaseDataset):
raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.")
Expand All @@ -149,12 +166,14 @@ def main(args):
debug_args[new_dataset_name] = dataset_parameter["debug"]
few_shot_args[new_dataset_name] = dataset_parameter["few_shot"]
inference_data[new_dataset_name] = dataset_.dataset["train"]
dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"]

if load_reference and "reference" in dataset_.dataset:
new_dataset_name = f"{dataset_name}_reference"
debug_args[new_dataset_name] = dataset_parameter["debug"]
few_shot_args[new_dataset_name] = dataset_parameter["few_shot"]
inference_data[new_dataset_name] = dataset_.dataset["reference"]
dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"]

if rank == 0:
logger.info(f"Dataset for inference are: {list(inference_data.keys())}")
Expand Down Expand Up @@ -225,7 +244,7 @@ def main(args):
if rank == 0:
model_names = [model_parameter["name"] for model_parameter in model_parameters]
dataset_names = {key: list(inference_data[key].keys()) for key in inference_data}
rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names)
rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names, dataset_classes)


if __name__ == "__main__":
Expand Down