diff --git a/README.md b/README.md
index 8fa4dbe7f..ba5f698b8 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,9 @@
+
+
+
---
@@ -39,7 +42,10 @@ sample-by-sample results* to debug and see how your models stack-up.
## Available Tasks
-Lighteval supports **7,000+ evaluation tasks** across multiple domains and languages. Here's an overview of some *popular benchmarks*:
+Lighteval supports **1,000+ evaluation tasks** across multiple domains and
+languages. Use [this
+space](https://huggingface.co/spaces/SaylorTwift/benchmark_finder) to find what
+you need, or browse the overview of *popular benchmarks* below:
### 📚 **Knowledge**
@@ -62,7 +68,7 @@ Lighteval supports **7,000+ evaluation tasks** across multiple domains and langu
### 🌍 **Multilingual Evaluation**
- **Cross-lingual**: XTREME, Flores200 (200 languages), XCOPA, XQuAD
-- **Language-specific**:
+- **Language-specific**:
- **Arabic**: ArabicMMLU
- **Filipino**: FilBench
- **French**: IFEval-fr, GPQA-fr, BAC-fr
diff --git a/community_tasks/_template.py b/community_tasks/_template.py
deleted file mode 100644
index bfc7de505..000000000
--- a/community_tasks/_template.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-# ruff: noqa: F405, F403, F401
-"""
-Custom evaluation tasks for lighteval. Copy this file and complete it with the info for your task.
-
-This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
-
-Author:
-"""
-
-import numpy as np
-
-from lighteval.metrics.metrics import SampleLevelMetric
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc, SamplingMethod
-
-
-# DEFINE YOUR PROMPT FUNCTIONS
-# Define as many as you need for your different tasks
-def prompt_fn(line, task_name: str = None):
- """Defines how to go from a dataset line to a doc object.
- Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
- about what this function should do in the README.
- """
- return Doc(
- task_name=task_name,
- query="",
- choices=[""],
- gold_index=0,
- instruction="",
- )
-
-
-# EVAL WITH NO SUBSET ##
-# This is how you create a simple task (like hellaswag) which has one single subset
-# attached to it, and one evaluation possible.
-task = LightevalTaskConfig(
- name="myothertask",
- prompt_function=prompt_fn, # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
- suite=["community"],
- hf_repo="",
- hf_subset="default",
- hf_avail_splits=[],
- evaluation_splits=[],
- few_shots_split="",
- few_shots_select="",
- metrics=[], # select your metric in Metrics
-)
-
-# EVALS WITH SUBSET
-# This is how you create a subset task (like MMLU), which has several subset
-# each being its own evaluation task.
-
-# fmt: off
-SAMPLE_SUBSETS = [] # list of all the subsets to use for this eval
-# fmt: on
-
-
-class CustomSubsetTask(LightevalTaskConfig):
- def __init__(
- self,
- name,
- hf_subset,
- ):
- super().__init__(
- name=name,
- hf_subset=hf_subset,
- prompt_function=prompt_fn, # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
- hf_repo="",
- metrics=[custom_metric], # select your metric in Metrics or use your custom_metric
- hf_avail_splits=[],
- evaluation_splits=[],
- few_shots_split="",
- few_shots_select="",
- suite=["community"],
- generation_size=-1,
- stop_sequence=None,
- )
-
-
-# STORE YOUR EVALS
-SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
-TASKS_TABLE = SUBSET_TASKS + [task]
-
-
-# CUSTOM METRIC IF NEEDED
-custom_metric = SampleLevelMetric(
- metric_name="my_custom_metric_name",
- higher_is_better=True,
- category=SamplingMethod.GENERATIVE, # or LOGPROBS, PERPLEXITY, etc.
- sample_level_fn=lambda x: x, # how to compute score for one sample
- corpus_level_fn=np.mean, # aggregation
-)
diff --git a/community_tasks/aimo_evals.py b/community_tasks/aimo_evals.py
deleted file mode 100644
index 7895cabff..000000000
--- a/community_tasks/aimo_evals.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-# ruff: noqa: F405, F403, F401
-"""
-Task to evaluate LLMs on the training set of the Kaggle AIMO competition: https://www.kaggle.com/competitions/ai-mathematical-olympiad-prize
-"""
-
-from lighteval.metrics.metrics import Metrics
-from lighteval.metrics.normalizations import math_normalizer
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
-
-
-def aimo_prompt(line, task_name: str = None):
- return Doc(
- task_name=task_name,
- choices=[str(line["answer"])],
- gold_index=0,
- query=line["problem"],
- )
-
-
-task = LightevalTaskConfig(
- name="aimo_progress_prize_1",
- prompt_function=aimo_prompt,
- suite=["community"],
- hf_subset="",
- hf_repo="lighteval/aimo_progress_prize_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split="train",
- few_shots_select="sequential",
- metrics=[
- Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer})
- ],
- generation_size=2048,
- stop_sequence=None,
-)
-
-# STORE YOUR EVALS
-TASKS_TABLE = [task]
diff --git a/community_tasks/oz_evals.py b/community_tasks/oz_evals.py
deleted file mode 100644
index 61c762bef..000000000
--- a/community_tasks/oz_evals.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-# ruff: noqa: F405, F403, F401
-"""
-Custom evaluation tasks for lighteval.
-
-This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
-
-OZ Eval (sr. Opšte Znanje Evaluacija) dataset was created for the purposes of evaluating General Knowledge of LLM models in Serbian language.
-Data consists of 1k+ high-quality questions and answers which were used as part of entry exams at the Faculty of Philosophy and Faculty of Organizational Sciences, University of Belgrade.
-The exams test the General Knowledge of students and were used in the enrollment periods from 2003 to 2024.
-For more details and results see: https://huggingface.co/datasets/DjMel/oz-eval
-"""
-
-from lighteval.metrics.metrics import Metrics
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
-
-
-def prompt_fn_oz_eval_task(line, task_name: str = None):
- query_template = """Pitanje: {question}\n
- Ponuđeni odgovori:
- A. {choice_a}
- B. {choice_b}
- C. {choice_c}
- D. {choice_d}
- E. {choice_e}
-
- Krajnji odgovor:"""
-
- options = line["options"]
-
- query = query_template.format(
- question=line["questions"],
- choice_a=options[0],
- choice_b=options[1],
- choice_c=options[2],
- choice_d=options[3],
- choice_e=options[4],
- )
-
- choices = ["A", "B", "C", "D", "E"]
- return Doc(
- task_name=task_name,
- query=query,
- choices=choices,
- gold_index=choices.index(line["answer"]),
- )
-
-
-oz_eval_task = LightevalTaskConfig(
- name="serbian_evals:oz_task",
- prompt_function=prompt_fn_oz_eval_task,
- suite=["community"],
- hf_repo="DjMel/oz-eval",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- metrics=[Metrics.loglikelihood_acc],
- version=0,
-)
-
-
-# STORE YOUR EVALS
-TASKS_TABLE = [oz_eval_task]
diff --git a/community_tasks/slr_bench_requirements.txt b/community_tasks/slr_bench_requirements.txt
deleted file mode 100644
index 57953d68e..000000000
--- a/community_tasks/slr_bench_requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-evaluate
-swipl
diff --git a/docs/source/adding-a-custom-task.mdx b/docs/source/adding-a-custom-task.mdx
index a97a0fd42..52e6d4aa2 100644
--- a/docs/source/adding-a-custom-task.mdx
+++ b/docs/source/adding-a-custom-task.mdx
@@ -2,37 +2,17 @@
Lighteval provides a flexible framework for creating custom evaluation tasks. This guide explains how to create and integrate new tasks into the evaluation system.
-## Task Categories
-
-Before creating a custom task, consider which category it belongs to:
-
-### Core Evaluations
-Core evaluations are evaluations that only require standard logic in their
-metrics and processing, and that we will add to our test suite to ensure non-regression through time. They already see high usage in the community.
-
-### Extended Evaluations
-Extended evaluations are evaluations that require custom logic in their
-metrics (complex normalization, an LLM as a judge, etc.), that we added to
-facilitate the life of users. They already see high usage in the community.
-
-### Community Evaluations
-Community evaluations are submissions by the community of new tasks.
-
-A popular community evaluation can move to become an extended or core evaluation over time.
-
-> [!TIP]
-> You can find examples of custom tasks in the [community_tasks](https://github.com/huggingface/lighteval/tree/main/community_tasks) directory.
-
-## Step-by-Step Creation of a Custom Task
+## Step-by-Step Creation of a Task
> [!WARNING]
-> To contribute your custom task to the Lighteval repository, you would first need
+> To contribute your task to the Lighteval repository, you would first need
> to install the required dev dependencies by running `pip install -e .[dev]`
> and then run `pre-commit install` to install the pre-commit hooks.
### Step 1: Create the Task File
-First, create a Python file under the `community_tasks` directory.
+First, create a Python file or directory under the `src/lighteval/tasks/tasks` directory.
+A directory is helpful if you need to split your task across multiple files; just make sure one of them is named `main.py`.
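+
+For example, a task split across multiple files might be laid out like this
+(`my_task/` and `prompts.py` are illustrative names; only `main.py` is required):
+
+```text
+src/lighteval/tasks/tasks/my_task/
+├── main.py      # entry point; defines the task configs
+└── prompts.py   # optional helpers such as prompt functions
+```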
### Step 2: Define the Prompt Function
@@ -135,12 +115,12 @@ class CustomSubsetTask(LightevalTaskConfig):
evaluation_splits=["test"],
few_shots_split="train",
few_shots_select="random_sampling_from_train",
- suite=["community"],
+ suite=["lighteval"],
generation_size=256,
stop_sequence=["\n", "Question:"],
)
-SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
+SUBSET_TASKS = [CustomSubsetTask(name=f"task:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
```
### Step 5: Add Tasks to the Table
@@ -169,7 +149,7 @@ Once your file is created, you can run the evaluation with the following command
```bash
lighteval accelerate \
"model_name=HuggingFaceH4/zephyr-7b-beta" \
- "community|{custom_task}|{fewshots}" \
+ "lighteval|{task}|{fewshots}" \
--custom-tasks {path_to_your_custom_task_file}
```
@@ -179,12 +159,12 @@ lighteval accelerate \
# Run a custom task with zero-shot evaluation
lighteval accelerate \
"model_name=openai-community/gpt2" \
- "community|myothertask|0" \
+ "lighteval|myothertask|0" \
--custom-tasks community_tasks/my_custom_task.py
# Run a custom task with few-shot evaluation
lighteval accelerate \
"model_name=openai-community/gpt2" \
- "community|myothertask|3" \
+ "lighteval|myothertask|3" \
--custom-tasks community_tasks/my_custom_task.py
```
diff --git a/docs/source/available-tasks.mdx b/docs/source/available-tasks.mdx
index 2acb4ef95..450b7ed49 100644
--- a/docs/source/available-tasks.mdx
+++ b/docs/source/available-tasks.mdx
@@ -1,8 +1,12 @@
-# Available Tasks
-## Discovering Available Tasks
+
+
-### List All Tasks
You can get a list of all available tasks by running:
@@ -10,8 +14,6 @@ You can get a list of all available tasks by running:
lighteval tasks list
```
-This command will display all tasks organized by their suites (e.g., leaderboard, lighteval, community).
-
### Inspect Specific Tasks
You can inspect a specific task to see its configuration, metrics, and requirements by running:
@@ -22,5 +24,5 @@ lighteval tasks inspect
For example:
```bash
-lighteval tasks inspect "leaderboard|truthfulqa:mc|0"
+lighteval tasks inspect "lighteval|truthfulqa:mc|0"
```
diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx
index d93af7078..e22ed3223 100644
--- a/docs/source/quicktour.mdx
+++ b/docs/source/quicktour.mdx
@@ -7,6 +7,16 @@
Lighteval can be used with several different commands, each optimized for different evaluation scenarios.
+
+## Find your benchmark
+
+Use [this space](https://huggingface.co/spaces/SaylorTwift/benchmark_finder) to
+find the benchmark you need.
+
## Available Commands
### Evaluation Backends
diff --git a/examples/custom_models/google_translate_model.py b/examples/custom_models/google_translate_model.py
index 04493fe35..1fe456900 100644
--- a/examples/custom_models/google_translate_model.py
+++ b/examples/custom_models/google_translate_model.py
@@ -110,7 +110,6 @@ def greedy_until(
Args:
requests (list[Request]): list of requests containing the context and ending conditions.
- override_bs (int, optional): Override the batch size for generation. Defaults to None.
Returns:
list[ModelResponse]: list of generated responses.
diff --git a/examples/custom_tasks_tests.py b/examples/custom_tasks_tests.py
index 34c871cd5..1a189c177 100644
--- a/examples/custom_tasks_tests.py
+++ b/examples/custom_tasks_tests.py
@@ -26,8 +26,8 @@
gsm8k_test = LightevalTaskConfig(
- name="gsm8k",
- suite=["test"],
+ name="gsm8k_test",
+ suite=["lighteval"],
prompt_function=prompt.gsm8k,
hf_repo="gsm8k",
hf_subset="main",
@@ -42,8 +42,8 @@
)
gpqa_diamond_test = LightevalTaskConfig(
- name="gpqa:diamond",
- suite=["test"],
+ name="gpqa:diamond_test",
+ suite=["lighteval"],
prompt_function=prompt.gpqa_instruct,
hf_repo="Idavidrein/gpqa",
hf_subset="gpqa_diamond",
diff --git a/examples/test_tasks.txt b/examples/test_tasks.txt
index 12c8662a9..14f847f06 100644
--- a/examples/test_tasks.txt
+++ b/examples/test_tasks.txt
@@ -1,8 +1,8 @@
-leaderboard|arc:challenge|25
-leaderboard|truthfulqa:mc|0
-leaderboard|hellaswag|10
-leaderboard|mmlu:college_chemistry|5
-leaderboard|mmlu:us_foreign_policy|5
+lighteval|arc:challenge|25
+lighteval|truthfulqa:mc|0
+lighteval|hellaswag|10
+lighteval|mmlu:college_chemistry|5
+lighteval|mmlu:us_foreign_policy|5
lighteval|agieval:aqua-rat|0
lighteval|agieval:logiqa-en|0
lighteval|agieval:lsat-ar|0
@@ -10,18 +10,18 @@ lighteval|agieval:lsat-lr|0
lighteval|agieval:lsat-rc|0
lighteval|agieval:sat-en-without-passage|0
lighteval|agieval:sat-en|0
-lighteval|bigbench:causal_judgment|3
-lighteval|bigbench:date_understanding|3
-lighteval|bigbench:disambiguation_qa|3
-lighteval|bigbench:geometric_shapes|3
-lighteval|bigbench:logical_deduction_five_objects|3
-lighteval|bigbench:logical_deduction_seven_objects|3
-lighteval|bigbench:movie_recommendation|3
-lighteval|bigbench:navigate|3
-lighteval|bigbench:ruin_names|3
-lighteval|bigbench:salient_translation_error_detection|3
-lighteval|bigbench:snarks|3
-lighteval|bigbench:temporal_sequences|3
-lighteval|bigbench:tracking_shuffled_objects_five_objects|3
-lighteval|bigbench:tracking_shuffled_objects_seven_objects|3
-test|gsm8k|0
+lighteval|bigbench_hard:causal_judgment|3
+lighteval|bigbench_hard:date_understanding|3
+lighteval|bigbench_hard:disambiguation_qa|3
+lighteval|bigbench_hard:geometric_shapes|3
+lighteval|bigbench_hard:logical_deduction_five_objects|3
+lighteval|bigbench_hard:logical_deduction_seven_objects|3
+lighteval|bigbench_hard:movie_recommendation|3
+lighteval|bigbench_hard:navigate|3
+lighteval|bigbench_hard:ruin_names|3
+lighteval|bigbench_hard:salient_translation_error_detection|3
+lighteval|bigbench_hard:snarks|3
+lighteval|bigbench_hard:temporal_sequences|3
+lighteval|bigbench_hard:tracking_shuffled_objects_five_objects|3
+lighteval|bigbench_hard:tracking_shuffled_objects_seven_objects|3
+lighteval|gsm8k_test|0
diff --git a/pyproject.toml b/pyproject.toml
index 45b88d1f2..a89024487 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ line-length = 119
[tool.ruff.lint]
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
# Never enforce `E501` (line length violations).
-ignore = ["E501", "D100", "D101", "D102", "D103", "D104", "D415", "D105", "DOC501", "DOC201"]
+ignore = ["E501", "D100", "D101", "D102", "D103", "D104", "D415", "D105", "DOC501", "DOC201", "CPY001"]
select = ["C", "E", "F", "I", "W", "CPY", "D417", "DOC"]
preview = true
@@ -108,7 +108,8 @@ extended_tasks = [
"langdetect", # ifeval
"openai>1.87", # llm as a judge using openai models
"tiktoken",
- "emoji", "spacy", "syllapy" # ifbench
+ "emoji", "spacy", "syllapy", # ifbench
+ "evaluate", # slr_bench
]
s3 = ["s3fs"]
multilingual = [
diff --git a/src/lighteval/cli_args.py b/src/lighteval/cli_args.py
index 30e85a1a9..a8123218f 100644
--- a/src/lighteval/cli_args.py
+++ b/src/lighteval/cli_args.py
@@ -113,6 +113,16 @@ class Arg:
default="[('', '')]",
)
+load_tasks_multilingual = Arg(
+ type=Annotated[
+ bool,
+ Option(
+ help="Whether to load multilingual tasks.",
+ rich_help_panel=HELP_PANEL_NAME_1,
+ ),
+ ],
+ default=False,
+)
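+# Note: with typer's default option naming, this argument surfaces on the CLI as
+# `--load-tasks-multilingual`, e.g. `lighteval tasks list --load-tasks-multilingual`.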
# Logging Parameters (HELP_PANEL_NAME_2)
output_dir = Arg(
diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py
index 3eca3b1c5..00fe25676 100644
--- a/src/lighteval/main_accelerate.py
+++ b/src/lighteval/main_accelerate.py
@@ -31,6 +31,7 @@
dataset_loading_processes,
job_id,
load_responses_from_details_date_id,
+ load_tasks_multilingual,
max_samples,
model_args,
num_fewshot_seeds,
@@ -59,8 +60,9 @@ def accelerate( # noqa C901
vision_model: Annotated[
bool, Option(help="Use vision model for evaluation.", rich_help_panel=HELP_PANEL_NAME_4)
] = False,
- dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
custom_tasks: custom_tasks.type = custom_tasks.default,
+ load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default,
+ dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default,
load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default,
remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default,
@@ -105,9 +107,10 @@ def accelerate( # noqa C901
)
pipeline_params = PipelineParameters(
launcher_type=ParallelismManager.ACCELERATE,
+ custom_tasks_directory=custom_tasks,
+ load_tasks_multilingual=load_tasks_multilingual,
job_id=job_id,
dataset_loading_processes=dataset_loading_processes,
- custom_tasks_directory=custom_tasks,
num_fewshot_seeds=num_fewshot_seeds,
max_samples=max_samples,
remove_reasoning_tags=remove_reasoning_tags,
diff --git a/src/lighteval/main_baseline.py b/src/lighteval/main_baseline.py
index f082af726..2ba82095c 100644
--- a/src/lighteval/main_baseline.py
+++ b/src/lighteval/main_baseline.py
@@ -24,6 +24,7 @@
from lighteval.cli_args import (
custom_tasks,
dataset_loading_processes,
+ load_tasks_multilingual,
max_samples,
output_dir,
tasks,
@@ -32,8 +33,9 @@
def baseline(
tasks: tasks.type,
- custom_tasks: custom_tasks.type = custom_tasks.default,
+ load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default,
dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
+ custom_tasks: custom_tasks.type = custom_tasks.default,
output_dir: output_dir.type = output_dir.default,
max_samples: max_samples.type = max_samples.default,
):
@@ -55,7 +57,7 @@ def baseline(
from lighteval.tasks.requests import SamplingMethod
from lighteval.utils.utils import as_list
- registry = Registry(tasks=tasks, custom_tasks=custom_tasks)
+ registry = Registry(tasks=tasks, custom_tasks=custom_tasks, load_multilingual=load_tasks_multilingual)
tasks_dict: dict[str, LightevalTask] = registry.load_tasks()
evaluation_tracker = EvaluationTracker(
diff --git a/src/lighteval/main_custom.py b/src/lighteval/main_custom.py
index 1cef8f3dc..e6124ce62 100644
--- a/src/lighteval/main_custom.py
+++ b/src/lighteval/main_custom.py
@@ -29,6 +29,7 @@
custom_tasks,
dataset_loading_processes,
job_id,
+ load_tasks_multilingual,
max_samples,
num_fewshot_seeds,
output_dir,
@@ -55,9 +56,10 @@ def custom(
model_definition_file_path: Annotated[str, Argument(help="The model definition file path to evaluate")],
tasks: tasks.type,
# === Common parameters ===
+ load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default,
dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
- custom_tasks: custom_tasks.type = custom_tasks.default,
num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default,
+ custom_tasks: custom_tasks.type = custom_tasks.default,
remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default,
reasoning_tags: reasoning_tags.type = reasoning_tags.default,
# === saving ===
@@ -102,6 +104,7 @@ def custom(
max_samples=max_samples,
remove_reasoning_tags=remove_reasoning_tags,
reasoning_tags=reasoning_tags,
+ load_tasks_multilingual=load_tasks_multilingual,
)
pipeline = Pipeline(
tasks=tasks,
diff --git a/src/lighteval/main_endpoint.py b/src/lighteval/main_endpoint.py
index 060b93822..ece2ac430 100644
--- a/src/lighteval/main_endpoint.py
+++ b/src/lighteval/main_endpoint.py
@@ -31,6 +31,7 @@
dataset_loading_processes,
job_id,
load_responses_from_details_date_id,
+ load_tasks_multilingual,
max_samples,
num_fewshot_seeds,
output_dir,
@@ -65,6 +66,7 @@ def inference_endpoint(
),
] = False,
# === Common parameters ===
+ load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default,
dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
custom_tasks: custom_tasks.type = custom_tasks.default,
num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default,
@@ -121,6 +123,7 @@ def inference_endpoint(
load_responses_from_details_date_id=load_responses_from_details_date_id,
remove_reasoning_tags=remove_reasoning_tags,
reasoning_tags=reasoning_tags,
+ load_tasks_multilingual=load_tasks_multilingual,
)
pipeline = Pipeline(
tasks=tasks,
@@ -148,6 +151,7 @@ def tgi(
],
tasks: tasks.type,
# === Common parameters ===
+ load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default,
dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
custom_tasks: custom_tasks.type = custom_tasks.default,
num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default,
@@ -193,6 +197,7 @@ def tgi(
pipeline_params = PipelineParameters(
launcher_type=parallelism_manager,
+ load_tasks_multilingual=load_tasks_multilingual,
job_id=job_id,
dataset_loading_processes=dataset_loading_processes,
custom_tasks_directory=custom_tasks,
@@ -231,9 +236,10 @@ def litellm(
],
tasks: tasks.type,
# === Common parameters ===
+ load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default,
dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
- custom_tasks: custom_tasks.type = custom_tasks.default,
num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default,
+ custom_tasks: custom_tasks.type = custom_tasks.default,
load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default,
remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default,
reasoning_tags: reasoning_tags.type = reasoning_tags.default,
@@ -285,6 +291,7 @@ def litellm(
pipeline_params = PipelineParameters(
launcher_type=parallelism_manager,
+ load_tasks_multilingual=load_tasks_multilingual,
job_id=job_id,
dataset_loading_processes=dataset_loading_processes,
custom_tasks_directory=custom_tasks,
@@ -324,6 +331,7 @@ def inference_providers(
],
tasks: tasks.type,
# === Common parameters ===
+ load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default,
dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
custom_tasks: custom_tasks.type = custom_tasks.default,
num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default,
@@ -373,6 +381,7 @@ def inference_providers(
pipeline_params = PipelineParameters(
launcher_type=parallelism_manager,
+ load_tasks_multilingual=load_tasks_multilingual,
job_id=job_id,
dataset_loading_processes=dataset_loading_processes,
custom_tasks_directory=custom_tasks,
diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py
index b844a74a4..9399e82cd 100644
--- a/src/lighteval/main_nanotron.py
+++ b/src/lighteval/main_nanotron.py
@@ -29,6 +29,7 @@
from yaml import SafeLoader
from lighteval.cli_args import (
+ load_tasks_multilingual,
reasoning_tags,
remove_reasoning_tags,
)
@@ -44,6 +45,7 @@ def nanotron(
str, Option(help="Path to the nanotron checkpoint YAML or python config file, potentially on s3.")
],
lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")],
+ load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default,
remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default,
reasoning_tags: reasoning_tags.type = reasoning_tags.default,
):
@@ -102,6 +104,7 @@ def nanotron(
max_samples=lighteval_config.tasks.max_samples,
remove_reasoning_tags=remove_reasoning_tags,
reasoning_tags=reasoning_tags,
+ load_tasks_multilingual=load_tasks_multilingual,
)
pipeline = Pipeline(
diff --git a/src/lighteval/main_sglang.py b/src/lighteval/main_sglang.py
index 0b506988e..ab86349f9 100644
--- a/src/lighteval/main_sglang.py
+++ b/src/lighteval/main_sglang.py
@@ -25,6 +25,7 @@
dataset_loading_processes,
job_id,
load_responses_from_details_date_id,
+ load_tasks_multilingual,
max_samples,
model_args,
num_fewshot_seeds,
@@ -47,6 +48,7 @@ def sglang(
model_args: model_args.type,
tasks: tasks.type,
# === Common parameters ===
+ load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default,
dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default,
custom_tasks: custom_tasks.type = custom_tasks.default,
num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default,
@@ -91,6 +93,7 @@ def sglang(
pipeline_params = PipelineParameters(
launcher_type=ParallelismManager.SGLANG,
job_id=job_id,
+ load_tasks_multilingual=load_tasks_multilingual,
dataset_loading_processes=dataset_loading_processes,
custom_tasks_directory=custom_tasks,
num_fewshot_seeds=num_fewshot_seeds,
diff --git a/src/lighteval/main_tasks.py b/src/lighteval/main_tasks.py
index 62f1129f4..230359730 100644
--- a/src/lighteval/main_tasks.py
+++ b/src/lighteval/main_tasks.py
@@ -25,7 +25,7 @@
from typer import Argument, Option
from typing_extensions import Annotated
-from lighteval.cli_args import custom_tasks
+from lighteval.cli_args import custom_tasks, load_tasks_multilingual
app = typer.Typer()
@@ -46,7 +46,7 @@ def inspect(
from lighteval.tasks.registry import Registry
- registry = Registry(custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True)
+ registry = Registry(custom_tasks=custom_tasks, load_multilingual=True)
# Loading task
task_dict = registry.load_tasks()
@@ -64,19 +64,14 @@ def inspect(
@app.command()
def list(
+ load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default,
custom_tasks: custom_tasks.type = custom_tasks.default,
- suites: Annotated[
- str | None,
- Option(
- help="Comma-separated list of suites to display (e.g., 'helm,harness'). Use 'all' for all suites. If not specified, shows core suites only."
- ),
- ] = None,
):
"""List all tasks"""
from lighteval.tasks.registry import Registry
- registry = Registry(custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True)
- registry.print_all_tasks(suites=suites)
+ registry = Registry(custom_tasks=custom_tasks, load_multilingual=load_tasks_multilingual)
+ registry.print_all_tasks()
@app.command()
diff --git a/src/lighteval/metrics/imports/bert_scorer.py b/src/lighteval/metrics/imports/bert_scorer.py
index b8025bf3f..db9c16c34 100644
--- a/src/lighteval/metrics/imports/bert_scorer.py
+++ b/src/lighteval/metrics/imports/bert_scorer.py
@@ -110,7 +110,7 @@ def get_bert_embedding(
Args:
all_sens (list of str): sentences to encode.
- model: a BERT model from `pytorch_pretrained_bert`.
+ model: a BERT model.
tokenizer: a BERT tokenizer corresponds to `model`.
idf_dict (dict): mapping a word piece index to its inverse document frequency.
batch_size (int): batch size for processing, -1 for all sentences.
@@ -330,7 +330,6 @@ def __init__(
`model_type` or `lang`.
num_layers (int): The layer of representation to use.
Default using the number of layer tuned on WMT16 correlation data.
- verbose (bool): Turn on intermediate status update.
idf (bool): A boolean to specify whether to use idf or not (this should be True even if `idf_sents` is given).
device (str): On which the contextual embedding model will be allocated on.
If this argument is None, the model lives on cuda:0 if cuda is available.
@@ -340,7 +339,6 @@ def __init__(
lang (str): Language of the sentences; has to specify
at least one of `model_type` or `lang`. `lang` needs to be
specified when `rescale_with_baseline` is True.
- return_hash (bool): Return hash code of the setting.
rescale_with_baseline (bool): Rescale bertscore with pre-computed baseline.
baseline_path (str): Customized baseline file.
"""
diff --git a/src/lighteval/metrics/utils/math_comparison.py b/src/lighteval/metrics/utils/math_comparison.py
index 2329acfe0..974d6d2cc 100644
--- a/src/lighteval/metrics/utils/math_comparison.py
+++ b/src/lighteval/metrics/utils/math_comparison.py
@@ -297,7 +297,7 @@ def is_equation(expr: Basic | MatrixBase) -> bool:
Args:
expr: The expression to check
Returns:
- bool: True if expr is an equation, False otherwise
+ True if expr is an equation, False otherwise
"""
if isinstance(expr, Eq):
return True
diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py
index 0f02c4b38..1f5da9c14 100644
--- a/src/lighteval/pipeline.py
+++ b/src/lighteval/pipeline.py
@@ -94,6 +94,7 @@ class PipelineParameters:
reasoning_tags: str | list[tuple[str, str]] = "[('', '')]"
load_responses_from_details_date_id: str | None = None
bootstrap_iters: int = 1000
+ load_tasks_multilingual: bool = False
def __post_init__(self): # noqa C901
if not isinstance(self.reasoning_tags, list):
@@ -210,7 +211,11 @@ def _init_tasks_and_requests(self, tasks: str):
logger.info("--- LOADING TASKS ---")
# The registry contains all the potential tasks
- self.registry = Registry(tasks=tasks, custom_tasks=self.pipeline_parameters.custom_tasks_directory)
+ self.registry = Registry(
+ tasks=tasks,
+ load_multilingual=self.pipeline_parameters.load_tasks_multilingual,
+ custom_tasks=self.pipeline_parameters.custom_tasks_directory,
+ )
# load the tasks from the configs and their datasets
self.tasks_dict: dict[str, LightevalTask] = self.registry.load_tasks()
diff --git a/src/lighteval/tasks/__init__.py b/src/lighteval/tasks/__init__.py
index a732db8d0..e3e34484b 100644
--- a/src/lighteval/tasks/__init__.py
+++ b/src/lighteval/tasks/__init__.py
@@ -19,3 +19,8 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
+
+"""
+Automatically imports all task configs from the tasks/ directory.
+This module dynamically loads all Python files in tasks/ and exposes their LightevalTaskConfig objects.
+"""
diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py
deleted file mode 100644
index 1c72d5008..000000000
--- a/src/lighteval/tasks/default_tasks.py
+++ /dev/null
@@ -1,22871 +0,0 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-import lighteval.tasks.default_prompts as prompt
-from lighteval.metrics.metrics import Metrics
-from lighteval.metrics.normalizations import (
- LogProbCharNorm,
- gsm8k_normalizer,
- harness_triviaqa_normalizer,
- helm_normalizer,
- math_normalizer,
-)
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.templates.qa import get_qa_prompt_function
-from lighteval.utils.language import Language
-
-
-mmmu_pro_standard_4_options = LightevalTaskConfig(
- name="mmmu_pro:standard-4",
- suite=["lighteval"],
- prompt_function=prompt.mmmu_pro,
- hf_repo="MMMU/MMMU_pro",
- hf_subset="standard (4 options)",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=30, # expected an answer in a format 'Answer: B'
- metrics=[Metrics.gpqa_instruct_metric],
- stop_sequence=None,
- version=0,
-)
-mmmu_pro_standard_10_options = LightevalTaskConfig(
- name="mmmu_pro:standard-10",
- suite=["lighteval"],
- prompt_function=prompt.mmmu_pro,
- hf_repo="MMMU/MMMU_pro",
- hf_subset="standard (10 options)",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=30, # expected an answer in a format 'Answer: B'
- metrics=[Metrics.gpqa_instruct_metric],
- stop_sequence=None,
- version=0,
-)
-mmmu_pro_vision = LightevalTaskConfig(
- name="mmmu_pro:vision",
- suite=["lighteval"],
- prompt_function=prompt.mmmu_pro_vision,
- hf_repo="MMMU/MMMU_pro",
- hf_subset="vision",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=30, # expected an answer in a format 'Answer: B'
- metrics=[Metrics.gpqa_instruct_metric],
- stop_sequence=None,
- version=0,
-)
-abstract_narrative_understanding_bigbench = LightevalTaskConfig(
- name="abstract_narrative_understanding",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="abstract_narrative_understanding",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-agieval_aqua_rat_lighteval = LightevalTaskConfig(
- name="agieval:aqua-rat",
- suite=["lighteval"],
- prompt_function=prompt.agieval,
- hf_repo="dmayhem93/agieval-aqua-rat",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=None,
- version=0,
-)
-agieval_gaokao_biology_lighteval = LightevalTaskConfig(
- name="agieval:gaokao-biology",
- suite=["lighteval"],
- prompt_function=prompt.agieval,
- hf_repo="dmayhem93/agieval-gaokao-biology",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=None,
- version=0,
-)
-agieval_gaokao_chemistry_lighteval = LightevalTaskConfig(
- name="agieval:gaokao-chemistry",
- suite=["lighteval"],
- prompt_function=prompt.agieval,
- hf_repo="dmayhem93/agieval-gaokao-chemistry",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=None,
- version=0,
-)
-agieval_gaokao_chinese_lighteval = LightevalTaskConfig(
- name="agieval:gaokao-chinese",
- suite=["lighteval"],
- prompt_function=prompt.agieval,
- hf_repo="dmayhem93/agieval-gaokao-chinese",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=None,
- version=0,
-)
-agieval_gaokao_english_lighteval = LightevalTaskConfig(
- name="agieval:gaokao-english",
- suite=["lighteval"],
- prompt_function=prompt.agieval,
- hf_repo="dmayhem93/agieval-gaokao-english",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=None,
- version=0,
-)
-agieval_gaokao_geography_lighteval = LightevalTaskConfig(
- name="agieval:gaokao-geography",
- suite=["lighteval"],
- prompt_function=prompt.agieval,
- hf_repo="dmayhem93/agieval-gaokao-geography",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=None,
- version=0,
-)
-agieval_gaokao_history_lighteval = LightevalTaskConfig(
- name="agieval:gaokao-history",
- suite=["lighteval"],
- prompt_function=prompt.agieval,
- hf_repo="dmayhem93/agieval-gaokao-history",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=None,
- version=0,
-)
-agieval_gaokao_mathqa_lighteval = LightevalTaskConfig(
- name="agieval:gaokao-mathqa",
- suite=["lighteval"],
- prompt_function=prompt.agieval,
- hf_repo="dmayhem93/agieval-gaokao-mathqa",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=None,
- version=0,
-)
-agieval_gaokao_physics_lighteval = LightevalTaskConfig(
- name="agieval:gaokao-physics",
- suite=["lighteval"],
- prompt_function=prompt.agieval,
- hf_repo="dmayhem93/agieval-gaokao-physics",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=None,
- version=0,
-)
-agieval_logiqa_en_lighteval = LightevalTaskConfig(
- name="agieval:logiqa-en",
- suite=["lighteval"],
- prompt_function=prompt.agieval,
- hf_repo="dmayhem93/agieval-logiqa-en",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=None,
- version=0,
-)
-agieval_logiqa_zh_lighteval = LightevalTaskConfig(
- name="agieval:logiqa-zh",
- suite=["lighteval"],
- prompt_function=prompt.agieval,
- hf_repo="dmayhem93/agieval-logiqa-zh",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=None,
- version=0,
-)
-agieval_lsat_ar_lighteval = LightevalTaskConfig(
- name="agieval:lsat-ar",
- suite=["lighteval"],
- prompt_function=prompt.agieval,
- hf_repo="dmayhem93/agieval-lsat-ar",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=None,
- version=0,
-)
-agieval_lsat_lr_lighteval = LightevalTaskConfig(
- name="agieval:lsat-lr",
- suite=["lighteval"],
- prompt_function=prompt.agieval,
- hf_repo="dmayhem93/agieval-lsat-lr",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=None,
- version=0,
-)
-agieval_lsat_rc_lighteval = LightevalTaskConfig(
- name="agieval:lsat-rc",
- suite=["lighteval"],
- prompt_function=prompt.agieval,
- hf_repo="dmayhem93/agieval-lsat-rc",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=None,
- version=0,
-)
-agieval_sat_en_lighteval = LightevalTaskConfig(
- name="agieval:sat-en",
- suite=["lighteval"],
- prompt_function=prompt.agieval,
- hf_repo="dmayhem93/agieval-sat-en",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=None,
- version=0,
-)
-agieval_sat_en_without_passage_lighteval = LightevalTaskConfig(
- name="agieval:sat-en-without-passage",
- suite=["lighteval"],
- prompt_function=prompt.agieval,
- hf_repo="dmayhem93/agieval-sat-en-without-passage",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=None,
- version=0,
-)
-agieval_sat_math_lighteval = LightevalTaskConfig(
- name="agieval:sat-math",
- suite=["lighteval"],
- prompt_function=prompt.agieval,
- hf_repo="dmayhem93/agieval-sat-math",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=None,
- version=0,
-)
-aime24 = LightevalTaskConfig(
- name="aime24",
- suite=["lighteval"],
- prompt_function=prompt.aime_prompt_fn,
- hf_repo="HuggingFaceH4/aime_2024",
- hf_subset="default",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.pass_at_k_math(sample_params={"k": 1}), Metrics.avg_at_n_math(sample_params={"n": 1})],
- version=2,
-)
-aime24_avg = LightevalTaskConfig(
- name="aime24_avg",
- suite=["lighteval"],
- prompt_function=prompt.aime_prompt_fn,
- hf_repo="HuggingFaceH4/aime_2024",
- hf_subset="default",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.avg_at_n_math(sample_params={"n": 64})],
- version=2,
-)
-aime24_gpassk = LightevalTaskConfig(
- name="aime24_gpassk",
- suite=["lighteval"],
- prompt_function=prompt.aime_prompt_fn,
- hf_repo="HuggingFaceH4/aime_2024",
- hf_subset="default",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8192,
- metrics=[Metrics.g_pass_at_k_math(sample_params={"k": 16, "n": 48})],
- version=1,
-)
-aime25 = LightevalTaskConfig(
- name="aime25",
- suite=["lighteval"],
- prompt_function=prompt.aime_prompt_fn,
- hf_repo="yentinglin/aime_2025",
- hf_subset="default",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=10000,
- metrics=[Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1})],
- version=2,
-)
-aime25_gpassk = LightevalTaskConfig(
- name="aime25_gpassk",
- suite=["lighteval"],
- prompt_function=prompt.aime_prompt_fn,
- hf_repo="yentinglin/aime_2025",
- hf_subset="default",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8192,
- metrics=[Metrics.g_pass_at_k_math(sample_params={"k": 16, "n": 48})],
- version=1,
-)
-anachronisms_bigbench = LightevalTaskConfig(
- name="anachronisms",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="anachronisms",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-analogical_similarity_bigbench = LightevalTaskConfig(
- name="analogical_similarity",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="analogical_similarity",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-analytic_entailment_bigbench = LightevalTaskConfig(
- name="analytic_entailment",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="analytic_entailment",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-anli_r1_lighteval = LightevalTaskConfig(
- name="anli:r1",
- suite=["lighteval", "anli"],
- prompt_function=prompt.anli,
- hf_repo="anli",
- hf_subset="plain_text",
- hf_avail_splits=["train_r1", "dev_r1", "test_r1"],
- evaluation_splits=["test_r1"],
- few_shots_split="train_r1",
- few_shots_select="random_sampling_from_train",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-anli_r2_lighteval = LightevalTaskConfig(
- name="anli:r2",
- suite=["lighteval", "anli"],
- prompt_function=prompt.anli,
- hf_repo="anli",
- hf_subset="plain_text",
- hf_avail_splits=["train_r2", "dev_r2", "test_r2"],
- evaluation_splits=["test_r2"],
- few_shots_split="train_r2",
- few_shots_select="random_sampling_from_train",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-anli_r3_lighteval = LightevalTaskConfig(
- name="anli:r3",
- suite=["lighteval", "anli"],
- prompt_function=prompt.anli,
- hf_repo="anli",
- hf_subset="plain_text",
- hf_avail_splits=["train_r3", "dev_r3", "test_r3"],
- evaluation_splits=["test_r3"],
- few_shots_split="train_r3",
- few_shots_select="random_sampling_from_train",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-arc_agi_2 = LightevalTaskConfig(
- name="arc_agi_2",
- suite=["lighteval"],
- prompt_function=prompt.arc_agi_2,
- hf_repo="arc-agi-community/arc-agi-2",
- hf_subset="default",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=2048,
- metrics=[Metrics.exact_match],
- stop_sequence=None,
- version=0,
-)
-arc_c_letters_original = LightevalTaskConfig(
- name="arc:c:letters",
- suite=["original", "arc"],
- prompt_function=prompt.arc_with_options_letters_predict,
- hf_repo="ai2_arc",
- hf_subset="ARC-Challenge",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc, Metrics.exact_match],
- stop_sequence=["\n"],
- version=0,
-)
-arc_c_options_original = LightevalTaskConfig(
- name="arc:c:options",
- suite=["original", "arc"],
- prompt_function=prompt.arc_with_options,
- hf_repo="ai2_arc",
- hf_subset="ARC-Challenge",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-arc_c_simple_original = LightevalTaskConfig(
- name="arc:c:simple",
- suite=["original", "arc"],
- prompt_function=prompt.arc,
- hf_repo="ai2_arc",
- hf_subset="ARC-Challenge",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-arc_challenge_leaderboard = LightevalTaskConfig(
- name="arc:challenge",
- suite=["leaderboard", "arc"],
- prompt_function=prompt.arc,
- hf_repo="ai2_arc",
- hf_subset="ARC-Challenge",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling_from_train",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-arc_easy_lighteval = LightevalTaskConfig(
- name="arc:easy",
- suite=["lighteval", "arc"],
- prompt_function=prompt.arc,
- hf_repo="ai2_arc",
- hf_subset="ARC-Easy",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling_from_train",
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-arithmetic_1dc_lighteval = LightevalTaskConfig(
- name="arithmetic:1dc",
- suite=["lighteval", "arithmetic"],
- prompt_function=prompt.arithmetic,
- hf_repo="EleutherAI/arithmetic",
- hf_subset="arithmetic_1dc",
- hf_avail_splits=["validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.acc_golds_likelihood],
- stop_sequence=["\n"],
- version=0,
-)
-arithmetic_2da_lighteval = LightevalTaskConfig(
- name="arithmetic:2da",
- suite=["lighteval", "arithmetic"],
- prompt_function=prompt.arithmetic,
- hf_repo="EleutherAI/arithmetic",
- hf_subset="arithmetic_2da",
- hf_avail_splits=["validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.acc_golds_likelihood],
- stop_sequence=["\n"],
- version=0,
-)
-arithmetic_2dm_lighteval = LightevalTaskConfig(
- name="arithmetic:2dm",
- suite=["lighteval", "arithmetic"],
- prompt_function=prompt.arithmetic,
- hf_repo="EleutherAI/arithmetic",
- hf_subset="arithmetic_2dm",
- hf_avail_splits=["validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.acc_golds_likelihood],
- stop_sequence=["\n"],
- version=0,
-)
-arithmetic_2ds_lighteval = LightevalTaskConfig(
- name="arithmetic:2ds",
- suite=["lighteval", "arithmetic"],
- prompt_function=prompt.arithmetic,
- hf_repo="EleutherAI/arithmetic",
- hf_subset="arithmetic_2ds",
- hf_avail_splits=["validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.acc_golds_likelihood],
- stop_sequence=["\n"],
- version=0,
-)
-arithmetic_3da_lighteval = LightevalTaskConfig(
- name="arithmetic:3da",
- suite=["lighteval", "arithmetic"],
- prompt_function=prompt.arithmetic,
- hf_repo="EleutherAI/arithmetic",
- hf_subset="arithmetic_3da",
- hf_avail_splits=["validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.acc_golds_likelihood],
- stop_sequence=["\n"],
- version=0,
-)
-arithmetic_3ds_lighteval = LightevalTaskConfig(
- name="arithmetic:3ds",
- suite=["lighteval", "arithmetic"],
- prompt_function=prompt.arithmetic,
- hf_repo="EleutherAI/arithmetic",
- hf_subset="arithmetic_3ds",
- hf_avail_splits=["validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.acc_golds_likelihood],
- stop_sequence=["\n"],
- version=0,
-)
-arithmetic_4da_lighteval = LightevalTaskConfig(
- name="arithmetic:4da",
- suite=["lighteval", "arithmetic"],
- prompt_function=prompt.arithmetic,
- hf_repo="EleutherAI/arithmetic",
- hf_subset="arithmetic_4da",
- hf_avail_splits=["validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.acc_golds_likelihood],
- stop_sequence=["\n"],
- version=0,
-)
-arithmetic_4ds_lighteval = LightevalTaskConfig(
- name="arithmetic:4ds",
- suite=["lighteval", "arithmetic"],
- prompt_function=prompt.arithmetic,
- hf_repo="EleutherAI/arithmetic",
- hf_subset="arithmetic_4ds",
- hf_avail_splits=["validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.acc_golds_likelihood],
- stop_sequence=["\n"],
- version=0,
-)
-arithmetic_5da_lighteval = LightevalTaskConfig(
- name="arithmetic:5da",
- suite=["lighteval", "arithmetic"],
- prompt_function=prompt.arithmetic,
- hf_repo="EleutherAI/arithmetic",
- hf_subset="arithmetic_5da",
- hf_avail_splits=["validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.acc_golds_likelihood],
- stop_sequence=["\n"],
- version=0,
-)
-arithmetic_5ds_lighteval = LightevalTaskConfig(
- name="arithmetic:5ds",
- suite=["lighteval", "arithmetic"],
- prompt_function=prompt.arithmetic,
- hf_repo="EleutherAI/arithmetic",
- hf_subset="arithmetic_5ds",
- hf_avail_splits=["validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.acc_golds_likelihood],
- stop_sequence=["\n"],
- version=0,
-)
-arithmetic_bb_bigbench = LightevalTaskConfig(
- name="arithmetic_bb",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="arithmetic",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-ascii_word_recognition_bigbench = LightevalTaskConfig(
- name="ascii_word_recognition",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="ascii_word_recognition",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-asdiv_lighteval = LightevalTaskConfig(
- name="asdiv",
- suite=["lighteval"],
- prompt_function=prompt.asdiv,
- hf_repo="EleutherAI/asdiv",
- hf_subset="asdiv",
- hf_avail_splits=["validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.acc_golds_likelihood],
- stop_sequence=["\n"],
- version=0,
-)
-authorship_verification_bigbench = LightevalTaskConfig(
- name="authorship_verification",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="authorship_verification",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-auto_categorization_bigbench = LightevalTaskConfig(
- name="auto_categorization",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="auto_categorization",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.bleu],
- stop_sequence=["\n"],
- version=0,
-)
-auto_debugging_bigbench_lite = LightevalTaskConfig(
- name="auto_debugging",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_linefeed_before_and_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="auto_debugging",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=None,
- version=0,
-)
-babi_qa_helm = LightevalTaskConfig(
- name="babi_qa",
- suite=["helm"],
- prompt_function=prompt.babi_qa,
- hf_repo="facebook/babi_qa",
- hf_subset="en-valid-qa1",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_causal_judgment_lighteval = LightevalTaskConfig(
- name="bigbench:causal_judgment",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="causal_judgement",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_date_understanding_lighteval = LightevalTaskConfig(
- name="bigbench:date_understanding",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="date_understanding",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_disambiguation_qa_lighteval = LightevalTaskConfig(
- name="bigbench:disambiguation_qa",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="disambiguation_qa",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_geometric_shapes_lighteval = LightevalTaskConfig(
- name="bigbench:geometric_shapes",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="geometric_shapes",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_logical_deduction_five_objects_lighteval = LightevalTaskConfig(
- name="bigbench:logical_deduction_five_objects",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="logical_deduction_five_objects",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_logical_deduction_seven_objects_lighteval = LightevalTaskConfig(
- name="bigbench:logical_deduction_seven_objects",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="logical_deduction_seven_objects",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_logical_deduction_three_objects_lighteval = LightevalTaskConfig(
- name="bigbench:logical_deduction_three_objects",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="logical_deduction_three_objects",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_movie_recommendation_lighteval = LightevalTaskConfig(
- name="bigbench:movie_recommendation",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="movie_recommendation",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_navigate_lighteval = LightevalTaskConfig(
- name="bigbench:navigate",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="navigate",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_reasoning_about_colored_objects_lighteval = LightevalTaskConfig(
- name="bigbench:reasoning_about_colored_objects",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="reasoning_about_colored_objects",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_ruin_names_lighteval = LightevalTaskConfig(
- name="bigbench:ruin_names",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="ruin_names",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_salient_translation_error_detection_lighteval = LightevalTaskConfig(
- name="bigbench:salient_translation_error_detection",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="salient_translation_error_detection",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_snarks_lighteval = LightevalTaskConfig(
- name="bigbench:snarks",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="snarks",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_sports_understanding_lighteval = LightevalTaskConfig(
- name="bigbench:sports_understanding",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="sports_understanding",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_temporal_sequences_lighteval = LightevalTaskConfig(
- name="bigbench:temporal_sequences",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="temporal_sequences",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_tracking_shuffled_objects_five_objects_lighteval = LightevalTaskConfig(
- name="bigbench:tracking_shuffled_objects_five_objects",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="tracking_shuffled_objects_five_objects",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_tracking_shuffled_objects_seven_objects_lighteval = LightevalTaskConfig(
- name="bigbench:tracking_shuffled_objects_seven_objects",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="tracking_shuffled_objects_seven_objects",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_tracking_shuffled_objects_three_objects_lighteval = LightevalTaskConfig(
- name="bigbench:tracking_shuffled_objects_three_objects",
- suite=["lighteval"],
- prompt_function=prompt.bbh_lighteval,
- hf_repo="lighteval/bbh",
- hf_subset="tracking_shuffled_objects_three_objects",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bigbench_causal_judgment_harness = LightevalTaskConfig(
- name="bigbench:causal_judgment",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="causal_judgement",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bigbench_date_understanding_harness = LightevalTaskConfig(
- name="bigbench:date_understanding",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="date_understanding",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bigbench_disambiguation_qa_harness = LightevalTaskConfig(
- name="bigbench:disambiguation_qa",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="disambiguation_qa",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bigbench_geometric_shapes_harness = LightevalTaskConfig(
- name="bigbench:geometric_shapes",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="geometric_shapes",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bigbench_logical_deduction_five_objects_harness = LightevalTaskConfig(
- name="bigbench:logical_deduction_five_objects",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="logical_deduction_five_objects",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bigbench_logical_deduction_seven_objects_harness = LightevalTaskConfig(
- name="bigbench:logical_deduction_seven_objects",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="logical_deduction_seven_objects",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bigbench_logical_deduction_three_objects_harness = LightevalTaskConfig(
- name="bigbench:logical_deduction_three_objects",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="logical_deduction_three_objects",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bigbench_movie_recommendation_harness = LightevalTaskConfig(
- name="bigbench:movie_recommendation",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="movie_recommendation",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bigbench_navigate_harness = LightevalTaskConfig(
- name="bigbench:navigate",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="navigate",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bigbench_reasoning_about_colored_objects_harness = LightevalTaskConfig(
- name="bigbench:reasoning_about_colored_objects",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="reasoning_about_colored_objects",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bigbench_ruin_names_harness = LightevalTaskConfig(
- name="bigbench:ruin_names",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="ruin_names",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bigbench_salient_translation_error_detection_harness = LightevalTaskConfig(
- name="bigbench:salient_translation_error_detection",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="salient_translation_error_detection",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bigbench_snarks_harness = LightevalTaskConfig(
- name="bigbench:snarks",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="snarks",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bigbench_sports_understanding_harness = LightevalTaskConfig(
- name="bigbench:sports_understanding",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="sports_understanding",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bigbench_temporal_sequences_harness = LightevalTaskConfig(
- name="bigbench:temporal_sequences",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="temporal_sequences",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bigbench_tracking_shuffled_objects_five_objects_harness = LightevalTaskConfig(
- name="bigbench:tracking_shuffled_objects_five_objects",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="tracking_shuffled_objects_five_objects",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bigbench_tracking_shuffled_objects_seven_objects_harness = LightevalTaskConfig(
- name="bigbench:tracking_shuffled_objects_seven_objects",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="tracking_shuffled_objects_seven_objects",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bigbench_tracking_shuffled_objects_three_objects_harness = LightevalTaskConfig(
- name="bigbench:tracking_shuffled_objects_three_objects",
- suite=["harness"],
- prompt_function=prompt.bbh_harness,
- hf_repo="lighteval/bbh",
- hf_subset="tracking_shuffled_objects_three_objects",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- must_remove_duplicate_docs=True,
- version=0,
-)
-bbh_boolean_expressions_harness = LightevalTaskConfig(
- name="bbh:boolean_expressions",
- suite=["harness"],
- prompt_function=prompt.bbh_boolean_expressions,
- hf_repo="lukaemon/bbh",
- hf_subset="boolean_expressions",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_causal_judgment_harness = LightevalTaskConfig(
- name="bbh:causal_judgment",
- suite=["harness"],
- prompt_function=prompt.bbh_causal_judgment,
- hf_repo="lukaemon/bbh",
- hf_subset="causal_judgement",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_date_understanding_harness = LightevalTaskConfig(
- name="bbh:date_understanding",
- suite=["harness"],
- prompt_function=prompt.bbh_date_understanding,
- hf_repo="lukaemon/bbh",
- hf_subset="date_understanding",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_disambiguation_qa_harness = LightevalTaskConfig(
- name="bbh:disambiguation_qa",
- suite=["harness"],
- prompt_function=prompt.bbh_disambiguation_qa,
- hf_repo="lukaemon/bbh",
- hf_subset="disambiguation_qa",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_dyck_languages_harness = LightevalTaskConfig(
- name="bbh:dyck_languages",
- suite=["harness"],
- prompt_function=prompt.bbh_dyck_languages,
- hf_repo="lukaemon/bbh",
- hf_subset="dyck_languages",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_formal_fallacies_harness = LightevalTaskConfig(
- name="bbh:formal_fallacies",
- suite=["harness"],
- prompt_function=prompt.bbh_formal_fallacies,
- hf_repo="lukaemon/bbh",
- hf_subset="formal_fallacies",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_geometric_shapes_harness = LightevalTaskConfig(
- name="bbh:geometric_shapes",
- suite=["harness"],
- prompt_function=prompt.bbh_geometric_shapes,
- hf_repo="lukaemon/bbh",
- hf_subset="geometric_shapes",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_hyperbaton_harness = LightevalTaskConfig(
- name="bbh:hyperbaton",
- suite=["harness"],
- prompt_function=prompt.bbh_hyperbaton,
- hf_repo="lukaemon/bbh",
- hf_subset="hyperbaton",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_logical_deduction_five_objects_harness = LightevalTaskConfig(
- name="bbh:logical_deduction_five_objects",
- suite=["harness"],
- prompt_function=prompt.bbh_logical_deduction_five_objects,
- hf_repo="lukaemon/bbh",
- hf_subset="logical_deduction_five_objects",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_logical_deduction_seven_objects_harness = LightevalTaskConfig(
- name="bbh:logical_deduction_seven_objects",
- suite=["harness"],
- prompt_function=prompt.bbh_logical_deduction_seven_objects,
- hf_repo="lukaemon/bbh",
- hf_subset="logical_deduction_seven_objects",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_logical_deduction_three_objects_harness = LightevalTaskConfig(
- name="bbh:logical_deduction_three_objects",
- suite=["harness"],
- prompt_function=prompt.bbh_logical_deduction_three_objects,
- hf_repo="lukaemon/bbh",
- hf_subset="logical_deduction_three_objects",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_movie_recommendation_harness = LightevalTaskConfig(
- name="bbh:movie_recommendation",
- suite=["harness"],
- prompt_function=prompt.bbh_movie_recommendation,
- hf_repo="lukaemon/bbh",
- hf_subset="movie_recommendation",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_multistep_arithmetic_two_harness = LightevalTaskConfig(
- name="bbh:multistep_arithmetic_two",
- suite=["harness"],
- prompt_function=prompt.bbh_multistep_arithmetic_two,
- hf_repo="lukaemon/bbh",
- hf_subset="multistep_arithmetic_two",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_navigate_harness = LightevalTaskConfig(
- name="bbh:navigate",
- suite=["harness"],
- prompt_function=prompt.bbh_navigate,
- hf_repo="lukaemon/bbh",
- hf_subset="navigate",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_object_counting_harness = LightevalTaskConfig(
- name="bbh:object_counting",
- suite=["harness"],
- prompt_function=prompt.bbh_object_counting,
- hf_repo="lukaemon/bbh",
- hf_subset="object_counting",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_penguins_in_a_table_harness = LightevalTaskConfig(
- name="bbh:penguins_in_a_table",
- suite=["harness"],
- prompt_function=prompt.bbh_penguins_in_a_table,
- hf_repo="lukaemon/bbh",
- hf_subset="penguins_in_a_table",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_reasoning_about_colored_objects_harness = LightevalTaskConfig(
- name="bbh:reasoning_about_colored_objects",
- suite=["harness"],
- prompt_function=prompt.bbh_reasoning_about_colored_objects,
- hf_repo="lukaemon/bbh",
- hf_subset="reasoning_about_colored_objects",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_ruin_names_harness = LightevalTaskConfig(
- name="bbh:ruin_names",
- suite=["harness"],
- prompt_function=prompt.bbh_ruin_names,
- hf_repo="lukaemon/bbh",
- hf_subset="ruin_names",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_salient_translation_error_detection_harness = LightevalTaskConfig(
- name="bbh:salient_translation_error_detection",
- suite=["harness"],
- prompt_function=prompt.bbh_salient_translation_error_detection,
- hf_repo="lukaemon/bbh",
- hf_subset="salient_translation_error_detection",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_snarks_harness = LightevalTaskConfig(
- name="bbh:snarks",
- suite=["harness"],
- prompt_function=prompt.bbh_snarks,
- hf_repo="lukaemon/bbh",
- hf_subset="snarks",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_sports_understanding_harness = LightevalTaskConfig(
- name="bbh:sports_understanding",
- suite=["harness"],
- prompt_function=prompt.bbh_sports_understanding,
- hf_repo="lukaemon/bbh",
- hf_subset="sports_understanding",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_temporal_sequences_harness = LightevalTaskConfig(
- name="bbh:temporal_sequences",
- suite=["harness"],
- prompt_function=prompt.bbh_temporal_sequences,
- hf_repo="lukaemon/bbh",
- hf_subset="temporal_sequences",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_tracking_shuffled_objects_five_objects_harness = LightevalTaskConfig(
- name="bbh:tracking_shuffled_objects_five_objects",
- suite=["harness"],
- prompt_function=prompt.bbh_tracking_shuffled_objects_five_objects,
- hf_repo="lukaemon/bbh",
- hf_subset="tracking_shuffled_objects_five_objects",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_tracking_shuffled_objects_seven_objects_harness = LightevalTaskConfig(
- name="bbh:tracking_shuffled_objects_seven_objects",
- suite=["harness"],
- prompt_function=prompt.bbh_tracking_shuffled_objects_seven_objects,
- hf_repo="lukaemon/bbh",
- hf_subset="tracking_shuffled_objects_seven_objects",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_tracking_shuffled_objects_three_objects_harness = LightevalTaskConfig(
- name="bbh:tracking_shuffled_objects_three_objects",
- suite=["harness"],
- prompt_function=prompt.bbh_tracking_shuffled_objects_three_objects,
- hf_repo="lukaemon/bbh",
- hf_subset="tracking_shuffled_objects_three_objects",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_web_of_lies_harness = LightevalTaskConfig(
- name="bbh:web_of_lies",
- suite=["harness"],
- prompt_function=prompt.bbh_web_of_lies,
- hf_repo="lukaemon/bbh",
- hf_subset="web_of_lies",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbh_word_sorting_harness = LightevalTaskConfig(
- name="bbh:word_sorting",
- suite=["harness"],
- prompt_function=prompt.bbh_word_sorting,
- hf_repo="lukaemon/bbh",
- hf_subset="word_sorting",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["</s>", "Q=", "\n\n"],
- version=0,
-)
-bbq_helm = LightevalTaskConfig(
- name="bbq",
- suite=["helm"],
- prompt_function=prompt.bbq,
- hf_repo="lighteval/bbq_helm",
- hf_subset="all",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bbq_Age_helm = LightevalTaskConfig(
- name="bbq:Age",
- suite=["helm"],
- prompt_function=prompt.bbq,
- hf_repo="lighteval/bbq_helm",
- hf_subset="Age",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bbq_Disability_status_helm = LightevalTaskConfig(
- name="bbq:Disability_status",
- suite=["helm"],
- prompt_function=prompt.bbq,
- hf_repo="lighteval/bbq_helm",
- hf_subset="Disability_status",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bbq_Gender_identity_helm = LightevalTaskConfig(
- name="bbq:Gender_identity",
- suite=["helm"],
- prompt_function=prompt.bbq,
- hf_repo="lighteval/bbq_helm",
- hf_subset="Gender_identity",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bbq_Nationality_helm = LightevalTaskConfig(
- name="bbq:Nationality",
- suite=["helm"],
- prompt_function=prompt.bbq,
- hf_repo="lighteval/bbq_helm",
- hf_subset="Nationality",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bbq_Physical_appearance_helm = LightevalTaskConfig(
- name="bbq:Physical_appearance",
- suite=["helm"],
- prompt_function=prompt.bbq,
- hf_repo="lighteval/bbq_helm",
- hf_subset="Physical_appearance",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bbq_Race_ethnicity_helm = LightevalTaskConfig(
- name="bbq:Race_ethnicity",
- suite=["helm"],
- prompt_function=prompt.bbq,
- hf_repo="lighteval/bbq_helm",
- hf_subset="Race_ethnicity",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bbq_Race_x_SES_helm = LightevalTaskConfig(
- name="bbq:Race_x_SES",
- suite=["helm"],
- prompt_function=prompt.bbq,
- hf_repo="lighteval/bbq_helm",
- hf_subset="Race_x_SES",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bbq_Race_x_gender_helm = LightevalTaskConfig(
- name="bbq:Race_x_gender",
- suite=["helm"],
- prompt_function=prompt.bbq,
- hf_repo="lighteval/bbq_helm",
- hf_subset="Race_x_gender",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bbq_Religion_helm = LightevalTaskConfig(
- name="bbq:Religion",
- suite=["helm"],
- prompt_function=prompt.bbq,
- hf_repo="lighteval/bbq_helm",
- hf_subset="Religion",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bbq_SES_helm = LightevalTaskConfig(
- name="bbq:SES",
- suite=["helm"],
- prompt_function=prompt.bbq,
- hf_repo="lighteval/bbq_helm",
- hf_subset="SES",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bbq_Sexual_orientation_helm = LightevalTaskConfig(
- name="bbq:Sexual_orientation",
- suite=["helm"],
- prompt_function=prompt.bbq,
- hf_repo="lighteval/bbq_helm",
- hf_subset="Sexual_orientation",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bbq_lite_json_bigbench_lite = LightevalTaskConfig(
- name="bbq_lite_json",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="bbq_lite_json",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_auto_debugging_helm = LightevalTaskConfig(
- name="bigbench:auto_debugging",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="auto_debugging",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_age_ambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:age_ambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-age_ambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_age_disambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:age_disambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-age_disambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_disability_status_ambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:disability_status_ambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-disability_status_ambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_disability_status_disambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:disability_status_disambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-disability_status_disambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_gender_identity_ambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:gender_identity_ambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-gender_identity_ambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_gender_identity_disambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:gender_identity_disambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-gender_identity_disambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_nationality_ambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:nationality_ambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-nationality_ambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_nationality_disambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:nationality_disambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-nationality_disambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_physical_appearance_ambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:physical_appearance_ambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-physical_appearance_ambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_physical_appearance_disambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:physical_appearance_disambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-physical_appearance_disambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_race_ethnicity_ambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:race_ethnicity_ambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-race_ethnicity_ambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_race_ethnicity_disambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:race_ethnicity_disambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-race_ethnicity_disambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_religion_ambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:religion_ambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-religion_ambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_religion_disambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:religion_disambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-religion_disambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_ses_ambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:ses_ambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-ses_ambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_ses_disambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:ses_disambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-ses_disambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_sexual_orientation_ambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:sexual_orientation_ambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-sexual_orientation_ambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_bbq_lite_json_sexual_orientation_disambig_helm = LightevalTaskConfig(
- name="bigbench:bbq_lite_json:sexual_orientation_disambig",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="bbq_lite_json-sexual_orientation_disambig",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_code_line_description_helm = LightevalTaskConfig(
- name="bigbench:code_line_description",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="code_line_description",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conceptual_combinations_contradictions_helm = LightevalTaskConfig(
- name="bigbench:conceptual_combinations:contradictions",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conceptual_combinations-contradictions",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conceptual_combinations_emergent_properties_helm = LightevalTaskConfig(
- name="bigbench:conceptual_combinations:emergent_properties",
- suite=["helm"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conceptual_combinations-emergent_properties",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conceptual_combinations_fanciful_fictional_combinations_helm = LightevalTaskConfig(
- name="bigbench:conceptual_combinations:fanciful_fictional_combinations",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conceptual_combinations-fanciful_fictional_combinations",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conceptual_combinations_homonyms_helm = LightevalTaskConfig(
- name="bigbench:conceptual_combinations:homonyms",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conceptual_combinations-homonyms",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conceptual_combinations_invented_words_helm = LightevalTaskConfig(
- name="bigbench:conceptual_combinations:invented_words",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conceptual_combinations-invented_words",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conlang_translation_adna_from_helm = LightevalTaskConfig(
- name="bigbench:conlang_translation:adna_from",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conlang_translation-adna_from",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conlang_translation_adna_to_helm = LightevalTaskConfig(
- name="bigbench:conlang_translation:adna_to",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conlang_translation-adna_to",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conlang_translation_atikampe_from_helm = LightevalTaskConfig(
- name="bigbench:conlang_translation:atikampe_from",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conlang_translation-atikampe_from",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conlang_translation_atikampe_to_helm = LightevalTaskConfig(
- name="bigbench:conlang_translation:atikampe_to",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conlang_translation-atikampe_to",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conlang_translation_gornam_from_helm = LightevalTaskConfig(
- name="bigbench:conlang_translation:gornam_from",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conlang_translation-gornam_from",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conlang_translation_gornam_to_helm = LightevalTaskConfig(
- name="bigbench:conlang_translation:gornam_to",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conlang_translation-gornam_to",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conlang_translation_holuan_from_helm = LightevalTaskConfig(
- name="bigbench:conlang_translation:holuan_from",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conlang_translation-holuan_from",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conlang_translation_holuan_to_helm = LightevalTaskConfig(
- name="bigbench:conlang_translation:holuan_to",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conlang_translation-holuan_to",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conlang_translation_mkafala_from_helm = LightevalTaskConfig(
- name="bigbench:conlang_translation:mkafala_from",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conlang_translation-mkafala_from",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conlang_translation_mkafala_to_helm = LightevalTaskConfig(
- name="bigbench:conlang_translation:mkafala_to",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conlang_translation-mkafala_to",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conlang_translation_postpositive_english_from_helm = LightevalTaskConfig(
- name="bigbench:conlang_translation:postpositive_english_from",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conlang_translation-postpositive_english_from",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conlang_translation_postpositive_english_to_helm = LightevalTaskConfig(
- name="bigbench:conlang_translation:postpositive_english_to",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conlang_translation-postpositive_english_to",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conlang_translation_unapuri_from_helm = LightevalTaskConfig(
- name="bigbench:conlang_translation:unapuri_from",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conlang_translation-unapuri_from",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conlang_translation_unapuri_to_helm = LightevalTaskConfig(
- name="bigbench:conlang_translation:unapuri_to",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conlang_translation-unapuri_to",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conlang_translation_vaomi_from_helm = LightevalTaskConfig(
- name="bigbench:conlang_translation:vaomi_from",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conlang_translation-vaomi_from",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_conlang_translation_vaomi_to_helm = LightevalTaskConfig(
- name="bigbench:conlang_translation:vaomi_to",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="conlang_translation-vaomi_to",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_emoji_movie_helm = LightevalTaskConfig(
- name="bigbench:emoji_movie",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="emoji_movie",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_formal_fallacies_syllogisms_negation_helm = LightevalTaskConfig(
- name="bigbench:formal_fallacies_syllogisms_negation",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="formal_fallacies_syllogisms_negation",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_hindu_knowledge_helm = LightevalTaskConfig(
- name="bigbench:hindu_knowledge",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="hindu_knowledge",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_known_unknowns_helm = LightevalTaskConfig(
- name="bigbench:known_unknowns",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="known_unknowns",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_language_identification_helm = LightevalTaskConfig(
- name="bigbench:language_identification",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="language_identification",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_linguistics_puzzles_helm = LightevalTaskConfig(
- name="bigbench:linguistics_puzzles",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="linguistics_puzzles",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_logic_grid_puzzle_helm = LightevalTaskConfig(
- name="bigbench:logic_grid_puzzle",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="logic_grid_puzzle",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_logical_deduction_five_objects_helm = LightevalTaskConfig(
- name="bigbench:logical_deduction-five_objects",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="logical_deduction-five_objects",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_logical_deduction_seven_objects_helm = LightevalTaskConfig(
- name="bigbench:logical_deduction-seven_objects",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="logical_deduction-seven_objects",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_logical_deduction_three_objects_helm = LightevalTaskConfig(
- name="bigbench:logical_deduction-three_objects",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="logical_deduction-three_objects",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_misconceptions_russian_helm = LightevalTaskConfig(
- name="bigbench:misconceptions_russian",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="misconceptions_russian",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_novel_concepts_helm = LightevalTaskConfig(
- name="bigbench:novel_concepts",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="novel_concepts",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_operators_helm = LightevalTaskConfig(
- name="bigbench:operators",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="operators",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_parsinlu_reading_comprehension_helm = LightevalTaskConfig(
- name="bigbench:parsinlu_reading_comprehension",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="parsinlu_reading_comprehension",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_play_dialog_same_or_different_helm = LightevalTaskConfig(
- name="bigbench:play_dialog_same_or_different",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="play_dialog_same_or_different",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_repeat_copy_logic_helm = LightevalTaskConfig(
- name="bigbench:repeat_copy_logic",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="repeat_copy_logic",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_strange_stories_boolean_helm = LightevalTaskConfig(
- name="bigbench:strange_stories-boolean",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="strange_stories-boolean",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_strange_stories_multiple_choice_helm = LightevalTaskConfig(
- name="bigbench:strange_stories-multiple_choice",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="strange_stories-multiple_choice",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_strategyqa_helm = LightevalTaskConfig(
- name="bigbench:strategyqa",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="strategyqa",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_symbol_interpretation_adversarial_helm = LightevalTaskConfig(
- name="bigbench:symbol_interpretation-adversarial",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="symbol_interpretation-adversarial",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_symbol_interpretation_emoji_agnostic_helm = LightevalTaskConfig(
- name="bigbench:symbol_interpretation-emoji_agnostic",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="symbol_interpretation-emoji_agnostic",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_symbol_interpretation_name_agnostic_helm = LightevalTaskConfig(
- name="bigbench:symbol_interpretation-name_agnostic",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="symbol_interpretation-name_agnostic",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_symbol_interpretation_plain_helm = LightevalTaskConfig(
- name="bigbench:symbol_interpretation-plain",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="symbol_interpretation-plain",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_symbol_interpretation_tricky_helm = LightevalTaskConfig(
- name="bigbench:symbol_interpretation-tricky",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="symbol_interpretation-tricky",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_vitaminc_fact_verification_helm = LightevalTaskConfig(
- name="bigbench:vitaminc_fact_verification",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="vitaminc_fact_verification",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bigbench_winowhy_helm = LightevalTaskConfig(
- name="bigbench:winowhy",
- suite=["helm", "bigbench_scenario"],
- prompt_function=prompt.bigbench_helm,
- hf_repo="lighteval/bigbench_helm",
- hf_subset="winowhy",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_adjunct_island_lighteval = LightevalTaskConfig(
- name="blimp:adjunct_island",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="adjunct_island",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_adjunct_island_helm = LightevalTaskConfig(
- name="blimp:adjunct_island",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="adjunct_island",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_anaphor_gender_agreement_lighteval = LightevalTaskConfig(
- name="blimp:anaphor_gender_agreement",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="anaphor_gender_agreement",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_anaphor_gender_agreement_helm = LightevalTaskConfig(
- name="blimp:anaphor_gender_agreement",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="anaphor_gender_agreement",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_anaphor_number_agreement_lighteval = LightevalTaskConfig(
- name="blimp:anaphor_number_agreement",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="anaphor_number_agreement",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_anaphor_number_agreement_helm = LightevalTaskConfig(
- name="blimp:anaphor_number_agreement",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="anaphor_number_agreement",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_animate_subject_passive_lighteval = LightevalTaskConfig(
- name="blimp:animate_subject_passive",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="animate_subject_passive",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_animate_subject_passive_helm = LightevalTaskConfig(
- name="blimp:animate_subject_passive",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="animate_subject_passive",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_animate_subject_trans_lighteval = LightevalTaskConfig(
- name="blimp:animate_subject_trans",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="animate_subject_trans",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_animate_subject_trans_helm = LightevalTaskConfig(
- name="blimp:animate_subject_trans",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="animate_subject_trans",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_causative_lighteval = LightevalTaskConfig(
- name="blimp:causative",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="causative",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_causative_helm = LightevalTaskConfig(
- name="blimp:causative",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="causative",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_complex_NP_island_lighteval = LightevalTaskConfig(
- name="blimp:complex_NP_island",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="complex_NP_island",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_complex_NP_island_helm = LightevalTaskConfig(
- name="blimp:complex_NP_island",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="complex_NP_island",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_coordinate_structure_constraint_complex_left_branch_lighteval = LightevalTaskConfig(
- name="blimp:coordinate_structure_constraint_complex_left_branch",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="coordinate_structure_constraint_complex_left_branch",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_coordinate_structure_constraint_complex_left_branch_helm = LightevalTaskConfig(
- name="blimp:coordinate_structure_constraint_complex_left_branch",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="coordinate_structure_constraint_complex_left_branch",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_coordinate_structure_constraint_object_extraction_lighteval = LightevalTaskConfig(
- name="blimp:coordinate_structure_constraint_object_extraction",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="coordinate_structure_constraint_object_extraction",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_coordinate_structure_constraint_object_extraction_helm = LightevalTaskConfig(
- name="blimp:coordinate_structure_constraint_object_extraction",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="coordinate_structure_constraint_object_extraction",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_determiner_noun_agreement_1_lighteval = LightevalTaskConfig(
- name="blimp:determiner_noun_agreement_1",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="determiner_noun_agreement_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_determiner_noun_agreement_1_helm = LightevalTaskConfig(
- name="blimp:determiner_noun_agreement_1",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="determiner_noun_agreement_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_determiner_noun_agreement_2_lighteval = LightevalTaskConfig(
- name="blimp:determiner_noun_agreement_2",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="determiner_noun_agreement_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_determiner_noun_agreement_2_helm = LightevalTaskConfig(
- name="blimp:determiner_noun_agreement_2",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="determiner_noun_agreement_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_determiner_noun_agreement_irregular_1_lighteval = LightevalTaskConfig(
- name="blimp:determiner_noun_agreement_irregular_1",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="determiner_noun_agreement_irregular_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_determiner_noun_agreement_irregular_1_helm = LightevalTaskConfig(
- name="blimp:determiner_noun_agreement_irregular_1",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="determiner_noun_agreement_irregular_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_determiner_noun_agreement_irregular_2_lighteval = LightevalTaskConfig(
- name="blimp:determiner_noun_agreement_irregular_2",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="determiner_noun_agreement_irregular_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_determiner_noun_agreement_irregular_2_helm = LightevalTaskConfig(
- name="blimp:determiner_noun_agreement_irregular_2",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="determiner_noun_agreement_irregular_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_determiner_noun_agreement_with_adj_2_lighteval = LightevalTaskConfig(
- name="blimp:determiner_noun_agreement_with_adj_2",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="determiner_noun_agreement_with_adj_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_determiner_noun_agreement_with_adj_2_helm = LightevalTaskConfig(
- name="blimp:determiner_noun_agreement_with_adj_2",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="determiner_noun_agreement_with_adj_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_determiner_noun_agreement_with_adj_irregular_1_lighteval = LightevalTaskConfig(
- name="blimp:determiner_noun_agreement_with_adj_irregular_1",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="determiner_noun_agreement_with_adj_irregular_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_determiner_noun_agreement_with_adj_irregular_1_helm = LightevalTaskConfig(
- name="blimp:determiner_noun_agreement_with_adj_irregular_1",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="determiner_noun_agreement_with_adj_irregular_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_determiner_noun_agreement_with_adj_irregular_2_lighteval = LightevalTaskConfig(
- name="blimp:determiner_noun_agreement_with_adj_irregular_2",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="determiner_noun_agreement_with_adj_irregular_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_determiner_noun_agreement_with_adj_irregular_2_helm = LightevalTaskConfig(
- name="blimp:determiner_noun_agreement_with_adj_irregular_2",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="determiner_noun_agreement_with_adj_irregular_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_determiner_noun_agreement_with_adjective_1_lighteval = LightevalTaskConfig(
- name="blimp:determiner_noun_agreement_with_adjective_1",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="determiner_noun_agreement_with_adjective_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_determiner_noun_agreement_with_adjective_1_helm = LightevalTaskConfig(
- name="blimp:determiner_noun_agreement_with_adjective_1",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="determiner_noun_agreement_with_adjective_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_distractor_agreement_relational_noun_lighteval = LightevalTaskConfig(
- name="blimp:distractor_agreement_relational_noun",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="distractor_agreement_relational_noun",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_distractor_agreement_relational_noun_helm = LightevalTaskConfig(
- name="blimp:distractor_agreement_relational_noun",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="distractor_agreement_relational_noun",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_distractor_agreement_relative_clause_lighteval = LightevalTaskConfig(
- name="blimp:distractor_agreement_relative_clause",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="distractor_agreement_relative_clause",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_distractor_agreement_relative_clause_helm = LightevalTaskConfig(
- name="blimp:distractor_agreement_relative_clause",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="distractor_agreement_relative_clause",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_drop_argument_lighteval = LightevalTaskConfig(
- name="blimp:drop_argument",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="drop_argument",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_drop_argument_helm = LightevalTaskConfig(
- name="blimp:drop_argument",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="drop_argument",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_ellipsis_n_bar_1_lighteval = LightevalTaskConfig(
- name="blimp:ellipsis_n_bar_1",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="ellipsis_n_bar_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_ellipsis_n_bar_1_helm = LightevalTaskConfig(
- name="blimp:ellipsis_n_bar_1",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="ellipsis_n_bar_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_ellipsis_n_bar_2_lighteval = LightevalTaskConfig(
- name="blimp:ellipsis_n_bar_2",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="ellipsis_n_bar_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_ellipsis_n_bar_2_helm = LightevalTaskConfig(
- name="blimp:ellipsis_n_bar_2",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="ellipsis_n_bar_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_existential_there_object_raising_lighteval = LightevalTaskConfig(
- name="blimp:existential_there_object_raising",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="existential_there_object_raising",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_existential_there_object_raising_helm = LightevalTaskConfig(
- name="blimp:existential_there_object_raising",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="existential_there_object_raising",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_existential_there_quantifiers_1_lighteval = LightevalTaskConfig(
- name="blimp:existential_there_quantifiers_1",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="existential_there_quantifiers_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_existential_there_quantifiers_1_helm = LightevalTaskConfig(
- name="blimp:existential_there_quantifiers_1",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="existential_there_quantifiers_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_existential_there_quantifiers_2_lighteval = LightevalTaskConfig(
- name="blimp:existential_there_quantifiers_2",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="existential_there_quantifiers_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_existential_there_quantifiers_2_helm = LightevalTaskConfig(
- name="blimp:existential_there_quantifiers_2",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="existential_there_quantifiers_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_existential_there_subject_raising_lighteval = LightevalTaskConfig(
- name="blimp:existential_there_subject_raising",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="existential_there_subject_raising",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_existential_there_subject_raising_helm = LightevalTaskConfig(
- name="blimp:existential_there_subject_raising",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="existential_there_subject_raising",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_expletive_it_object_raising_lighteval = LightevalTaskConfig(
- name="blimp:expletive_it_object_raising",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="expletive_it_object_raising",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_expletive_it_object_raising_helm = LightevalTaskConfig(
- name="blimp:expletive_it_object_raising",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="expletive_it_object_raising",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_inchoative_lighteval = LightevalTaskConfig(
- name="blimp:inchoative",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="inchoative",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_inchoative_helm = LightevalTaskConfig(
- name="blimp:inchoative",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="inchoative",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_intransitive_lighteval = LightevalTaskConfig(
- name="blimp:intransitive",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="intransitive",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_intransitive_helm = LightevalTaskConfig(
- name="blimp:intransitive",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="intransitive",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_irregular_past_participle_adjectives_lighteval = LightevalTaskConfig(
- name="blimp:irregular_past_participle_adjectives",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="irregular_past_participle_adjectives",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_irregular_past_participle_adjectives_helm = LightevalTaskConfig(
- name="blimp:irregular_past_participle_adjectives",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="irregular_past_participle_adjectives",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_irregular_past_participle_verbs_lighteval = LightevalTaskConfig(
- name="blimp:irregular_past_participle_verbs",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="irregular_past_participle_verbs",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_irregular_past_participle_verbs_helm = LightevalTaskConfig(
- name="blimp:irregular_past_participle_verbs",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="irregular_past_participle_verbs",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_irregular_plural_subject_verb_agreement_1_lighteval = LightevalTaskConfig(
- name="blimp:irregular_plural_subject_verb_agreement_1",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="irregular_plural_subject_verb_agreement_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_irregular_plural_subject_verb_agreement_1_helm = LightevalTaskConfig(
- name="blimp:irregular_plural_subject_verb_agreement_1",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="irregular_plural_subject_verb_agreement_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_irregular_plural_subject_verb_agreement_2_lighteval = LightevalTaskConfig(
- name="blimp:irregular_plural_subject_verb_agreement_2",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="irregular_plural_subject_verb_agreement_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_irregular_plural_subject_verb_agreement_2_helm = LightevalTaskConfig(
- name="blimp:irregular_plural_subject_verb_agreement_2",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="irregular_plural_subject_verb_agreement_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_left_branch_island_echo_question_lighteval = LightevalTaskConfig(
- name="blimp:left_branch_island_echo_question",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="left_branch_island_echo_question",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_left_branch_island_echo_question_helm = LightevalTaskConfig(
- name="blimp:left_branch_island_echo_question",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="left_branch_island_echo_question",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_left_branch_island_simple_question_lighteval = LightevalTaskConfig(
- name="blimp:left_branch_island_simple_question",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="left_branch_island_simple_question",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_left_branch_island_simple_question_helm = LightevalTaskConfig(
- name="blimp:left_branch_island_simple_question",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="left_branch_island_simple_question",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_matrix_question_npi_licensor_present_lighteval = LightevalTaskConfig(
- name="blimp:matrix_question_npi_licensor_present",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="matrix_question_npi_licensor_present",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_matrix_question_npi_licensor_present_helm = LightevalTaskConfig(
- name="blimp:matrix_question_npi_licensor_present",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="matrix_question_npi_licensor_present",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_npi_present_1_lighteval = LightevalTaskConfig(
- name="blimp:npi_present_1",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="npi_present_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_npi_present_1_helm = LightevalTaskConfig(
- name="blimp:npi_present_1",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="npi_present_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_npi_present_2_lighteval = LightevalTaskConfig(
- name="blimp:npi_present_2",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="npi_present_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_npi_present_2_helm = LightevalTaskConfig(
- name="blimp:npi_present_2",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="npi_present_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_only_npi_licensor_present_lighteval = LightevalTaskConfig(
- name="blimp:only_npi_licensor_present",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="only_npi_licensor_present",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_only_npi_licensor_present_helm = LightevalTaskConfig(
- name="blimp:only_npi_licensor_present",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="only_npi_licensor_present",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_only_npi_scope_lighteval = LightevalTaskConfig(
- name="blimp:only_npi_scope",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="only_npi_scope",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_only_npi_scope_helm = LightevalTaskConfig(
- name="blimp:only_npi_scope",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="only_npi_scope",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_passive_1_lighteval = LightevalTaskConfig(
- name="blimp:passive_1",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="passive_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_passive_1_helm = LightevalTaskConfig(
- name="blimp:passive_1",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="passive_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_passive_2_lighteval = LightevalTaskConfig(
- name="blimp:passive_2",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="passive_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_passive_2_helm = LightevalTaskConfig(
- name="blimp:passive_2",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="passive_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_principle_A_c_command_lighteval = LightevalTaskConfig(
- name="blimp:principle_A_c_command",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="principle_A_c_command",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_principle_A_c_command_helm = LightevalTaskConfig(
- name="blimp:principle_A_c_command",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="principle_A_c_command",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_principle_A_case_1_lighteval = LightevalTaskConfig(
- name="blimp:principle_A_case_1",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="principle_A_case_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_principle_A_case_1_helm = LightevalTaskConfig(
- name="blimp:principle_A_case_1",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="principle_A_case_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_principle_A_case_2_lighteval = LightevalTaskConfig(
- name="blimp:principle_A_case_2",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="principle_A_case_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_principle_A_case_2_helm = LightevalTaskConfig(
- name="blimp:principle_A_case_2",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="principle_A_case_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_principle_A_domain_1_lighteval = LightevalTaskConfig(
- name="blimp:principle_A_domain_1",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="principle_A_domain_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_principle_A_domain_1_helm = LightevalTaskConfig(
- name="blimp:principle_A_domain_1",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="principle_A_domain_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_principle_A_domain_2_lighteval = LightevalTaskConfig(
- name="blimp:principle_A_domain_2",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="principle_A_domain_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_principle_A_domain_2_helm = LightevalTaskConfig(
- name="blimp:principle_A_domain_2",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="principle_A_domain_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_principle_A_domain_3_lighteval = LightevalTaskConfig(
- name="blimp:principle_A_domain_3",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="principle_A_domain_3",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_principle_A_domain_3_helm = LightevalTaskConfig(
- name="blimp:principle_A_domain_3",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="principle_A_domain_3",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_principle_A_reconstruction_lighteval = LightevalTaskConfig(
- name="blimp:principle_A_reconstruction",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="principle_A_reconstruction",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_principle_A_reconstruction_helm = LightevalTaskConfig(
- name="blimp:principle_A_reconstruction",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="principle_A_reconstruction",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_regular_plural_subject_verb_agreement_1_lighteval = LightevalTaskConfig(
- name="blimp:regular_plural_subject_verb_agreement_1",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="regular_plural_subject_verb_agreement_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_regular_plural_subject_verb_agreement_1_helm = LightevalTaskConfig(
- name="blimp:regular_plural_subject_verb_agreement_1",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="regular_plural_subject_verb_agreement_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_regular_plural_subject_verb_agreement_2_lighteval = LightevalTaskConfig(
- name="blimp:regular_plural_subject_verb_agreement_2",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="regular_plural_subject_verb_agreement_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_regular_plural_subject_verb_agreement_2_helm = LightevalTaskConfig(
- name="blimp:regular_plural_subject_verb_agreement_2",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="regular_plural_subject_verb_agreement_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_sentential_negation_npi_licensor_present_lighteval = LightevalTaskConfig(
- name="blimp:sentential_negation_npi_licensor_present",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="sentential_negation_npi_licensor_present",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_sentential_negation_npi_licensor_present_helm = LightevalTaskConfig(
- name="blimp:sentential_negation_npi_licensor_present",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="sentential_negation_npi_licensor_present",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_sentential_negation_npi_scope_lighteval = LightevalTaskConfig(
- name="blimp:sentential_negation_npi_scope",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="sentential_negation_npi_scope",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_sentential_negation_npi_scope_helm = LightevalTaskConfig(
- name="blimp:sentential_negation_npi_scope",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="sentential_negation_npi_scope",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_sentential_subject_island_lighteval = LightevalTaskConfig(
- name="blimp:sentential_subject_island",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="sentential_subject_island",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_sentential_subject_island_helm = LightevalTaskConfig(
- name="blimp:sentential_subject_island",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="sentential_subject_island",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_superlative_quantifiers_1_lighteval = LightevalTaskConfig(
- name="blimp:superlative_quantifiers_1",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="superlative_quantifiers_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_superlative_quantifiers_1_helm = LightevalTaskConfig(
- name="blimp:superlative_quantifiers_1",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="superlative_quantifiers_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_superlative_quantifiers_2_lighteval = LightevalTaskConfig(
- name="blimp:superlative_quantifiers_2",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="superlative_quantifiers_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_superlative_quantifiers_2_helm = LightevalTaskConfig(
- name="blimp:superlative_quantifiers_2",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="superlative_quantifiers_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_tough_vs_raising_1_lighteval = LightevalTaskConfig(
- name="blimp:tough_vs_raising_1",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="tough_vs_raising_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_tough_vs_raising_1_helm = LightevalTaskConfig(
- name="blimp:tough_vs_raising_1",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="tough_vs_raising_1",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_tough_vs_raising_2_lighteval = LightevalTaskConfig(
- name="blimp:tough_vs_raising_2",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="tough_vs_raising_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_tough_vs_raising_2_helm = LightevalTaskConfig(
- name="blimp:tough_vs_raising_2",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="tough_vs_raising_2",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_transitive_lighteval = LightevalTaskConfig(
- name="blimp:transitive",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="transitive",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_transitive_helm = LightevalTaskConfig(
- name="blimp:transitive",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="transitive",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_wh_island_lighteval = LightevalTaskConfig(
- name="blimp:wh_island",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="wh_island",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_wh_island_helm = LightevalTaskConfig(
- name="blimp:wh_island",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="wh_island",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_wh_questions_object_gap_lighteval = LightevalTaskConfig(
- name="blimp:wh_questions_object_gap",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="wh_questions_object_gap",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_wh_questions_object_gap_helm = LightevalTaskConfig(
- name="blimp:wh_questions_object_gap",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="wh_questions_object_gap",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_wh_questions_subject_gap_lighteval = LightevalTaskConfig(
- name="blimp:wh_questions_subject_gap",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="wh_questions_subject_gap",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_wh_questions_subject_gap_helm = LightevalTaskConfig(
- name="blimp:wh_questions_subject_gap",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="wh_questions_subject_gap",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_wh_questions_subject_gap_long_distance_lighteval = LightevalTaskConfig(
- name="blimp:wh_questions_subject_gap_long_distance",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="wh_questions_subject_gap_long_distance",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_wh_questions_subject_gap_long_distance_helm = LightevalTaskConfig(
- name="blimp:wh_questions_subject_gap_long_distance",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="wh_questions_subject_gap_long_distance",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_wh_vs_that_no_gap_lighteval = LightevalTaskConfig(
- name="blimp:wh_vs_that_no_gap",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="wh_vs_that_no_gap",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_wh_vs_that_no_gap_helm = LightevalTaskConfig(
- name="blimp:wh_vs_that_no_gap",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="wh_vs_that_no_gap",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_wh_vs_that_no_gap_long_distance_lighteval = LightevalTaskConfig(
- name="blimp:wh_vs_that_no_gap_long_distance",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="wh_vs_that_no_gap_long_distance",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_wh_vs_that_no_gap_long_distance_helm = LightevalTaskConfig(
- name="blimp:wh_vs_that_no_gap_long_distance",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="wh_vs_that_no_gap_long_distance",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_wh_vs_that_with_gap_lighteval = LightevalTaskConfig(
- name="blimp:wh_vs_that_with_gap",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="wh_vs_that_with_gap",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_wh_vs_that_with_gap_helm = LightevalTaskConfig(
- name="blimp:wh_vs_that_with_gap",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="wh_vs_that_with_gap",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_wh_vs_that_with_gap_long_distance_lighteval = LightevalTaskConfig(
- name="blimp:wh_vs_that_with_gap_long_distance",
- suite=["lighteval", "blimp"],
- prompt_function=prompt.blimp,
- hf_repo="blimp",
- hf_subset="wh_vs_that_with_gap_long_distance",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-blimp_wh_vs_that_with_gap_long_distance_helm = LightevalTaskConfig(
- name="blimp:wh_vs_that_with_gap_long_distance",
- suite=["helm", "blimp"],
- prompt_function=prompt.blimp_helm,
- hf_repo="blimp",
- hf_subset="wh_vs_that_with_gap_long_distance",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bold_helm = LightevalTaskConfig(
- name="bold",
- suite=["helm"],
- prompt_function=prompt.bold,
- hf_repo="lighteval/bold_helm",
- hf_subset="all",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.prediction_perplexity],
- stop_sequence=["\n"],
- version=0,
-)
-bold_gender_helm = LightevalTaskConfig(
- name="bold:gender",
- suite=["helm"],
- prompt_function=prompt.bold,
- hf_repo="lighteval/bold_helm",
- hf_subset="gender",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.prediction_perplexity],
- stop_sequence=["\n"],
- version=0,
-)
-bold_political_ideology_helm = LightevalTaskConfig(
- name="bold:political_ideology",
- suite=["helm"],
- prompt_function=prompt.bold,
- hf_repo="lighteval/bold_helm",
- hf_subset="political_ideology",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.prediction_perplexity],
- stop_sequence=["\n"],
- version=0,
-)
-bold_profession_helm = LightevalTaskConfig(
- name="bold:profession",
- suite=["helm"],
- prompt_function=prompt.bold,
- hf_repo="lighteval/bold_helm",
- hf_subset="profession",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.prediction_perplexity],
- stop_sequence=["\n"],
- version=0,
-)
-bold_race_helm = LightevalTaskConfig(
- name="bold:race",
- suite=["helm"],
- prompt_function=prompt.bold,
- hf_repo="lighteval/bold_helm",
- hf_subset="race",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.prediction_perplexity],
- stop_sequence=["\n"],
- version=0,
-)
-bold_religious_ideology_helm = LightevalTaskConfig(
- name="bold:religious_ideology",
- suite=["helm"],
- prompt_function=prompt.bold,
- hf_repo="lighteval/bold_helm",
- hf_subset="religious_ideology",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.prediction_perplexity],
- stop_sequence=["\n"],
- version=0,
-)
-boolq_helm = LightevalTaskConfig(
- name="boolq",
- suite=["helm", "helm_general"],
- prompt_function=prompt.boolq_helm,
- hf_repo="lighteval/boolq_helm",
- hf_subset="default",
- hf_avail_splits=["train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-boolq_contrastset_helm = LightevalTaskConfig(
- name="boolq:contrastset",
- suite=["helm"],
- prompt_function=prompt.boolq_helm_contrastset,
- hf_repo="lighteval/boolq_helm",
- hf_subset="default",
- hf_avail_splits=["validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-bridging_anaphora_resolution_barqa_bigbench = LightevalTaskConfig(
- name="bridging_anaphora_resolution_barqa",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="bridging_anaphora_resolution_barqa",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-causal_judgment_bigbench = LightevalTaskConfig(
- name="causal_judgment",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="causal_judgment",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-cause_and_effect_bigbench = LightevalTaskConfig(
- name="cause_and_effect",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="cause_and_effect",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-checkmate_in_one_bigbench = LightevalTaskConfig(
- name="checkmate_in_one",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="checkmate_in_one",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-chess_state_tracking_bigbench = LightevalTaskConfig(
- name="chess_state_tracking",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="chess_state_tracking",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-chinese_remainder_theorem_bigbench = LightevalTaskConfig(
- name="chinese_remainder_theorem",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="chinese_remainder_theorem",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-cifar10_classification_bigbench = LightevalTaskConfig(
- name="cifar10_classification",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="cifar10_classification",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-civil_comments_helm = LightevalTaskConfig(
- name="civil_comments",
- suite=["helm", "helm_general"],
- prompt_function=prompt.civil_comments,
- hf_repo="lighteval/civil_comments_helm",
- hf_subset="all",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-civil_comments_LGBTQ_helm = LightevalTaskConfig(
- name="civil_comments:LGBTQ",
- suite=["helm"],
- prompt_function=prompt.civil_comments,
- hf_repo="lighteval/civil_comments_helm",
- hf_subset="LGBTQ",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-civil_comments_black_helm = LightevalTaskConfig(
- name="civil_comments:black",
- suite=["helm"],
- prompt_function=prompt.civil_comments,
- hf_repo="lighteval/civil_comments_helm",
- hf_subset="black",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-civil_comments_christian_helm = LightevalTaskConfig(
- name="civil_comments:christian",
- suite=["helm"],
- prompt_function=prompt.civil_comments,
- hf_repo="lighteval/civil_comments_helm",
- hf_subset="christian",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-civil_comments_female_helm = LightevalTaskConfig(
- name="civil_comments:female",
- suite=["helm"],
- prompt_function=prompt.civil_comments,
- hf_repo="lighteval/civil_comments_helm",
- hf_subset="female",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-civil_comments_male_helm = LightevalTaskConfig(
- name="civil_comments:male",
- suite=["helm"],
- prompt_function=prompt.civil_comments,
- hf_repo="lighteval/civil_comments_helm",
- hf_subset="male",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-civil_comments_muslim_helm = LightevalTaskConfig(
- name="civil_comments:muslim",
- suite=["helm"],
- prompt_function=prompt.civil_comments,
- hf_repo="lighteval/civil_comments_helm",
- hf_subset="muslim",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-civil_comments_other_religions_helm = LightevalTaskConfig(
- name="civil_comments:other_religions",
- suite=["helm"],
- prompt_function=prompt.civil_comments,
- hf_repo="lighteval/civil_comments_helm",
- hf_subset="other_religions",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-civil_comments_white_helm = LightevalTaskConfig(
- name="civil_comments:white",
- suite=["helm"],
- prompt_function=prompt.civil_comments,
- hf_repo="lighteval/civil_comments_helm",
- hf_subset="white",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-code_line_description_bigbench_lite = LightevalTaskConfig(
- name="code_line_description",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_linefeed_before_and_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="code_line_description",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-codenames_bigbench = LightevalTaskConfig(
- name="codenames",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="codenames",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.rouge_t5, Metrics.bleu],
- stop_sequence=["\n"],
- version=0,
-)
-color_bigbench = LightevalTaskConfig(
- name="color",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="color",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.rouge_t5,
- Metrics.bleu,
- Metrics.loglikelihood_acc,
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-common_morpheme_bigbench = LightevalTaskConfig(
- name="common_morpheme",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="common_morpheme",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-commonsenseqa_helm = LightevalTaskConfig(
- name="commonsenseqa",
- suite=["helm", "commonsense_scenario"],
- prompt_function=prompt.commonsense_qa,
- hf_repo="commonsense_qa",
- hf_subset="default",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-conceptual_combinations_bigbench_lite = LightevalTaskConfig(
- name="conceptual_combinations",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="conceptual_combinations",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-conlang_translation_bigbench_lite = LightevalTaskConfig(
- name="conlang_translation",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="conlang_translation",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=[".", ";", "!", "?"],
- version=0,
-)
-contextual_parametric_knowledge_conflicts_bigbench = LightevalTaskConfig(
- name="contextual_parametric_knowledge_conflicts",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="contextual_parametric_knowledge_conflicts",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.rouge_t5, Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-copyright_n_books_1000_extractions_per_book_1_prefix_length_125_helm = LightevalTaskConfig(
- name="copyright:n_books_1000-extractions_per_book_1-prefix_length_125",
- suite=["helm", "copyright_scenario"],
- prompt_function=prompt.copyright,
- hf_repo="lighteval/copyright_helm",
- hf_subset="n_books_1000-extractions_per_book_1-prefix_length_125",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.copyright],
- stop_sequence=["\n"],
- version=0,
-)
-copyright_n_books_1000_extractions_per_book_1_prefix_length_25_helm = LightevalTaskConfig(
- name="copyright:n_books_1000-extractions_per_book_1-prefix_length_25",
- suite=["helm", "copyright_scenario"],
- prompt_function=prompt.copyright,
- hf_repo="lighteval/copyright_helm",
- hf_subset="n_books_1000-extractions_per_book_1-prefix_length_25",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.copyright],
- stop_sequence=["\n"],
- version=0,
-)
-copyright_n_books_1000_extractions_per_book_1_prefix_length_5_helm = LightevalTaskConfig(
- name="copyright:n_books_1000-extractions_per_book_1-prefix_length_5",
- suite=["helm", "copyright_scenario"],
- prompt_function=prompt.copyright,
- hf_repo="lighteval/copyright_helm",
- hf_subset="n_books_1000-extractions_per_book_1-prefix_length_5",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.copyright],
- stop_sequence=["\n"],
- version=0,
-)
-copyright_n_books_1000_extractions_per_book_3_prefix_length_125_helm = LightevalTaskConfig(
- name="copyright:n_books_1000-extractions_per_book_3-prefix_length_125",
- suite=["helm", "copyright_scenario"],
- prompt_function=prompt.copyright,
- hf_repo="lighteval/copyright_helm",
- hf_subset="n_books_1000-extractions_per_book_3-prefix_length_125",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.copyright],
- stop_sequence=["\n"],
- version=0,
-)
-copyright_n_books_1000_extractions_per_book_3_prefix_length_25_helm = LightevalTaskConfig(
- name="copyright:n_books_1000-extractions_per_book_3-prefix_length_25",
- suite=["helm", "copyright_scenario"],
- prompt_function=prompt.copyright,
- hf_repo="lighteval/copyright_helm",
- hf_subset="n_books_1000-extractions_per_book_3-prefix_length_25",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.copyright],
- stop_sequence=["\n"],
- version=0,
-)
-copyright_n_books_1000_extractions_per_book_3_prefix_length_5_helm = LightevalTaskConfig(
- name="copyright:n_books_1000-extractions_per_book_3-prefix_length_5",
- suite=["helm", "copyright_scenario"],
- prompt_function=prompt.copyright,
- hf_repo="lighteval/copyright_helm",
- hf_subset="n_books_1000-extractions_per_book_3-prefix_length_5",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.copyright],
- stop_sequence=["\n"],
- version=0,
-)
-copyright_oh_the_places_helm = LightevalTaskConfig(
- name="copyright:oh_the_places",
- suite=["helm", "copyright_scenario"],
- prompt_function=prompt.copyright,
- hf_repo="lighteval/copyright_helm",
- hf_subset="oh_the_places",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.copyright],
- stop_sequence=["\n"],
- version=0,
-)
-copyright_pilot_helm = LightevalTaskConfig(
- name="copyright:pilot",
- suite=["helm", "copyright_scenario"],
- prompt_function=prompt.copyright,
- hf_repo="lighteval/copyright_helm",
- hf_subset="pilot",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.copyright],
- stop_sequence=["\n"],
- version=0,
-)
-copyright_popular_books_prefix_length_10_helm = LightevalTaskConfig(
- name="copyright:popular_books-prefix_length_10",
- suite=["helm", "copyright_scenario"],
- prompt_function=prompt.copyright,
- hf_repo="lighteval/copyright_helm",
- hf_subset="popular_books-prefix_length_10",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.copyright],
- stop_sequence=["\n"],
- version=0,
-)
-copyright_popular_books_prefix_length_125_helm = LightevalTaskConfig(
- name="copyright:popular_books-prefix_length_125",
- suite=["helm", "copyright_scenario"],
- prompt_function=prompt.copyright,
- hf_repo="lighteval/copyright_helm",
- hf_subset="popular_books-prefix_length_125",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.copyright],
- stop_sequence=["\n"],
- version=0,
-)
-copyright_popular_books_prefix_length_25_helm = LightevalTaskConfig(
- name="copyright:popular_books-prefix_length_25",
- suite=["helm", "copyright_scenario"],
- prompt_function=prompt.copyright,
- hf_repo="lighteval/copyright_helm",
- hf_subset="popular_books-prefix_length_25",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.copyright],
- stop_sequence=["\n"],
- version=0,
-)
-copyright_popular_books_prefix_length_250_helm = LightevalTaskConfig(
- name="copyright:popular_books-prefix_length_250",
- suite=["helm", "copyright_scenario"],
- prompt_function=prompt.copyright,
- hf_repo="lighteval/copyright_helm",
- hf_subset="popular_books-prefix_length_250",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.copyright],
- stop_sequence=["\n"],
- version=0,
-)
-copyright_popular_books_prefix_length_5_helm = LightevalTaskConfig(
- name="copyright:popular_books-prefix_length_5",
- suite=["helm", "copyright_scenario"],
- prompt_function=prompt.copyright,
- hf_repo="lighteval/copyright_helm",
- hf_subset="popular_books-prefix_length_5",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.copyright],
- stop_sequence=["\n"],
- version=0,
-)
-copyright_popular_books_prefix_length_50_helm = LightevalTaskConfig(
- name="copyright:popular_books-prefix_length_50",
- suite=["helm", "copyright_scenario"],
- prompt_function=prompt.copyright,
- hf_repo="lighteval/copyright_helm",
- hf_subset="popular_books-prefix_length_50",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.copyright],
- stop_sequence=["\n"],
- version=0,
-)
-copyright_prompt_num_line_1_min_lines_20_helm = LightevalTaskConfig(
- name="copyright:prompt_num_line_1-min_lines_20",
- suite=["helm", "copyright_scenario"],
- prompt_function=prompt.copyright,
- hf_repo="lighteval/copyright_helm",
- hf_subset="prompt_num_line_1-min_lines_20",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.copyright],
- stop_sequence=["\n"],
- version=0,
-)
-copyright_prompt_num_line_10_min_lines_20_helm = LightevalTaskConfig(
- name="copyright:prompt_num_line_10-min_lines_20",
- suite=["helm", "copyright_scenario"],
- prompt_function=prompt.copyright,
- hf_repo="lighteval/copyright_helm",
- hf_subset="prompt_num_line_10-min_lines_20",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.copyright],
- stop_sequence=["\n"],
- version=0,
-)
-copyright_prompt_num_line_5_min_lines_20_helm = LightevalTaskConfig(
- name="copyright:prompt_num_line_5-min_lines_20",
- suite=["helm", "copyright_scenario"],
- prompt_function=prompt.copyright,
- hf_repo="lighteval/copyright_helm",
- hf_subset="prompt_num_line_5-min_lines_20",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.copyright],
- stop_sequence=["\n"],
- version=0,
-)
-coqa_first_question = LightevalTaskConfig(
- name="coqa",
- prompt_function=get_qa_prompt_function(
- Language.ENGLISH,
- lambda line: {
- "question": line["questions"][0],
- "context": line["story"],
- "choices": [line["answers"]["input_text"][0]],
- },
- ),
- suite=["lighteval"],
- hf_repo="stanfordnlp/coqa",
- hf_subset="default",
- hf_avail_splits=["train", "validation"],
- evaluation_splits=["validation"],
- stop_sequence=["\n", "Question:", "question:"],
- generation_size=100,
- version=1,
- metrics=(
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ),
-)
-coqa_bb_lighteval = LightevalTaskConfig(
- name="coqa_bb",
- suite=["lighteval", "bigbench_programmatic", "bigbench"],
- prompt_function=prompt.coqa,
- hf_repo="coqa",
- hf_subset="default",
- hf_avail_splits=["train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=10,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False}), Metrics.f1_score],
- stop_sequence=["\n"],
- version=0,
-)
-covid_dialogue_helm = LightevalTaskConfig(
- name="covid_dialogue",
- suite=["helm"],
- prompt_function=prompt.covid_dialogue,
- hf_repo="lighteval/covid_dialogue",
- hf_subset="default",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=128,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.rougeL,
- Metrics.bleu_1,
- Metrics.bleu_4,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-crash_blossom_bigbench = LightevalTaskConfig(
- name="crash_blossom",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="crash_blossom",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-crass_ai_bigbench = LightevalTaskConfig(
- name="crass_ai",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="crass_ai",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-cryobiology_spanish_bigbench = LightevalTaskConfig(
- name="cryobiology_spanish",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="cryobiology_spanish",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-cryptonite_bigbench = LightevalTaskConfig(
- name="cryptonite",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="cryptonite",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-cs_algorithms_bigbench = LightevalTaskConfig(
- name="cs_algorithms",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="cs_algorithms",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-dark_humor_detection_bigbench = LightevalTaskConfig(
- name="dark_humor_detection",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="dark_humor_detection",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-date_understanding_bigbench = LightevalTaskConfig(
- name="date_understanding",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="date_understanding",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-disambiguation_qa_bigbench = LightevalTaskConfig(
- name="disambiguation_qa",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="disambiguation_qa",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-discourse_marker_prediction_bigbench = LightevalTaskConfig(
- name="discourse_marker_prediction",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="discourse_marker_prediction",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-disfl_qa_bigbench = LightevalTaskConfig(
- name="disfl_qa",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="disfl_qa",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-drop_qa = LightevalTaskConfig(
- name="drop",
- prompt_function=get_qa_prompt_function(
- Language.ENGLISH,
- lambda line: {
- "context": line["passage"],
- "question": line["question"],
- "choices": list(
- filter(
- lambda x: x,
- [line["answer"].get("number")]
- + line["answer"]["spans"]
- + [prompt.get_drop_date(line["answer"].get("date"))],
- )
- ),
- },
- ),
- suite=("lighteval",),
- hf_repo="lighteval/drop_harness",
- hf_subset="default",
- hf_filter=lambda line: list(
- filter(
- lambda x: x,
- [line["answer"].get("number")]
- + line["answer"]["spans"]
- + [prompt.get_drop_date(line["answer"].get("date"))],
- )
- ),
- evaluation_splits=("validation",),
- few_shots_split="train",
- generation_size=250,
- stop_sequence=["Question:", "question:", "\n"],
- metrics=(
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ),
- version=1,
-)
-dyck_language_2_helm = LightevalTaskConfig(
- name="dyck_language:2",
- suite=["helm"],
- prompt_function=prompt.dyck_language,
- hf_repo="lighteval/DyckLanguage",
- hf_subset="2",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[Metrics.exact_match],
- stop_sequence=["\n"],
- version=0,
-)
-dyck_language_3_helm = LightevalTaskConfig(
- name="dyck_language:3",
- suite=["helm"],
- prompt_function=prompt.dyck_language,
- hf_repo="lighteval/DyckLanguage",
- hf_subset="3",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[Metrics.exact_match],
- stop_sequence=["\n"],
- version=0,
-)
-dyck_language_4_helm = LightevalTaskConfig(
- name="dyck_language:4",
- suite=["helm"],
- prompt_function=prompt.dyck_language,
- hf_repo="lighteval/DyckLanguage",
- hf_subset="4",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[Metrics.exact_match],
- stop_sequence=["\n"],
- version=0,
-)
-dyck_languages_bigbench = LightevalTaskConfig(
- name="dyck_languages",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="dyck_languages",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-elementary_math_qa_bigbench = LightevalTaskConfig(
- name="elementary_math_qa",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="elementary_math_qa",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-emoji_movie_bigbench_lite = LightevalTaskConfig(
- name="emoji_movie",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="emoji_movie",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.rouge_t5,
- Metrics.bleu,
- Metrics.loglikelihood_acc,
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-emojis_emotion_prediction_bigbench = LightevalTaskConfig(
- name="emojis_emotion_prediction",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="emojis_emotion_prediction",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-empirical_judgments_bigbench = LightevalTaskConfig(
- name="empirical_judgments",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="empirical_judgments",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-english_proverbs_bigbench = LightevalTaskConfig(
- name="english_proverbs",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="english_proverbs",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-english_russian_proverbs_bigbench = LightevalTaskConfig(
- name="english_russian_proverbs",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="english_russian_proverbs",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-entailed_polarity_bigbench = LightevalTaskConfig(
- name="entailed_polarity",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="entailed_polarity",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-entailed_polarity_hindi_bigbench = LightevalTaskConfig(
- name="entailed_polarity_hindi",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="entailed_polarity_hindi",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-entity_data_imputation_Buy_helm = LightevalTaskConfig(
- name="entity_data_imputation:Buy",
- suite=["helm"],
- prompt_function=prompt.entity_data_imputation,
- hf_repo="lighteval/Buy",
- hf_subset="default",
- hf_avail_splits=["train", "test", "valid"],
- evaluation_splits=["valid", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-entity_data_imputation_Restaurant_helm = LightevalTaskConfig(
- name="entity_data_imputation:Restaurant",
- suite=["helm"],
- prompt_function=prompt.entity_data_imputation,
- hf_repo="lighteval/Restaurant",
- hf_subset="default",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-entity_matching_Abt_Buy_helm = LightevalTaskConfig(
- name="entity_matching:Abt_Buy",
- suite=["helm"],
- prompt_function=prompt.entity_matching,
- hf_repo="lighteval/EntityMatching",
- hf_subset="Abt_Buy",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-entity_matching_Amazon_Google_helm = LightevalTaskConfig(
- name="entity_matching:Amazon_Google",
- suite=["helm"],
- prompt_function=prompt.entity_matching,
- hf_repo="lighteval/EntityMatching",
- hf_subset="Amazon_Google",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-entity_matching_Beer_helm = LightevalTaskConfig(
- name="entity_matching:Beer",
- suite=["helm"],
- prompt_function=prompt.entity_matching,
- hf_repo="lighteval/EntityMatching",
- hf_subset="Beer",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-entity_matching_Company_helm = LightevalTaskConfig(
- name="entity_matching:Company",
- suite=["helm"],
- prompt_function=prompt.entity_matching,
- hf_repo="lighteval/EntityMatching",
- hf_subset="Company",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-entity_matching_DBLP_ACM_helm = LightevalTaskConfig(
- name="entity_matching:DBLP_ACM",
- suite=["helm"],
- prompt_function=prompt.entity_matching,
- hf_repo="lighteval/EntityMatching",
- hf_subset="DBLP_ACM",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-entity_matching_DBLP_GoogleScholar_helm = LightevalTaskConfig(
- name="entity_matching:DBLP_GoogleScholar",
- suite=["helm"],
- prompt_function=prompt.entity_matching,
- hf_repo="lighteval/EntityMatching",
- hf_subset="DBLP_GoogleScholar",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-entity_matching_Dirty_DBLP_ACM_helm = LightevalTaskConfig(
- name="entity_matching:Dirty_DBLP_ACM",
- suite=["helm"],
- prompt_function=prompt.entity_matching,
- hf_repo="lighteval/EntityMatching",
- hf_subset="Dirty_DBLP_ACM",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-entity_matching_Dirty_DBLP_GoogleScholar_helm = LightevalTaskConfig(
- name="entity_matching:Dirty_DBLP_GoogleScholar",
- suite=["helm"],
- prompt_function=prompt.entity_matching,
- hf_repo="lighteval/EntityMatching",
- hf_subset="Dirty_DBLP_GoogleScholar",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-entity_matching_Dirty_Walmart_Amazon_helm = LightevalTaskConfig(
- name="entity_matching:Dirty_Walmart_Amazon",
- suite=["helm"],
- prompt_function=prompt.entity_matching,
- hf_repo="lighteval/EntityMatching",
- hf_subset="Dirty_Walmart_Amazon",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-entity_matching_Dirty_iTunes_Amazon_helm = LightevalTaskConfig(
- name="entity_matching:Dirty_iTunes_Amazon",
- suite=["helm"],
- prompt_function=prompt.entity_matching,
- hf_repo="lighteval/EntityMatching",
- hf_subset="Dirty_iTunes_Amazon",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-entity_matching_Fodors_Zagats_helm = LightevalTaskConfig(
- name="entity_matching=Fodors_Zagats",
- suite=["helm"],
- prompt_function=prompt.entity_matching,
- hf_repo="lighteval/EntityMatching",
- hf_subset="Fodors_Zagats",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-entity_matching_Walmart_Amazon_helm = LightevalTaskConfig(
- name="entity_matching:Walmart_Amazon",
- suite=["helm"],
- prompt_function=prompt.entity_matching,
- hf_repo="lighteval/EntityMatching",
- hf_subset="Walmart_Amazon",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-entity_matching_iTunes_Amazon_helm = LightevalTaskConfig(
- name="entity_matching:iTunes_Amazon",
- suite=["helm"],
- prompt_function=prompt.entity_matching,
- hf_repo="lighteval/EntityMatching",
- hf_subset="iTunes_Amazon",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-epistemic_reasoning_bigbench = LightevalTaskConfig(
- name="epistemic_reasoning",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="epistemic_reasoning",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-ethics_commonsense_lighteval = LightevalTaskConfig(
- name="ethics:commonsense",
- suite=["lighteval", "ethics"],
- prompt_function=prompt.ethics_commonsense,
- hf_repo="lighteval/hendrycks_ethics",
- hf_subset="commonsense",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-ethics_deontology_lighteval = LightevalTaskConfig(
- name="ethics:deontology",
- suite=["lighteval", "ethics"],
- prompt_function=prompt.ethics_deontology,
- hf_repo="lighteval/hendrycks_ethics",
- hf_subset="deontology",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-ethics_justice_lighteval = LightevalTaskConfig(
- name="ethics:justice",
- suite=["lighteval", "ethics"],
- prompt_function=prompt.ethics_justice,
- hf_repo="lighteval/hendrycks_ethics",
- hf_subset="justice",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-ethics_utilitarianism_lighteval = LightevalTaskConfig(
- name="ethics:utilitarianism",
- suite=["lighteval", "ethics"],
- prompt_function=prompt.ethics_utilitarianism,
- hf_repo="lighteval/hendrycks_ethics",
- hf_subset="utilitarianism",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-ethics_virtue_lighteval = LightevalTaskConfig(
- name="ethics:virtue",
- suite=["lighteval", "ethics"],
- prompt_function=prompt.ethics_virtue,
- hf_repo="lighteval/hendrycks_ethics",
- hf_subset="virtue",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-evaluating_information_essentiality_bigbench = LightevalTaskConfig(
- name="evaluating_information_essentiality",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="evaluating_information_essentiality",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-fact_checker_bigbench = LightevalTaskConfig(
- name="fact_checker",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="fact_checker",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-fantasy_reasoning_bigbench = LightevalTaskConfig(
- name="fantasy_reasoning",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="fantasy_reasoning",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-few_shot_nlg_bigbench = LightevalTaskConfig(
- name="few_shot_nlg",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="few_shot_nlg",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.bleu, Metrics.bleurt],
- stop_sequence=["\n"],
- version=0,
-)
-figure_of_speech_detection_bigbench = LightevalTaskConfig(
- name="figure_of_speech_detection",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="figure_of_speech_detection",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-formal_fallacies_syllogisms_negation_bigbench_lite = LightevalTaskConfig(
- name="formal_fallacies_syllogisms_negation",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="formal_fallacies_syllogisms_negation",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-gem_bigbench = LightevalTaskConfig(
- name="gem",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="gem",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.bleu, Metrics.rouge_t5],
- stop_sequence=["\n"],
- version=0,
-)
-gender_inclusive_sentences_german_bigbench = LightevalTaskConfig(
- name="gender_inclusive_sentences_german",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="gender_inclusive_sentences_german",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-general_knowledge_bigbench = LightevalTaskConfig(
- name="general_knowledge",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="general_knowledge",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-geometric_shapes_bigbench = LightevalTaskConfig(
- name="geometric_shapes",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="geometric_shapes",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.rouge_t5,
- Metrics.bleu,
- Metrics.loglikelihood_acc,
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-glue_cola_lighteval = LightevalTaskConfig(
- name="glue:cola",
- suite=["lighteval", "glue"],
- prompt_function=prompt.cola,
- hf_repo="glue",
- hf_subset="cola",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc, Metrics.mcc],
- stop_sequence=["\n"],
- version=0,
-)
-glue_mnli_lighteval = LightevalTaskConfig(
- name="glue:mnli",
- suite=["lighteval", "glue"],
- prompt_function=prompt.mnli,
- hf_repo="glue",
- hf_subset="mnli_matched",
- hf_avail_splits=["train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-glue_mnli_mismatched_lighteval = LightevalTaskConfig(
- name="glue:mnli_mismatched",
- suite=["lighteval", "glue"],
- prompt_function=prompt.mnli,
- hf_repo="glue",
- hf_subset="mnli_mismatched",
- hf_avail_splits=["train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-glue_mrpc_lighteval = LightevalTaskConfig(
- name="glue:mrpc",
- suite=["lighteval", "glue"],
- prompt_function=prompt.mrpc,
- hf_repo="glue",
- hf_subset="mrpc",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_f1],
- stop_sequence=["\n"],
- version=0,
-)
-glue_qnli_lighteval = LightevalTaskConfig(
- name="glue:qnli",
- suite=["lighteval", "glue"],
- prompt_function=prompt.qnli,
- hf_repo="glue",
- hf_subset="qnli",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-glue_qqp_lighteval = LightevalTaskConfig(
- name="glue:qqp",
- suite=["lighteval", "glue"],
- prompt_function=prompt.qqp,
- hf_repo="glue",
- hf_subset="qqp",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_f1],
- stop_sequence=["\n"],
- version=0,
-)
-glue_rte_lighteval = LightevalTaskConfig(
- name="glue:rte",
- suite=["lighteval", "glue"],
- prompt_function=prompt.rte,
- hf_repo="glue",
- hf_subset="rte",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-glue_sst2_lighteval = LightevalTaskConfig(
- name="glue:sst2",
- suite=["lighteval", "glue"],
- prompt_function=prompt.sst,
- hf_repo="glue",
- hf_subset="sst2",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-glue_stsb_lighteval = LightevalTaskConfig(
- name="glue:stsb",
- suite=["lighteval", "glue"],
- prompt_function=prompt.stsb,
- hf_repo="glue",
- hf_subset="stsb",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-glue_wnli_lighteval = LightevalTaskConfig(
- name="glue:wnli",
- suite=["lighteval", "glue"],
- prompt_function=prompt.wnli,
- hf_repo="glue",
- hf_subset="wnli",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-goal_step_wikihow_bigbench = LightevalTaskConfig(
- name="goal_step_wikihow",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="goal_step_wikihow",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-gpqa_lighteval = LightevalTaskConfig(
- name="gpqa:mc",
- suite=["lighteval"],
- prompt_function=prompt.gpqa,
- hf_repo="Idavidrein/gpqa",
- hf_subset="gpqa_main",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-gpqa_diamond_instruct_lighteval = LightevalTaskConfig(
- name="gpqa:diamond",
- suite=["lighteval"],
- prompt_function=prompt.gpqa_instruct,
- hf_repo="Idavidrein/gpqa",
- hf_subset="gpqa_diamond",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=32768, # needed for reasoning models like R1
- metrics=[Metrics.gpqa_instruct_pass_at_k(sample_params={"k": 1})],
- stop_sequence=[], # no stop sequence, will use eos token
- version=1,
-)
-gpqa_extended_instruct_lighteval = LightevalTaskConfig(
- name="gpqa:extended",
- suite=["lighteval"],
- prompt_function=prompt.gpqa_instruct,
- hf_repo="Idavidrein/gpqa",
- hf_subset="gpqa_extended",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=32768, # needed for reasoning models like R1
- metrics=[Metrics.gpqa_instruct_metric],
- stop_sequence=[], # no stop sequence, will use eos token
- version=0,
-)
-gpqa_main_instruct_lighteval = LightevalTaskConfig(
- name="gpqa:main",
- suite=["lighteval"],
- prompt_function=prompt.gpqa_instruct,
- hf_repo="Idavidrein/gpqa",
- hf_subset="gpqa_main",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=32768, # needed for reasoning models like R1
- metrics=[Metrics.gpqa_instruct_metric],
- stop_sequence=[], # no stop sequence, will use eos token
- version=0,
-)
-gre_reading_comprehension_bigbench = LightevalTaskConfig(
- name="gre_reading_comprehension",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="gre_reading_comprehension",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-gsm_plus = LightevalTaskConfig(
- name="gsm_plus",
- suite=["lighteval"],
- prompt_function=prompt.gsm_plus,
- hf_repo="qintongli/GSM-Plus",
- hf_subset="default",
- hf_avail_splits=["test", "testmini"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.expr_gold_metric],
- stop_sequence=None,
- version=0,
-)
-gsm8k_leaderboard = LightevalTaskConfig(
- name="gsm8k",
- suite=["leaderboard"],
- prompt_function=prompt.gsm8k,
- hf_repo="gsm8k",
- hf_subset="main",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling_from_train",
- generation_size=256,
- metrics=[
- Metrics.exact_match(sample_params={"normalize_gold": gsm8k_normalizer, "normalize_pred": gsm8k_normalizer})
- ],
- stop_sequence=[],
- version=0,
-)
-gsm8k_lighteval = LightevalTaskConfig(
- name="gsm8k",
- suite=["lighteval"],
- prompt_function=prompt.gsm8k,
- hf_repo="openai/gsm8k",
- hf_subset="main",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select="random_sampling_from_train",
- generation_size=256,
- metrics=[
- Metrics.expr_gold_metric,
- ],
- stop_sequence=["Question:"],
- version=0,
-)
-headqa_en_lighteval = LightevalTaskConfig(
- name="headqa:en",
- suite=["lighteval", "headqa"],
- prompt_function=prompt.headqa,
- hf_repo="lighteval/headqa_harness",
- hf_subset="en",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-headqa_es_lighteval = LightevalTaskConfig(
- name="headqa:es",
- suite=["lighteval", "headqa"],
- prompt_function=prompt.headqa,
- hf_repo="lighteval/headqa_harness",
- hf_subset="es",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-hellaswag_leaderboard = LightevalTaskConfig(
- name="hellaswag",
- suite=["leaderboard"],
- prompt_function=prompt.hellaswag_harness,
- hf_repo="hellaswag",
- hf_subset="default",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select="random_sampling_from_train",
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-hellaswag_generative = LightevalTaskConfig(
- name="hellaswag",
- suite=["helm", "helm_general"],
- prompt_function=prompt.hellaswag_generative,
- hf_repo="hellaswag",
- hf_subset="default",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-hhh_alignment_bigbench = LightevalTaskConfig(
- name="hhh_alignment",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="hhh_alignment",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-hindi_question_answering_bigbench = LightevalTaskConfig(
- name="hindi_question_answering",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="hindi_question_answering",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-hindu_knowledge_bigbench_lite = LightevalTaskConfig(
- name="hindu_knowledge",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="hindu_knowledge",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-hinglish_toxicity_bigbench = LightevalTaskConfig(
- name="hinglish_toxicity",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="hinglish_toxicity",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-human_organs_senses_bigbench = LightevalTaskConfig(
- name="human_organs_senses",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="human_organs_senses",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-hyperbaton_bigbench = LightevalTaskConfig(
- name="hyperbaton",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="hyperbaton",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-identify_math_theorems_bigbench = LightevalTaskConfig(
- name="identify_math_theorems",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="identify_math_theorems",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-identify_odd_metaphor_bigbench = LightevalTaskConfig(
- name="identify_odd_metaphor",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="identify_odd_metaphor",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-imdb_helm = LightevalTaskConfig(
- name="imdb",
- suite=["helm", "helm_general"],
- prompt_function=prompt.imdb,
- hf_repo="lighteval/IMDB_helm",
- hf_subset="default",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-imdb_contrastset_helm = LightevalTaskConfig(
- name="imdb:contrastset",
- suite=["helm"],
- prompt_function=prompt.imdb_contrastset,
- hf_repo="lighteval/IMDB_helm",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-implicatures_bigbench = LightevalTaskConfig(
- name="implicatures",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="implicatures",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-implicit_relations_bigbench = LightevalTaskConfig(
- name="implicit_relations",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="implicit_relations",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-intent_recognition_bigbench = LightevalTaskConfig(
- name="intent_recognition",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="intent_recognition",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-interactive_qa_mmlu_abstract_algebra_helm = LightevalTaskConfig(
- name="interactive_qa_mmlu:abstract_algebra",
- suite=["helm", "interactive_qa_mmlu_scenario"],
- prompt_function=prompt.mmlu_qa_abstract_algebra,
- hf_repo="lighteval/mmlu",
- hf_subset="abstract_algebra",
- hf_avail_splits=["dev", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-interactive_qa_mmlu_college_chemistry_helm = LightevalTaskConfig(
- name="interactive_qa_mmlu:college_chemistry",
- suite=["helm", "interactive_qa_mmlu_scenario"],
- prompt_function=prompt.mmlu_qa_college_chemistry,
- hf_repo="lighteval/mmlu",
- hf_subset="college_chemistry",
- hf_avail_splits=["dev", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-interactive_qa_mmlu_global_facts_helm = LightevalTaskConfig(
- name="interactive_qa_mmlu:global_facts",
- suite=["helm", "interactive_qa_mmlu_scenario"],
- prompt_function=prompt.mmlu_qa_global_facts,
- hf_repo="lighteval/mmlu",
- hf_subset="global_facts",
- hf_avail_splits=["dev", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-interactive_qa_mmlu_miscellaneous_helm = LightevalTaskConfig(
- name="interactive_qa_mmlu:miscellaneous",
- suite=["helm", "interactive_qa_mmlu_scenario"],
- prompt_function=prompt.mmlu_qa_miscellaneous,
- hf_repo="lighteval/mmlu",
- hf_subset="miscellaneous",
- hf_avail_splits=["dev", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-interactive_qa_mmlu_nutrition_helm = LightevalTaskConfig(
- name="interactive_qa_mmlu:nutrition",
- suite=["helm", "interactive_qa_mmlu_scenario"],
- prompt_function=prompt.mmlu_qa_nutrition,
- hf_repo="lighteval/mmlu",
- hf_subset="nutrition",
- hf_avail_splits=["dev", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-interactive_qa_mmlu_us_foreign_policy_helm = LightevalTaskConfig(
- name="interactive_qa_mmlu:us_foreign_policy",
- suite=["helm", "interactive_qa_mmlu_scenario"],
- prompt_function=prompt.mmlu_qa_us_foreign_policy,
- hf_repo="lighteval/mmlu",
- hf_subset="us_foreign_policy",
- hf_avail_splits=["dev", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-international_phonetic_alphabet_nli_bigbench = LightevalTaskConfig(
- name="international_phonetic_alphabet_nli",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="international_phonetic_alphabet_nli",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-international_phonetic_alphabet_transliterate_bigbench = LightevalTaskConfig(
- name="international_phonetic_alphabet_transliterate",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="international_phonetic_alphabet_transliterate",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-intersect_geometry_bigbench = LightevalTaskConfig(
- name="intersect_geometry",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="intersect_geometry",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-irony_identification_bigbench = LightevalTaskConfig(
- name="irony_identification",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="irony_identification",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-iwslt17_ar_en_lighteval = LightevalTaskConfig(
- name="iwslt17:ar-en",
- suite=["lighteval", "harness_selection"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="iwslt17_ar-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-iwslt17_de_en_lighteval = LightevalTaskConfig(
- name="iwslt17:de-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="iwslt17_de-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-iwslt17_en_ar_lighteval = LightevalTaskConfig(
- name="iwslt17:en-ar",
- suite=["lighteval", "harness_selection"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="iwslt17_ar-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-iwslt17_en_de_lighteval = LightevalTaskConfig(
- name="iwslt17:en-de",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="iwslt17_en-de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-iwslt17_en_fr_lighteval = LightevalTaskConfig(
- name="iwslt17:en-fr",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="iwslt17_en-fr",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-iwslt17_en_ja_lighteval = LightevalTaskConfig(
- name="iwslt17:en-ja",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="iwslt17_en-ja",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-iwslt17_en_ko_lighteval = LightevalTaskConfig(
- name="iwslt17:en-ko",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="iwslt17_en-ko",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-iwslt17_en_zh_lighteval = LightevalTaskConfig(
- name="iwslt17:en-zh",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="iwslt17_en-zh",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-iwslt17_fr_en_lighteval = LightevalTaskConfig(
- name="iwslt17:fr-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="iwslt17_fr-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-iwslt17_ja_en_lighteval = LightevalTaskConfig(
- name="iwslt17:ja-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="iwslt17_ja-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-iwslt17_ko_en_lighteval = LightevalTaskConfig(
- name="iwslt17:ko-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="iwslt17_ko-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-iwslt17_zh_en_lighteval = LightevalTaskConfig(
- name="iwslt17:zh-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="iwslt17_zh-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-jeopardy = LightevalTaskConfig(
- name="jeopardy",
- prompt_function=get_qa_prompt_function(
- Language.ENGLISH,
- lambda line: {
- "question": line["question"],
- "choices": [line["answer"]],
- },
- ),
- suite=("lighteval",),
- hf_repo="openaccess-ai-collective/jeopardy",
- hf_subset="default",
- evaluation_splits=("train",),
- few_shots_split="train",
- generation_size=250,
- stop_sequence=["\n", "Question:", "question:"],
- metrics=(
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ),
-)
-kanji_ascii_bigbench = LightevalTaskConfig(
- name="kanji_ascii",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="kanji_ascii",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-kannada_bigbench = LightevalTaskConfig(
- name="kannada",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="kannada",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-key_value_maps_bigbench = LightevalTaskConfig(
- name="key_value_maps",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="key_value_maps",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-known_unknowns_bigbench_lite = LightevalTaskConfig(
- name="known_unknowns",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="known_unknowns",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-lambada_standard_lighteval = LightevalTaskConfig(
- name="lambada:standard",
- suite=["lighteval", "lambada"],
- prompt_function=prompt.lambada,
- hf_repo="lambada",
- hf_subset="plain_text",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=10,
- metrics=[Metrics.target_perplexity],
- stop_sequence=["\n"],
- version=0,
-)
-lambada_standard_cloze_lighteval = LightevalTaskConfig(
- name="lambada:standard_cloze",
- suite=["lighteval", "lambada"],
- prompt_function=prompt.lambada_cloze,
- hf_repo="lambada",
- hf_subset="plain_text",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=10,
- metrics=[Metrics.target_perplexity],
- stop_sequence=["\n"],
- version=0,
-)
-lambada_openai_lighteval = LightevalTaskConfig(
- name="lambada:openai",
- suite=["lighteval", "lambada"],
- prompt_function=prompt.lambada,
- hf_repo="EleutherAI/lambada_openai",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=10,
- metrics=[Metrics.target_perplexity],
- stop_sequence=["\n"],
- version=0,
-)
-lambada_openai_de_lighteval = LightevalTaskConfig(
- name="lambada:openai:de",
- suite=["lighteval", "lambada"],
- prompt_function=prompt.lambada,
- hf_repo="EleutherAI/lambada_openai",
- hf_subset="de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=10,
- metrics=[Metrics.target_perplexity],
- stop_sequence=["\n"],
- version=0,
-)
-lambada_openai_en_lighteval = LightevalTaskConfig(
- name="lambada:openai:en",
- suite=["lighteval", "lambada"],
- prompt_function=prompt.lambada,
- hf_repo="EleutherAI/lambada_openai",
- hf_subset="en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=10,
- metrics=[Metrics.target_perplexity],
- stop_sequence=["\n"],
- version=0,
-)
-lambada_openai_es_lighteval = LightevalTaskConfig(
- name="lambada:openai:es",
- suite=["lighteval", "lambada"],
- prompt_function=prompt.lambada,
- hf_repo="EleutherAI/lambada_openai",
- hf_subset="es",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=10,
- metrics=[Metrics.target_perplexity],
- stop_sequence=["\n"],
- version=0,
-)
-lambada_openai_fr_lighteval = LightevalTaskConfig(
- name="lambada:openai:fr",
- suite=["lighteval", "lambada"],
- prompt_function=prompt.lambada,
- hf_repo="EleutherAI/lambada_openai",
- hf_subset="fr",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=10,
- metrics=[Metrics.target_perplexity],
- stop_sequence=["\n"],
- version=0,
-)
-lambada_openai_it_lighteval = LightevalTaskConfig(
- name="lambada:openai:it",
- suite=["lighteval", "lambada"],
- prompt_function=prompt.lambada,
- hf_repo="EleutherAI/lambada_openai",
- hf_subset="it",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=10,
- metrics=[Metrics.target_perplexity],
- stop_sequence=["\n"],
- version=0,
-)
-lambada_openai_cloze_lighteval = LightevalTaskConfig(
- name="lambada:openai_cloze",
- suite=["lighteval", "lambada"],
- prompt_function=prompt.lambada_cloze,
- hf_repo="EleutherAI/lambada_openai",
- hf_subset="en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=10,
- metrics=[Metrics.target_perplexity],
- stop_sequence=["\n"],
- version=0,
-)
-language_games_bigbench = LightevalTaskConfig(
- name="language_games",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="language_games",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-language_identification_bigbench_lite = LightevalTaskConfig(
- name="language_identification",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="language_identification",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-legal_summarization_billsum_helm = LightevalTaskConfig(
- name="legal_summarization:billsum",
- suite=["helm"],
- prompt_function=prompt.legal_summarization,
- hf_repo="lighteval/legal_summarization",
- hf_subset="BillSum",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1024,
- metrics=[
- Metrics.rouge1,
- Metrics.rouge2,
- Metrics.rougeL,
- Metrics.faithfulness,
- Metrics.extractiveness,
- Metrics.bert_score,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-legal_summarization_eurlexsum_helm = LightevalTaskConfig(
- name="legal_summarization:eurlexsum",
- suite=["helm"],
- prompt_function=prompt.legal_summarization,
- hf_repo="lighteval/legal_summarization",
- hf_subset="EurLexSum",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=2048,
- metrics=[
- Metrics.rouge1,
- Metrics.rouge2,
- Metrics.rougeL,
- Metrics.faithfulness,
- Metrics.extractiveness,
- Metrics.bert_score,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-legal_summarization_multilexsum_helm = LightevalTaskConfig(
- name="legal_summarization:multilexsum",
- suite=["helm"],
- prompt_function=prompt.multilexsum,
- hf_repo="lighteval/legal_summarization",
- hf_subset="MultiLexSum",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=256,
- metrics=[
- Metrics.rouge1,
- Metrics.rouge2,
- Metrics.rougeL,
- Metrics.faithfulness,
- Metrics.extractiveness,
- Metrics.bert_score,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-legalsupport_helm = LightevalTaskConfig(
- name="legalsupport",
- suite=["helm"],
- prompt_function=prompt.legal_support,
- hf_repo="lighteval/LegalSupport",
- hf_subset="default",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lexglue_case_hold_helm = LightevalTaskConfig(
- name="lexglue:case_hold",
- suite=["helm", "lex_glue_scenario"],
- prompt_function=prompt.lex_glue_case_hold,
- hf_repo="lighteval/lexglue",
- hf_subset="case_hold",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lexglue_ecthr_a_helm = LightevalTaskConfig(
- name="lexglue:ecthr_a",
- suite=["helm", "lex_glue_scenario"],
- prompt_function=prompt.lex_glue_ecthr_a,
- hf_repo="lighteval/lexglue",
- hf_subset="ecthr_a",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lexglue_ecthr_b_helm = LightevalTaskConfig(
- name="lexglue:ecthr_b",
- suite=["helm", "lex_glue_scenario"],
- prompt_function=prompt.lex_glue_ecthr_b,
- hf_repo="lighteval/lexglue",
- hf_subset="ecthr_b",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lexglue_eurlex_helm = LightevalTaskConfig(
- name="lexglue:eurlex",
- suite=["helm", "lex_glue_scenario"],
- prompt_function=prompt.lex_glue_eurlex,
- hf_repo="lighteval/lexglue",
- hf_subset="eurlex",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lexglue_ledgar_helm = LightevalTaskConfig(
- name="lexglue:ledgar",
- suite=["helm", "lex_glue_scenario"],
- prompt_function=prompt.lex_glue_ledgar,
- hf_repo="lighteval/lexglue",
- hf_subset="ledgar",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lexglue_scotus_helm = LightevalTaskConfig(
- name="lexglue:scotus",
- suite=["helm", "lex_glue_scenario"],
- prompt_function=prompt.lex_glue_scotus,
- hf_repo="lighteval/lexglue",
- hf_subset="scotus",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lexglue_unfair_tos_helm = LightevalTaskConfig(
- name="lexglue:unfair_tos",
- suite=["helm", "lex_glue_scenario"],
- prompt_function=prompt.lex_glue_unfair_tos,
- hf_repo="lighteval/lexglue",
- hf_subset="unfair_tos",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_brazilian_court_decisions_judgment_helm = LightevalTaskConfig(
- name="lextreme:brazilian_court_decisions_judgment",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_brazilian_court_decisions_judgment,
- hf_repo="lighteval/lextreme",
- hf_subset="brazilian_court_decisions_judgment",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_brazilian_court_decisions_unanimity_helm = LightevalTaskConfig(
- name="lextreme:brazilian_court_decisions_unanimity",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_brazilian_court_decisions_unanimity,
- hf_repo="lighteval/lextreme",
- hf_subset="brazilian_court_decisions_unanimity",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_covid19_emergency_event_helm = LightevalTaskConfig(
- name="lextreme:covid19_emergency_event",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_covid19_emergency_event,
- hf_repo="lighteval/lextreme",
- hf_subset="covid19_emergency_event",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=10,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_german_argument_mining_helm = LightevalTaskConfig(
- name="lextreme:german_argument_mining",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_german_argument_mining,
- hf_repo="lighteval/lextreme",
- hf_subset="german_argument_mining",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_greek_legal_code_chapter_helm = LightevalTaskConfig(
- name="lextreme:greek_legal_code_chapter",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_greek_legal_code_chapter,
- hf_repo="lighteval/lextreme",
- hf_subset="greek_legal_code_chapter",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_greek_legal_code_subject_helm = LightevalTaskConfig(
- name="lextreme:greek_legal_code_subject",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_greek_legal_code_subject,
- hf_repo="lighteval/lextreme",
- hf_subset="greek_legal_code_subject",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_greek_legal_code_volume_helm = LightevalTaskConfig(
- name="lextreme:greek_legal_code_volume",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_greek_legal_code_volume,
- hf_repo="lighteval/lextreme",
- hf_subset="greek_legal_code_volume",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_greek_legal_ner_helm = LightevalTaskConfig(
- name="lextreme:greek_legal_ner",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_greek_legal_ner,
- hf_repo="lighteval/lextreme",
- hf_subset="greek_legal_ner",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=430,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_legalnero_helm = LightevalTaskConfig(
- name="lextreme:legalnero",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_legalnero,
- hf_repo="lighteval/lextreme",
- hf_subset="legalnero",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=788,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_lener_br_helm = LightevalTaskConfig(
- name="lextreme:lener_br",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_lener_br,
- hf_repo="lighteval/lextreme",
- hf_subset="lener_br",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=338,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_mapa_coarse_helm = LightevalTaskConfig(
- name="lextreme:mapa_coarse",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_mapa_coarse,
- hf_repo="lighteval/lextreme",
- hf_subset="mapa_coarse",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=274,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_mapa_fine_helm = LightevalTaskConfig(
- name="lextreme:mapa_fine",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_mapa_fine,
- hf_repo="lighteval/lextreme",
- hf_subset="mapa_fine",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=274,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_multi_eurlex_level_1_helm = LightevalTaskConfig(
- name="lextreme:multi_eurlex_level_1",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_multi_eurlex_level_1,
- hf_repo="lighteval/lextreme",
- hf_subset="multi_eurlex_level_1",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=10,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_multi_eurlex_level_2_helm = LightevalTaskConfig(
- name="lextreme:multi_eurlex_level_2",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_multi_eurlex_level_2,
- hf_repo="lighteval/lextreme",
- hf_subset="multi_eurlex_level_2",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=10,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_multi_eurlex_level_3_helm = LightevalTaskConfig(
- name="lextreme:multi_eurlex_level_3",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_multi_eurlex_level_3,
- hf_repo="lighteval/lextreme",
- hf_subset="multi_eurlex_level_3",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=10,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_online_terms_of_service_clause_topics_helm = LightevalTaskConfig(
- name="lextreme:online_terms_of_service_clause_topics",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_online_terms_of_service_clause_topics,
- hf_repo="lighteval/lextreme",
- hf_subset="online_terms_of_service_clause_topics",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=10,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_online_terms_of_service_unfairness_levels_helm = LightevalTaskConfig(
- name="lextreme:online_terms_of_service_unfairness_levels",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_online_terms_of_service_unfairness_levels,
- hf_repo="lighteval/lextreme",
- hf_subset="online_terms_of_service_unfairness_levels",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=10,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lextreme_swiss_judgment_prediction_helm = LightevalTaskConfig(
- name="lextreme:swiss_judgment_prediction",
- suite=["helm", "lextreme_scenario"],
- prompt_function=prompt.lextreme_swiss_judgment_prediction,
- hf_repo="lighteval/lextreme",
- hf_subset="swiss_judgment_prediction",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-linguistic_mappings_bigbench = LightevalTaskConfig(
- name="linguistic_mappings",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="linguistic_mappings",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-linguistics_puzzles_bigbench_lite = LightevalTaskConfig(
- name="linguistics_puzzles",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="linguistics_puzzles",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=None,
- version=0,
-)
-logic_grid_puzzle_bigbench_lite = LightevalTaskConfig(
- name="logic_grid_puzzle",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="logic_grid_puzzle",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-logical_args_bigbench = LightevalTaskConfig(
- name="logical_args",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="logical_args",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-logical_deduction_bigbench_lite = LightevalTaskConfig(
- name="logical_deduction",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="logical_deduction",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-logical_fallacy_detection_bigbench = LightevalTaskConfig(
- name="logical_fallacy_detection",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="logical_fallacy_detection",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-logical_sequence_bigbench = LightevalTaskConfig(
- name="logical_sequence",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="logical_sequence",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-logiqa_lighteval = LightevalTaskConfig(
- name="logiqa",
- suite=["lighteval"],
- prompt_function=prompt.logiqa,
- hf_repo="lighteval/logiqa_harness",
- hf_subset="logiqa",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lsat_qa_helm = LightevalTaskConfig(
- name="lsat_qa",
- suite=["helm", "lsat_qa_scenario"],
- prompt_function=prompt.lsat_qa,
- hf_repo="lighteval/lsat_qa",
- hf_subset="all",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lsat_qa_assignment_helm = LightevalTaskConfig(
- name="lsat_qa:assignment",
- suite=["helm", "lsat_qa_scenario"],
- prompt_function=prompt.lsat_qa,
- hf_repo="lighteval/lsat_qa",
- hf_subset="assignment",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lsat_qa_grouping_helm = LightevalTaskConfig(
- name="lsat_qa:grouping",
- suite=["helm", "lsat_qa_scenario"],
- prompt_function=prompt.lsat_qa,
- hf_repo="lighteval/lsat_qa",
- hf_subset="grouping",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lsat_qa_miscellaneous_helm = LightevalTaskConfig(
- name="lsat_qa:miscellaneous",
- suite=["helm", "lsat_qa_scenario"],
- prompt_function=prompt.lsat_qa,
- hf_repo="lighteval/lsat_qa",
- hf_subset="miscellaneous",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-lsat_qa_ordering_helm = LightevalTaskConfig(
- name="lsat_qa:ordering",
- suite=["helm", "lsat_qa_scenario"],
- prompt_function=prompt.lsat_qa,
- hf_repo="lighteval/lsat_qa",
- hf_subset="ordering",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-math_500 = LightevalTaskConfig(
- name="math_500",
- suite=["lighteval"],
- prompt_function=prompt.math_500,
- hf_repo="HuggingFaceH4/MATH-500",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=32768,
- metrics=[
- Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1}),
- ],
- version=2,
-)
-math_500_gpassk = LightevalTaskConfig(
- name="math_500_gpassk",
- suite=["lighteval"],
- prompt_function=prompt.math_500,
- hf_repo="HuggingFaceH4/MATH-500",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8192,
- metrics=[Metrics.g_pass_at_k_latex(sample_params={"k": 16, "n": 48})],
- version=1,
-)
-math_algebra_lighteval = LightevalTaskConfig(
- name="math:algebra",
- suite=["lighteval", "math"],
- prompt_function=prompt.math,
- hf_repo="DigitalLearningGmbH/MATH-lighteval",
- hf_subset="algebra",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=2048,
- metrics=[
- Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
- Metrics.maj_at_n(
- sample_params={
- "n": 4,
- "strip_strings": True,
- "normalize_pred": math_normalizer,
- "normalize_gold": math_normalizer,
- }
- ),
- ],
- stop_sequence=["\n"],
- version=1,
-)
-math_counting_and_probability_lighteval = LightevalTaskConfig(
- name="math:counting_and_probability",
- suite=["lighteval", "math"],
- prompt_function=prompt.math,
- hf_repo="DigitalLearningGmbH/MATH-lighteval",
- hf_subset="counting_and_probability",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=2048,
- metrics=[
- Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
- Metrics.maj_at_n(
- sample_params={
- "n": 4,
- "strip_strings": True,
- "normalize_pred": math_normalizer,
- "normalize_gold": math_normalizer,
- }
- ),
- ],
- stop_sequence=["\n"],
- version=1,
-)
-math_geometry_lighteval = LightevalTaskConfig(
- name="math:geometry",
- suite=["lighteval", "math"],
- prompt_function=prompt.math,
- hf_repo="DigitalLearningGmbH/MATH-lighteval",
- hf_subset="geometry",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=2048,
- metrics=[
- Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
- Metrics.maj_at_n(
- sample_params={
- "n": 4,
- "strip_strings": True,
- "normalize_pred": math_normalizer,
- "normalize_gold": math_normalizer,
- }
- ),
- ],
- stop_sequence=["\n"],
- version=1,
-)
-math_intermediate_algebra_lighteval = LightevalTaskConfig(
- name="math:intermediate_algebra",
- suite=["lighteval", "math"],
- prompt_function=prompt.math,
- hf_repo="DigitalLearningGmbH/MATH-lighteval",
- hf_subset="intermediate_algebra",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=2048,
- metrics=[
- Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
- Metrics.maj_at_n(
- sample_params={
- "n": 4,
- "strip_strings": True,
- "normalize_pred": math_normalizer,
- "normalize_gold": math_normalizer,
- }
- ),
- ],
- stop_sequence=["\n"],
- version=1,
-)
-math_number_theory_lighteval = LightevalTaskConfig(
- name="math:number_theory",
- suite=["lighteval", "math"],
- prompt_function=prompt.math,
- hf_repo="DigitalLearningGmbH/MATH-lighteval",
- hf_subset="number_theory",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=2048,
- metrics=[
- Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
- Metrics.maj_at_n(
- sample_params={
- "n": 4,
- "strip_strings": True,
- "normalize_pred": math_normalizer,
- "normalize_gold": math_normalizer,
- }
- ),
- ],
- stop_sequence=["\n"],
- version=1,
-)
-math_prealgebra_lighteval = LightevalTaskConfig(
- name="math:prealgebra",
- suite=["lighteval", "math"],
- prompt_function=prompt.math,
- hf_repo="DigitalLearningGmbH/MATH-lighteval",
- hf_subset="prealgebra",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=2048,
- metrics=[
- Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
- Metrics.maj_at_n(
- sample_params={
- "n": 4,
- "strip_strings": True,
- "normalize_pred": math_normalizer,
- "normalize_gold": math_normalizer,
- }
- ),
- ],
- stop_sequence=["\n"],
- version=1,
-)
-math_precalculus_lighteval = LightevalTaskConfig(
- name="math:precalculus",
- suite=["lighteval", "math"],
- prompt_function=prompt.math,
- hf_repo="DigitalLearningGmbH/MATH-lighteval",
- hf_subset="precalculus",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=2048,
- metrics=[
- Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
- Metrics.maj_at_n(
- sample_params={
- "n": 4,
- "strip_strings": True,
- "normalize_pred": math_normalizer,
- "normalize_gold": math_normalizer,
- }
- ),
- ],
- stop_sequence=["\n"],
- version=1,
-)
-math_cot_algebra_lighteval = LightevalTaskConfig(
- name="math_cot:algebra",
- suite=["lighteval", "math"],
- prompt_function=prompt.math_cot,
- hf_repo="DigitalLearningGmbH/MATH-lighteval",
- hf_subset="algebra",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=2048,
- metrics=[
- Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
- Metrics.maj_at_n(
- sample_params={
- "n": 4,
- "strip_strings": True,
- "normalize_pred": math_normalizer,
- "normalize_gold": math_normalizer,
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-math_cot_counting_and_probability_lighteval = LightevalTaskConfig(
- name="math_cot:counting_and_probability",
- suite=["lighteval", "math"],
- prompt_function=prompt.math_cot,
- hf_repo="DigitalLearningGmbH/MATH-lighteval",
- hf_subset="counting_and_probability",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=2048,
- metrics=[
- Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
- Metrics.maj_at_n(
- sample_params={
- "n": 4,
- "strip_strings": True,
- "normalize_pred": math_normalizer,
- "normalize_gold": math_normalizer,
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-math_cot_geometry_lighteval = LightevalTaskConfig(
- name="math_cot:geometry",
- suite=["lighteval", "math"],
- prompt_function=prompt.math_cot,
- hf_repo="DigitalLearningGmbH/MATH-lighteval",
- hf_subset="geometry",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=2048,
- metrics=[
- Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
- Metrics.maj_at_n(
- sample_params={
- "n": 4,
- "strip_strings": True,
- "normalize_pred": math_normalizer,
- "normalize_gold": math_normalizer,
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-math_cot_intermediate_algebra_lighteval = LightevalTaskConfig(
- name="math_cot:intermediate_algebra",
- suite=["lighteval", "math"],
- prompt_function=prompt.math_cot,
- hf_repo="DigitalLearningGmbH/MATH-lighteval",
- hf_subset="intermediate_algebra",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=2048,
- metrics=[
- Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
- Metrics.maj_at_n(
- sample_params={
- "n": 4,
- "strip_strings": True,
- "normalize_pred": math_normalizer,
- "normalize_gold": math_normalizer,
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-math_cot_number_theory_lighteval = LightevalTaskConfig(
- name="math_cot:number_theory",
- suite=["lighteval", "math"],
- prompt_function=prompt.math_cot,
- hf_repo="DigitalLearningGmbH/MATH-lighteval",
- hf_subset="number_theory",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=2048,
- metrics=[
- Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
- Metrics.maj_at_n(
- sample_params={
- "n": 4,
- "strip_strings": True,
- "normalize_pred": math_normalizer,
- "normalize_gold": math_normalizer,
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-math_cot_prealgebra_lighteval = LightevalTaskConfig(
- name="math_cot:prealgebra",
- suite=["lighteval", "math"],
- prompt_function=prompt.math_cot,
- hf_repo="DigitalLearningGmbH/MATH-lighteval",
- hf_subset="prealgebra",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=2048,
- metrics=[
- Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
- Metrics.maj_at_n(
- sample_params={
- "n": 4,
- "strip_strings": True,
- "normalize_pred": math_normalizer,
- "normalize_gold": math_normalizer,
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-math_cot_precalculus_lighteval = LightevalTaskConfig(
- name="math_cot:precalculus",
- suite=["lighteval", "math"],
- prompt_function=prompt.math_cot,
- hf_repo="DigitalLearningGmbH/MATH-lighteval",
- hf_subset="precalculus",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=2048,
- metrics=[
- Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
- Metrics.maj_at_n(
- sample_params={
- "n": 4,
- "strip_strings": True,
- "normalize_pred": math_normalizer,
- "normalize_gold": math_normalizer,
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mathematical_induction_bigbench = LightevalTaskConfig(
- name="mathematical_induction",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="mathematical_induction",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mathqa_lighteval = LightevalTaskConfig(
- name="mathqa",
- suite=["lighteval"],
- prompt_function=prompt.mathqa,
- hf_repo="allenai/math_qa",
- hf_subset="default",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-matrixshapes_bigbench = LightevalTaskConfig(
- name="matrixshapes",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="matrixshapes",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-me_q_sum_helm = LightevalTaskConfig(
- name="me_q_sum",
- suite=["helm"],
- prompt_function=prompt.me_q_sum,
- hf_repo="lighteval/me_q_sum",
- hf_subset="default",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=128,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.rougeL,
- Metrics.bleu_1,
- Metrics.bleu_4,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-med_dialog_healthcaremagic_helm = LightevalTaskConfig(
- name="med_dialog:healthcaremagic",
- suite=["helm"],
- prompt_function=prompt.med_dialog,
- hf_repo="lighteval/med_dialog",
- hf_subset="healthcaremagic",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=128,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.rougeL,
- Metrics.bleu_1,
- Metrics.bleu_4,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-med_dialog_icliniq_helm = LightevalTaskConfig(
- name="med_dialog:icliniq",
- suite=["helm"],
- prompt_function=prompt.med_dialog,
- hf_repo="lighteval/med_dialog",
- hf_subset="icliniq",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=128,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.rougeL,
- Metrics.bleu_1,
- Metrics.bleu_4,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-med_mcqa_helm = LightevalTaskConfig(
- name="med_mcqa",
- suite=["helm"],
- prompt_function=prompt.med_mcqa,
- hf_repo="lighteval/med_mcqa",
- hf_subset="default",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-med_paragraph_simplification_helm = LightevalTaskConfig(
- name="med_paragraph_simplification",
- suite=["helm"],
- prompt_function=prompt.med_paragraph_simplification,
- hf_repo="lighteval/med_paragraph_simplification",
- hf_subset="default",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=512,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.rougeL,
- Metrics.bleu_1,
- Metrics.bleu_4,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-med_qa_helm = LightevalTaskConfig(
- name="med_qa",
- suite=["helm"],
- prompt_function=prompt.med_qa,
- hf_repo="bigbio/med_qa",
- hf_subset="med_qa_en_source",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-metaphor_boolean_bigbench = LightevalTaskConfig(
- name="metaphor_boolean",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="metaphor_boolean",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-metaphor_understanding_bigbench = LightevalTaskConfig(
- name="metaphor_understanding",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="metaphor_understanding",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mgsm_en_lighteval = LightevalTaskConfig(
- name="mgsm:en",
- suite=["lighteval"],
- prompt_function=prompt.mgsm_en,
- hf_repo="juletxara/mgsm",
- hf_subset="en",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n", "=", "Question="],
- version=0,
-)
-mgsm_es_lighteval = LightevalTaskConfig(
- name="mgsm:es",
- suite=["lighteval"],
- prompt_function=prompt.mgsm_es,
- hf_repo="juletxara/mgsm",
- hf_subset="es",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n", "=", "Pregunta="],
- version=0,
-)
-mgsm_fr_lighteval = LightevalTaskConfig(
- name="mgsm:fr",
- suite=["lighteval"],
- prompt_function=prompt.mgsm_fr,
- hf_repo="juletxara/mgsm",
- hf_subset="fr",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n", "=", "Question="],
- version=0,
-)
-mgsm_de_lighteval = LightevalTaskConfig(
- name="mgsm:de",
- suite=["lighteval"],
- prompt_function=prompt.mgsm_de,
- hf_repo="juletxara/mgsm",
- hf_subset="de",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n", "=", "Frage="],
- version=0,
-)
-mgsm_ru_lighteval = LightevalTaskConfig(
- name="mgsm:ru",
- suite=["lighteval"],
- prompt_function=prompt.mgsm_ru,
- hf_repo="juletxara/mgsm",
- hf_subset="ru",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n", "=", "\u0417\u0430\u0434\u0430\u0447\u0430="],
- version=0,
-)
-mgsm_zh_lighteval = LightevalTaskConfig(
- name="mgsm:zh",
- suite=["lighteval"],
- prompt_function=prompt.mgsm_zh,
- hf_repo="juletxara/mgsm",
- hf_subset="zh",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n", "=", "\u95ee\u9898="],
- version=0,
-)
-mgsm_ja_lighteval = LightevalTaskConfig(
- name="mgsm:ja",
- suite=["lighteval"],
- prompt_function=prompt.mgsm_ja,
- hf_repo="juletxara/mgsm",
- hf_subset="ja",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n", "=", "\u554f\u984c="],
- version=0,
-)
-mgsm_th_lighteval = LightevalTaskConfig(
- name="mgsm:th",
- suite=["lighteval"],
- prompt_function=prompt.mgsm_th,
- hf_repo="juletxara/mgsm",
- hf_subset="th",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n", "=", "\u0e42\u0e08\u0e17\u0e22\u0e4c="],
- version=0,
-)
-mgsm_sw_lighteval = LightevalTaskConfig(
- name="mgsm:sw",
- suite=["lighteval"],
- prompt_function=prompt.mgsm_sw,
- hf_repo="juletxara/mgsm",
- hf_subset="sw",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n", "=", "Swali="],
- version=0,
-)
-mgsm_bn_lighteval = LightevalTaskConfig(
- name="mgsm:bn",
- suite=["lighteval"],
- prompt_function=prompt.mgsm_bn,
- hf_repo="juletxara/mgsm",
- hf_subset="bn",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n", "=", "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8="],
- version=0,
-)
-mgsm_te_lighteval = LightevalTaskConfig(
- name="mgsm:te",
- suite=["lighteval"],
- prompt_function=prompt.mgsm_te,
- hf_repo="juletxara/mgsm",
- hf_subset="te",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n", "=", "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28="],
- version=0,
-)
-minute_mysteries_qa_bigbench = LightevalTaskConfig(
- name="minute_mysteries_qa",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="minute_mysteries_qa",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc, Metrics.rouge_t5],
- stop_sequence=["\n"],
- version=0,
-)
-misconceptions_bigbench = LightevalTaskConfig(
- name="misconceptions",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="misconceptions",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-misconceptions_russian_bigbench_lite = LightevalTaskConfig(
- name="misconceptions_russian",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="misconceptions_russian",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_abstract_algebra_original = LightevalTaskConfig(
- name="mmlu:abstract_algebra",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_abstract_algebra,
- hf_repo="cais/mmlu",
- hf_subset="abstract_algebra",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_abstract_algebra_leaderboard = LightevalTaskConfig(
- name="mmlu:abstract_algebra",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="abstract_algebra",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_abstract_algebra_helm = LightevalTaskConfig(
- name="mmlu:abstract_algebra",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="abstract_algebra",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_anatomy_original = LightevalTaskConfig(
- name="mmlu:anatomy",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_anatomy,
- hf_repo="cais/mmlu",
- hf_subset="anatomy",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_anatomy_leaderboard = LightevalTaskConfig(
- name="mmlu:anatomy",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="anatomy",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_anatomy_helm = LightevalTaskConfig(
- name="mmlu:anatomy",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="anatomy",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_astronomy_original = LightevalTaskConfig(
- name="mmlu:astronomy",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_astronomy,
- hf_repo="cais/mmlu",
- hf_subset="astronomy",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_astronomy_leaderboard = LightevalTaskConfig(
- name="mmlu:astronomy",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="astronomy",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_astronomy_helm = LightevalTaskConfig(
- name="mmlu:astronomy",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="astronomy",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_business_ethics_original = LightevalTaskConfig(
- name="mmlu:business_ethics",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_business_ethics,
- hf_repo="cais/mmlu",
- hf_subset="business_ethics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_business_ethics_leaderboard = LightevalTaskConfig(
- name="mmlu:business_ethics",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="business_ethics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_business_ethics_helm = LightevalTaskConfig(
- name="mmlu:business_ethics",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="business_ethics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_clinical_knowledge_original = LightevalTaskConfig(
- name="mmlu:clinical_knowledge",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_clinical_knowledge,
- hf_repo="cais/mmlu",
- hf_subset="clinical_knowledge",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_clinical_knowledge_leaderboard = LightevalTaskConfig(
- name="mmlu:clinical_knowledge",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="clinical_knowledge",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_clinical_knowledge_helm = LightevalTaskConfig(
- name="mmlu:clinical_knowledge",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="clinical_knowledge",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_biology_original = LightevalTaskConfig(
- name="mmlu:college_biology",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_college_biology,
- hf_repo="cais/mmlu",
- hf_subset="college_biology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_biology_leaderboard = LightevalTaskConfig(
- name="mmlu:college_biology",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="college_biology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_biology_helm = LightevalTaskConfig(
- name="mmlu:college_biology",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="college_biology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_chemistry_original = LightevalTaskConfig(
- name="mmlu:college_chemistry",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_college_chemistry,
- hf_repo="cais/mmlu",
- hf_subset="college_chemistry",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_chemistry_leaderboard = LightevalTaskConfig(
- name="mmlu:college_chemistry",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="college_chemistry",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_chemistry_helm = LightevalTaskConfig(
- name="mmlu:college_chemistry",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="college_chemistry",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_computer_science_original = LightevalTaskConfig(
- name="mmlu:college_computer_science",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_college_computer_science,
- hf_repo="cais/mmlu",
- hf_subset="college_computer_science",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_computer_science_leaderboard = LightevalTaskConfig(
- name="mmlu:college_computer_science",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="college_computer_science",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_computer_science_helm = LightevalTaskConfig(
- name="mmlu:college_computer_science",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="college_computer_science",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_mathematics_original = LightevalTaskConfig(
- name="mmlu:college_mathematics",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_college_mathematics,
- hf_repo="cais/mmlu",
- hf_subset="college_mathematics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_mathematics_leaderboard = LightevalTaskConfig(
- name="mmlu:college_mathematics",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="college_mathematics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_mathematics_helm = LightevalTaskConfig(
- name="mmlu:college_mathematics",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="college_mathematics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_medicine_original = LightevalTaskConfig(
- name="mmlu:college_medicine",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_college_medicine,
- hf_repo="cais/mmlu",
- hf_subset="college_medicine",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_medicine_leaderboard = LightevalTaskConfig(
- name="mmlu:college_medicine",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="college_medicine",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_medicine_helm = LightevalTaskConfig(
- name="mmlu:college_medicine",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="college_medicine",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_physics_original = LightevalTaskConfig(
- name="mmlu:college_physics",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_college_physics,
- hf_repo="cais/mmlu",
- hf_subset="college_physics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_physics_leaderboard = LightevalTaskConfig(
- name="mmlu:college_physics",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="college_physics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_college_physics_helm = LightevalTaskConfig(
- name="mmlu:college_physics",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="college_physics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_computer_security_original = LightevalTaskConfig(
- name="mmlu:computer_security",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_computer_security,
- hf_repo="cais/mmlu",
- hf_subset="computer_security",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_computer_security_leaderboard = LightevalTaskConfig(
- name="mmlu:computer_security",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="computer_security",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_computer_security_helm = LightevalTaskConfig(
- name="mmlu:computer_security",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="computer_security",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_conceptual_physics_original = LightevalTaskConfig(
- name="mmlu:conceptual_physics",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_conceptual_physics,
- hf_repo="cais/mmlu",
- hf_subset="conceptual_physics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_conceptual_physics_leaderboard = LightevalTaskConfig(
- name="mmlu:conceptual_physics",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="conceptual_physics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_conceptual_physics_helm = LightevalTaskConfig(
- name="mmlu:conceptual_physics",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="conceptual_physics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_econometrics_original = LightevalTaskConfig(
- name="mmlu:econometrics",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_econometrics,
- hf_repo="cais/mmlu",
- hf_subset="econometrics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_econometrics_leaderboard = LightevalTaskConfig(
- name="mmlu:econometrics",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="econometrics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_econometrics_helm = LightevalTaskConfig(
- name="mmlu:econometrics",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="econometrics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_electrical_engineering_original = LightevalTaskConfig(
- name="mmlu:electrical_engineering",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_electrical_engineering,
- hf_repo="cais/mmlu",
- hf_subset="electrical_engineering",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_electrical_engineering_leaderboard = LightevalTaskConfig(
- name="mmlu:electrical_engineering",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="electrical_engineering",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_electrical_engineering_helm = LightevalTaskConfig(
- name="mmlu:electrical_engineering",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="electrical_engineering",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_elementary_mathematics_original = LightevalTaskConfig(
- name="mmlu:elementary_mathematics",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_elementary_mathematics,
- hf_repo="cais/mmlu",
- hf_subset="elementary_mathematics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_elementary_mathematics_leaderboard = LightevalTaskConfig(
- name="mmlu:elementary_mathematics",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="elementary_mathematics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_elementary_mathematics_helm = LightevalTaskConfig(
- name="mmlu:elementary_mathematics",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="elementary_mathematics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_formal_logic_original = LightevalTaskConfig(
- name="mmlu:formal_logic",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_formal_logic,
- hf_repo="cais/mmlu",
- hf_subset="formal_logic",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_formal_logic_leaderboard = LightevalTaskConfig(
- name="mmlu:formal_logic",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="formal_logic",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_formal_logic_helm = LightevalTaskConfig(
- name="mmlu:formal_logic",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="formal_logic",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_global_facts_original = LightevalTaskConfig(
- name="mmlu:global_facts",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_global_facts,
- hf_repo="cais/mmlu",
- hf_subset="global_facts",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_global_facts_leaderboard = LightevalTaskConfig(
- name="mmlu:global_facts",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="global_facts",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_global_facts_helm = LightevalTaskConfig(
- name="mmlu:global_facts",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="global_facts",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_biology_original = LightevalTaskConfig(
- name="mmlu:high_school_biology",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_high_school_biology,
- hf_repo="cais/mmlu",
- hf_subset="high_school_biology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_biology_leaderboard = LightevalTaskConfig(
- name="mmlu:high_school_biology",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_biology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_biology_helm = LightevalTaskConfig(
- name="mmlu:high_school_biology",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_biology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_chemistry_original = LightevalTaskConfig(
- name="mmlu:high_school_chemistry",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_high_school_chemistry,
- hf_repo="cais/mmlu",
- hf_subset="high_school_chemistry",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_chemistry_leaderboard = LightevalTaskConfig(
- name="mmlu:high_school_chemistry",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_chemistry",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_chemistry_helm = LightevalTaskConfig(
- name="mmlu:high_school_chemistry",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_chemistry",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_computer_science_original = LightevalTaskConfig(
- name="mmlu:high_school_computer_science",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_high_school_computer_science,
- hf_repo="cais/mmlu",
- hf_subset="high_school_computer_science",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_computer_science_leaderboard = LightevalTaskConfig(
- name="mmlu:high_school_computer_science",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_computer_science",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_computer_science_helm = LightevalTaskConfig(
- name="mmlu:high_school_computer_science",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_computer_science",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_european_history_original = LightevalTaskConfig(
- name="mmlu:high_school_european_history",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_high_school_european_history,
- hf_repo="cais/mmlu",
- hf_subset="high_school_european_history",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_european_history_leaderboard = LightevalTaskConfig(
- name="mmlu:high_school_european_history",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_european_history",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_european_history_helm = LightevalTaskConfig(
- name="mmlu:high_school_european_history",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_european_history",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_geography_original = LightevalTaskConfig(
- name="mmlu:high_school_geography",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_high_school_geography,
- hf_repo="cais/mmlu",
- hf_subset="high_school_geography",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_geography_leaderboard = LightevalTaskConfig(
- name="mmlu:high_school_geography",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_geography",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_geography_helm = LightevalTaskConfig(
- name="mmlu:high_school_geography",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_geography",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_government_and_politics_original = LightevalTaskConfig(
- name="mmlu:high_school_government_and_politics",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_high_school_government_and_politics,
- hf_repo="cais/mmlu",
- hf_subset="high_school_government_and_politics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_government_and_politics_leaderboard = LightevalTaskConfig(
- name="mmlu:high_school_government_and_politics",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_government_and_politics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_government_and_politics_helm = LightevalTaskConfig(
- name="mmlu:high_school_government_and_politics",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_government_and_politics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_macroeconomics_original = LightevalTaskConfig(
- name="mmlu:high_school_macroeconomics",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_high_school_macroeconomics,
- hf_repo="cais/mmlu",
- hf_subset="high_school_macroeconomics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_macroeconomics_leaderboard = LightevalTaskConfig(
- name="mmlu:high_school_macroeconomics",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_macroeconomics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_macroeconomics_helm = LightevalTaskConfig(
- name="mmlu:high_school_macroeconomics",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_macroeconomics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_mathematics_original = LightevalTaskConfig(
- name="mmlu:high_school_mathematics",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_high_school_mathematics,
- hf_repo="cais/mmlu",
- hf_subset="high_school_mathematics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_mathematics_leaderboard = LightevalTaskConfig(
- name="mmlu:high_school_mathematics",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_mathematics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_mathematics_helm = LightevalTaskConfig(
- name="mmlu:high_school_mathematics",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_mathematics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_microeconomics_original = LightevalTaskConfig(
- name="mmlu:high_school_microeconomics",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_high_school_microeconomics,
- hf_repo="cais/mmlu",
- hf_subset="high_school_microeconomics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_microeconomics_leaderboard = LightevalTaskConfig(
- name="mmlu:high_school_microeconomics",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_microeconomics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_microeconomics_helm = LightevalTaskConfig(
- name="mmlu:high_school_microeconomics",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_microeconomics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_physics_original = LightevalTaskConfig(
- name="mmlu:high_school_physics",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_high_school_physics,
- hf_repo="cais/mmlu",
- hf_subset="high_school_physics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_physics_leaderboard = LightevalTaskConfig(
- name="mmlu:high_school_physics",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_physics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_physics_helm = LightevalTaskConfig(
- name="mmlu:high_school_physics",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_physics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_psychology_original = LightevalTaskConfig(
- name="mmlu:high_school_psychology",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_high_school_psychology,
- hf_repo="cais/mmlu",
- hf_subset="high_school_psychology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_psychology_leaderboard = LightevalTaskConfig(
- name="mmlu:high_school_psychology",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_psychology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_psychology_helm = LightevalTaskConfig(
- name="mmlu:high_school_psychology",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_psychology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_statistics_original = LightevalTaskConfig(
- name="mmlu:high_school_statistics",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_high_school_statistics,
- hf_repo="cais/mmlu",
- hf_subset="high_school_statistics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_statistics_leaderboard = LightevalTaskConfig(
- name="mmlu:high_school_statistics",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_statistics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_statistics_helm = LightevalTaskConfig(
- name="mmlu:high_school_statistics",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_statistics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_us_history_original = LightevalTaskConfig(
- name="mmlu:high_school_us_history",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_high_school_us_history,
- hf_repo="cais/mmlu",
- hf_subset="high_school_us_history",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_us_history_leaderboard = LightevalTaskConfig(
- name="mmlu:high_school_us_history",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_us_history",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_us_history_helm = LightevalTaskConfig(
- name="mmlu:high_school_us_history",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_us_history",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_world_history_original = LightevalTaskConfig(
- name="mmlu:high_school_world_history",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_high_school_world_history,
- hf_repo="cais/mmlu",
- hf_subset="high_school_world_history",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_world_history_leaderboard = LightevalTaskConfig(
- name="mmlu:high_school_world_history",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_world_history",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_high_school_world_history_helm = LightevalTaskConfig(
- name="mmlu:high_school_world_history",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="high_school_world_history",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_human_aging_original = LightevalTaskConfig(
- name="mmlu:human_aging",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_human_aging,
- hf_repo="cais/mmlu",
- hf_subset="human_aging",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_human_aging_leaderboard = LightevalTaskConfig(
- name="mmlu:human_aging",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="human_aging",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_human_aging_helm = LightevalTaskConfig(
- name="mmlu:human_aging",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="human_aging",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_human_sexuality_original = LightevalTaskConfig(
- name="mmlu:human_sexuality",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_human_sexuality,
- hf_repo="cais/mmlu",
- hf_subset="human_sexuality",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_human_sexuality_leaderboard = LightevalTaskConfig(
- name="mmlu:human_sexuality",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="human_sexuality",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_human_sexuality_helm = LightevalTaskConfig(
- name="mmlu:human_sexuality",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="human_sexuality",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_international_law_original = LightevalTaskConfig(
- name="mmlu:international_law",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_international_law,
- hf_repo="cais/mmlu",
- hf_subset="international_law",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_international_law_leaderboard = LightevalTaskConfig(
- name="mmlu:international_law",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="international_law",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_international_law_helm = LightevalTaskConfig(
- name="mmlu:international_law",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="international_law",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_jurisprudence_original = LightevalTaskConfig(
- name="mmlu:jurisprudence",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_jurisprudence,
- hf_repo="cais/mmlu",
- hf_subset="jurisprudence",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_jurisprudence_leaderboard = LightevalTaskConfig(
- name="mmlu:jurisprudence",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="jurisprudence",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_jurisprudence_helm = LightevalTaskConfig(
- name="mmlu:jurisprudence",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="jurisprudence",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_logical_fallacies_original = LightevalTaskConfig(
- name="mmlu:logical_fallacies",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_logical_fallacies,
- hf_repo="cais/mmlu",
- hf_subset="logical_fallacies",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_logical_fallacies_leaderboard = LightevalTaskConfig(
- name="mmlu:logical_fallacies",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="logical_fallacies",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_logical_fallacies_helm = LightevalTaskConfig(
- name="mmlu:logical_fallacies",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="logical_fallacies",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_machine_learning_original = LightevalTaskConfig(
- name="mmlu:machine_learning",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_machine_learning,
- hf_repo="cais/mmlu",
- hf_subset="machine_learning",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_machine_learning_leaderboard = LightevalTaskConfig(
- name="mmlu:machine_learning",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="machine_learning",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_machine_learning_helm = LightevalTaskConfig(
- name="mmlu:machine_learning",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="machine_learning",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_management_original = LightevalTaskConfig(
- name="mmlu:management",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_management,
- hf_repo="cais/mmlu",
- hf_subset="management",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_management_leaderboard = LightevalTaskConfig(
- name="mmlu:management",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="management",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_management_helm = LightevalTaskConfig(
- name="mmlu:management",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="management",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_marketing_original = LightevalTaskConfig(
- name="mmlu:marketing",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_marketing,
- hf_repo="cais/mmlu",
- hf_subset="marketing",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_marketing_leaderboard = LightevalTaskConfig(
- name="mmlu:marketing",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="marketing",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_marketing_helm = LightevalTaskConfig(
- name="mmlu:marketing",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="marketing",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_medical_genetics_original = LightevalTaskConfig(
- name="mmlu:medical_genetics",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_medical_genetics,
- hf_repo="cais/mmlu",
- hf_subset="medical_genetics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_medical_genetics_leaderboard = LightevalTaskConfig(
- name="mmlu:medical_genetics",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="medical_genetics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_medical_genetics_helm = LightevalTaskConfig(
- name="mmlu:medical_genetics",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="medical_genetics",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_miscellaneous_original = LightevalTaskConfig(
- name="mmlu:miscellaneous",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_miscellaneous,
- hf_repo="cais/mmlu",
- hf_subset="miscellaneous",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_miscellaneous_leaderboard = LightevalTaskConfig(
- name="mmlu:miscellaneous",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="miscellaneous",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_miscellaneous_helm = LightevalTaskConfig(
- name="mmlu:miscellaneous",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="miscellaneous",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_moral_disputes_original = LightevalTaskConfig(
- name="mmlu:moral_disputes",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_moral_disputes,
- hf_repo="cais/mmlu",
- hf_subset="moral_disputes",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_moral_disputes_leaderboard = LightevalTaskConfig(
- name="mmlu:moral_disputes",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="moral_disputes",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_moral_disputes_helm = LightevalTaskConfig(
- name="mmlu:moral_disputes",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="moral_disputes",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_moral_scenarios_original = LightevalTaskConfig(
- name="mmlu:moral_scenarios",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_moral_scenarios,
- hf_repo="cais/mmlu",
- hf_subset="moral_scenarios",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_moral_scenarios_leaderboard = LightevalTaskConfig(
- name="mmlu:moral_scenarios",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="moral_scenarios",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_moral_scenarios_helm = LightevalTaskConfig(
- name="mmlu:moral_scenarios",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="moral_scenarios",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_nutrition_original = LightevalTaskConfig(
- name="mmlu:nutrition",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_nutrition,
- hf_repo="cais/mmlu",
- hf_subset="nutrition",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_nutrition_leaderboard = LightevalTaskConfig(
- name="mmlu:nutrition",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="nutrition",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_nutrition_helm = LightevalTaskConfig(
- name="mmlu:nutrition",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="nutrition",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_philosophy_original = LightevalTaskConfig(
- name="mmlu:philosophy",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_philosophy,
- hf_repo="cais/mmlu",
- hf_subset="philosophy",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_philosophy_leaderboard = LightevalTaskConfig(
- name="mmlu:philosophy",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="philosophy",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_philosophy_helm = LightevalTaskConfig(
- name="mmlu:philosophy",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="philosophy",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_prehistory_original = LightevalTaskConfig(
- name="mmlu:prehistory",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_prehistory,
- hf_repo="cais/mmlu",
- hf_subset="prehistory",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_prehistory_leaderboard = LightevalTaskConfig(
- name="mmlu:prehistory",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="prehistory",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_prehistory_helm = LightevalTaskConfig(
- name="mmlu:prehistory",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="prehistory",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_professional_accounting_original = LightevalTaskConfig(
- name="mmlu:professional_accounting",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_professional_accounting,
- hf_repo="cais/mmlu",
- hf_subset="professional_accounting",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_professional_accounting_leaderboard = LightevalTaskConfig(
- name="mmlu:professional_accounting",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="professional_accounting",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_professional_accounting_helm = LightevalTaskConfig(
- name="mmlu:professional_accounting",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="professional_accounting",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_professional_law_original = LightevalTaskConfig(
- name="mmlu:professional_law",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_professional_law,
- hf_repo="cais/mmlu",
- hf_subset="professional_law",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_professional_law_leaderboard = LightevalTaskConfig(
- name="mmlu:professional_law",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="professional_law",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_professional_law_helm = LightevalTaskConfig(
- name="mmlu:professional_law",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="professional_law",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_professional_medicine_original = LightevalTaskConfig(
- name="mmlu:professional_medicine",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_professional_medicine,
- hf_repo="cais/mmlu",
- hf_subset="professional_medicine",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_professional_medicine_leaderboard = LightevalTaskConfig(
- name="mmlu:professional_medicine",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="professional_medicine",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_professional_medicine_helm = LightevalTaskConfig(
- name="mmlu:professional_medicine",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="professional_medicine",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_professional_psychology_original = LightevalTaskConfig(
- name="mmlu:professional_psychology",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_professional_psychology,
- hf_repo="cais/mmlu",
- hf_subset="professional_psychology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_professional_psychology_leaderboard = LightevalTaskConfig(
- name="mmlu:professional_psychology",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="professional_psychology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_professional_psychology_helm = LightevalTaskConfig(
- name="mmlu:professional_psychology",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="professional_psychology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_public_relations_original = LightevalTaskConfig(
- name="mmlu:public_relations",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_public_relations,
- hf_repo="cais/mmlu",
- hf_subset="public_relations",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_public_relations_leaderboard = LightevalTaskConfig(
- name="mmlu:public_relations",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="public_relations",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_public_relations_helm = LightevalTaskConfig(
- name="mmlu:public_relations",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="public_relations",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_security_studies_original = LightevalTaskConfig(
- name="mmlu:security_studies",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_security_studies,
- hf_repo="cais/mmlu",
- hf_subset="security_studies",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_security_studies_leaderboard = LightevalTaskConfig(
- name="mmlu:security_studies",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="security_studies",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_security_studies_helm = LightevalTaskConfig(
- name="mmlu:security_studies",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="security_studies",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_sociology_original = LightevalTaskConfig(
- name="mmlu:sociology",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_sociology,
- hf_repo="cais/mmlu",
- hf_subset="sociology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_sociology_leaderboard = LightevalTaskConfig(
- name="mmlu:sociology",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="sociology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_sociology_helm = LightevalTaskConfig(
- name="mmlu:sociology",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="sociology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_us_foreign_policy_original = LightevalTaskConfig(
- name="mmlu:us_foreign_policy",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_us_foreign_policy,
- hf_repo="cais/mmlu",
- hf_subset="us_foreign_policy",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_us_foreign_policy_leaderboard = LightevalTaskConfig(
- name="mmlu:us_foreign_policy",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="us_foreign_policy",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_us_foreign_policy_helm = LightevalTaskConfig(
- name="mmlu:us_foreign_policy",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="us_foreign_policy",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_virology_original = LightevalTaskConfig(
- name="mmlu:virology",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_virology,
- hf_repo="cais/mmlu",
- hf_subset="virology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_virology_leaderboard = LightevalTaskConfig(
- name="mmlu:virology",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="virology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_virology_helm = LightevalTaskConfig(
- name="mmlu:virology",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="virology",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_world_religions_original = LightevalTaskConfig(
- name="mmlu:world_religions",
- suite=["original", "mmlu"],
- prompt_function=prompt.mmlu_world_religions,
- hf_repo="cais/mmlu",
- hf_subset="world_religions",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_world_religions_leaderboard = LightevalTaskConfig(
- name="mmlu:world_religions",
- suite=["leaderboard", "mmlu"],
- prompt_function=prompt.mmlu_harness,
- hf_repo="lighteval/mmlu",
- hf_subset="world_religions",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select="sequential",
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mmlu_world_religions_helm = LightevalTaskConfig(
- name="mmlu:world_religions",
- suite=["helm", "helm_general"],
- prompt_function=prompt.mmlu_helm,
- hf_repo="lighteval/mmlu",
- hf_subset="world_religions",
- hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
- evaluation_splits=["test"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-mnist_ascii_bigbench = LightevalTaskConfig(
- name="mnist_ascii",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="mnist_ascii",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-modified_arithmetic_bigbench = LightevalTaskConfig(
- name="modified_arithmetic",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="modified_arithmetic",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-moral_permissibility_bigbench = LightevalTaskConfig(
- name="moral_permissibility",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="moral_permissibility",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-movie_dialog_same_or_different_bigbench = LightevalTaskConfig(
- name="movie_dialog_same_or_different",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="movie_dialog_same_or_different",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-movie_recommendation_bigbench = LightevalTaskConfig(
- name="movie_recommendation",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="movie_recommendation",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mtnt2019_en_fr_lighteval = LightevalTaskConfig(
- name="mtnt2019:en-fr",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="mtnt2019_en-fr",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=200,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-mtnt2019_en_ja_lighteval = LightevalTaskConfig(
- name="mtnt2019:en-ja",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="mtnt2019_en-ja",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=200,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-mtnt2019_fr_en_lighteval = LightevalTaskConfig(
- name="mtnt2019:fr-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="mtnt2019_fr-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=200,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-mtnt2019_ja_en_lighteval = LightevalTaskConfig(
- name="mtnt2019:ja-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="mtnt2019_ja-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=200,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-mult_data_wrangling_bigbench = LightevalTaskConfig(
- name="mult_data_wrangling",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="mult_data_wrangling",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-multiemo_bigbench = LightevalTaskConfig(
- name="multiemo",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="multiemo",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-musr_murder_mysteries = LightevalTaskConfig(
- name="musr:murder_mysteries",
- suite=["lighteval"],
- prompt_function=prompt.musr,
- hf_repo="TAUR-Lab/MuSR",
- hf_subset="default",
- hf_avail_splits=["murder_mysteries"],
- evaluation_splits=["murder_mysteries"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-musr_object_placements = LightevalTaskConfig(
- name="musr:object_placements",
- suite=["lighteval"],
- prompt_function=prompt.musr,
- hf_repo="TAUR-Lab/MuSR",
- hf_subset="default",
- hf_avail_splits=["object_placements"],
- evaluation_splits=["object_placements"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-musr_team_allocation = LightevalTaskConfig(
- name="musr:team_allocation",
- suite=["lighteval"],
- prompt_function=prompt.musr,
- hf_repo="TAUR-Lab/MuSR",
- hf_subset="default",
- hf_avail_splits=["team_allocation"],
- evaluation_splits=["team_allocation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-mutual_lighteval = LightevalTaskConfig(
- name="mutual",
- suite=["lighteval"],
- prompt_function=prompt.mutual,
- hf_repo="lighteval/mutual_harness",
- hf_subset="mutual",
- hf_avail_splits=["train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.recall_at_k, Metrics.recall_at_k(sample_params={"k": 2}), Metrics.mrr],
- stop_sequence=["\n"],
- version=0,
-)
-mutual_plus_lighteval = LightevalTaskConfig(
- name="mutual_plus",
- suite=["lighteval"],
- prompt_function=prompt.mutual,
- hf_repo="lighteval/mutual_harness",
- hf_subset="mutual_plus",
- hf_avail_splits=["train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.recall_at_k, Metrics.recall_at_k(sample_params={"k": 2}), Metrics.mrr],
- stop_sequence=["\n"],
- version=0,
-)
-narrativeqa_helm = LightevalTaskConfig(
- name="narrativeqa",
- suite=["helm", "helm_general"],
- prompt_function=prompt.narrativeqa,
- hf_repo="lighteval/narrative_qa_helm",
- hf_subset="default",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- Metrics.rougeL,
- Metrics.bleu_1,
- Metrics.bleu_4,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-natural_instructions_bigbench = LightevalTaskConfig(
- name="natural_instructions",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="natural_instructions",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.bleu, Metrics.rouge_t5],
- stop_sequence=["\n"],
- version=0,
-)
-natural_questions = LightevalTaskConfig(
- name="natural_questions",
- prompt_function=get_qa_prompt_function(
- Language.ENGLISH,
- lambda line: {"question": line["question"], "choices": [line["answer"]]},
- ),
- suite=("lighteval",),
- hf_repo="lighteval/small_natural_questions",
- hf_subset="default",
- evaluation_splits=("test",),
- few_shots_split="few_shot",
- generation_size=250,
- stop_sequence=["\n", "Question:", "question:"],
- metrics=(
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ),
-)
-navigate_bigbench = LightevalTaskConfig(
- name="navigate",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="navigate",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-nonsense_words_grammar_bigbench = LightevalTaskConfig(
- name="nonsense_words_grammar",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="nonsense_words_grammar",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-novel_concepts_bigbench_lite = LightevalTaskConfig(
- name="novel_concepts",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="novel_concepts",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-numeracy_linear_example_helm = LightevalTaskConfig(
- name="numeracy:linear_example",
- suite=["helm"],
- prompt_function=prompt.numeracy,
- hf_repo="lighteval/numeracy",
- hf_subset="linear_example",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-numeracy_linear_standard_helm = LightevalTaskConfig(
- name="numeracy:linear_standard",
- suite=["helm"],
- prompt_function=prompt.numeracy,
- hf_repo="lighteval/numeracy",
- hf_subset="linear_standard",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-numeracy_parabola_example_helm = LightevalTaskConfig(
- name="numeracy:parabola_example",
- suite=["helm"],
- prompt_function=prompt.numeracy,
- hf_repo="lighteval/numeracy",
- hf_subset="parabola_example",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-numeracy_parabola_standard_helm = LightevalTaskConfig(
- name="numeracy:parabola_standard",
- suite=["helm"],
- prompt_function=prompt.numeracy,
- hf_repo="lighteval/numeracy",
- hf_subset="parabola_standard",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-numeracy_paraboloid_example_helm = LightevalTaskConfig(
- name="numeracy:paraboloid_example",
- suite=["helm"],
- prompt_function=prompt.numeracy,
- hf_repo="lighteval/numeracy",
- hf_subset="paraboloid_example",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-numeracy_paraboloid_standard_helm = LightevalTaskConfig(
- name="numeracy:paraboloid_standard",
- suite=["helm"],
- prompt_function=prompt.numeracy,
- hf_repo="lighteval/numeracy",
- hf_subset="paraboloid_standard",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-numeracy_plane_example_helm = LightevalTaskConfig(
- name="numeracy:plane_example",
- suite=["helm"],
- prompt_function=prompt.numeracy,
- hf_repo="lighteval/numeracy",
- hf_subset="plane_example",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-numeracy_plane_standard_helm = LightevalTaskConfig(
- name="numeracy:plane_standard",
- suite=["helm"],
- prompt_function=prompt.numeracy,
- hf_repo="lighteval/numeracy",
- hf_subset="plane_standard",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-object_counting_bigbench = LightevalTaskConfig(
- name="object_counting",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="object_counting",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-odd_one_out_bigbench = LightevalTaskConfig(
- name="odd_one_out",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="odd_one_out",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-openbookqa_helm = LightevalTaskConfig(
- name="openbookqa",
- suite=["helm", "commonsense_scenario", "helm_general"],
- prompt_function=prompt.openbookqa_helm,
- hf_repo="openbookqa",
- hf_subset="main",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-openbookqa_lighteval = LightevalTaskConfig(
- name="openbookqa",
- suite=["lighteval"],
- prompt_function=prompt.openbookqa,
- hf_repo="openbookqa",
- hf_subset="main",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-operators_bigbench_lite = LightevalTaskConfig(
- name="operators",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="operators",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-paragraph_segmentation_bigbench = LightevalTaskConfig(
- name="paragraph_segmentation",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="paragraph_segmentation",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-parsinlu_qa_bigbench = LightevalTaskConfig(
- name="parsinlu_qa",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="parsinlu_qa",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-parsinlu_reading_comprehension_bigbench_lite = LightevalTaskConfig(
- name="parsinlu_reading_comprehension",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="parsinlu_reading_comprehension",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=None,
- version=0,
-)
-penguins_in_a_table_bigbench = LightevalTaskConfig(
- name="penguins_in_a_table",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="penguins_in_a_table",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-periodic_elements_bigbench = LightevalTaskConfig(
- name="periodic_elements",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="periodic_elements",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-persian_idioms_bigbench = LightevalTaskConfig(
- name="persian_idioms",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="persian_idioms",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-phrase_relatedness_bigbench = LightevalTaskConfig(
- name="phrase_relatedness",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="phrase_relatedness",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-physical_intuition_bigbench = LightevalTaskConfig(
- name="physical_intuition",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="physical_intuition",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-physics_bigbench = LightevalTaskConfig(
- name="physics",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="physics",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-physics_questions_bigbench = LightevalTaskConfig(
- name="physics_questions",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="physics_questions",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-piqa_lighteval = LightevalTaskConfig(
- name="piqa",
- suite=["lighteval"],
- prompt_function=prompt.piqa_harness,
- hf_repo="ybisk/piqa",
- hf_subset="plain_text",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-piqa_helm = LightevalTaskConfig(
- name="piqa",
- suite=["helm", "commonsense_scenario"],
- prompt_function=prompt.piqa_helm,
- hf_repo="ybisk/piqa",
- hf_subset="plain_text",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-play_dialog_same_or_different_bigbench_lite = LightevalTaskConfig(
- name="play_dialog_same_or_different",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="play_dialog_same_or_different",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-polish_sequence_labeling_bigbench = LightevalTaskConfig(
- name="polish_sequence_labeling",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="polish_sequence_labeling",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.f1_score],
- stop_sequence=["\n"],
- version=0,
-)
-presuppositions_as_nli_bigbench = LightevalTaskConfig(
- name="presuppositions_as_nli",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="presuppositions_as_nli",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-prost_lighteval = LightevalTaskConfig(
- name="prost",
- suite=["lighteval"],
- prompt_function=prompt.prost,
- hf_repo="lighteval/prost",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-pubmedqa_lighteval = LightevalTaskConfig(
- name="pubmedqa",
- suite=["lighteval"],
- prompt_function=prompt.pubmed_qa,
- hf_repo="pubmed_qa",
- hf_subset="pqa_labeled",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-pubmedqa_helm = LightevalTaskConfig(
- name="pubmedqa",
- suite=["helm"],
- prompt_function=prompt.pubmed_qa_helm,
- hf_repo="pubmed_qa",
- hf_subset="pqa_labeled",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-qa4mre_2011_lighteval = LightevalTaskConfig(
- name="qa4mre:2011",
- suite=["lighteval"],
- prompt_function=prompt.qa4mre,
- hf_repo="qa4mre",
- hf_subset="2011.main.EN",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-qa4mre_2012_lighteval = LightevalTaskConfig(
- name="qa4mre:2012",
- suite=["lighteval"],
- prompt_function=prompt.qa4mre,
- hf_repo="qa4mre",
- hf_subset="2012.main.EN",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-qa4mre_2013_lighteval = LightevalTaskConfig(
- name="qa4mre:2013",
- suite=["lighteval"],
- prompt_function=prompt.qa4mre,
- hf_repo="qa4mre",
- hf_subset="2013.main.EN",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-qa_wikidata_bigbench = LightevalTaskConfig(
- name="qa_wikidata",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="qa_wikidata",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.bleurt,
- Metrics.bleu,
- Metrics.rouge_t5,
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-qasper_lighteval = LightevalTaskConfig(
- name="qasper",
- suite=["lighteval"],
- prompt_function=prompt.qasper,
- hf_repo="allenai/qasper",
- hf_subset="qasper",
- hf_avail_splits=["train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer})],
- stop_sequence=["\n"],
- version=0,
-)
-qasper_ll_lighteval = LightevalTaskConfig(
- name="qasper_ll",
- suite=["lighteval"],
- prompt_function=prompt.qasper_ll,
- hf_repo="allenai/qasper",
- hf_subset="qasper",
- hf_avail_splits=["train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-quac_helm = LightevalTaskConfig(
- name="quac",
- suite=["helm"],
- prompt_function=prompt.quac,
- hf_repo="lighteval/quac_helm",
- hf_subset="default",
- hf_avail_splits=["train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.f1_score,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-question_selection_bigbench = LightevalTaskConfig(
- name="question_selection",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="question_selection",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-race_high_lighteval = LightevalTaskConfig(
- name="race:high",
- suite=["lighteval", "race"],
- prompt_function=prompt.race,
- hf_repo="EleutherAI/race",
- hf_subset="high",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-raft_ade_corpus_v2_helm = LightevalTaskConfig(
- name="raft:ade_corpus_v2",
- suite=["helm", "helm_general"],
- prompt_function=prompt.raft_ade_corpus_v2,
- hf_repo="ought/raft",
- hf_subset="ade_corpus_v2",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=30,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-raft_banking_77_helm = LightevalTaskConfig(
- name="raft:banking_77",
- suite=["helm", "helm_general"],
- prompt_function=prompt.raft_banking_77,
- hf_repo="ought/raft",
- hf_subset="banking_77",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=30,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-raft_neurips_impact_statement_risks_helm = LightevalTaskConfig(
- name="raft:neurips_impact_statement_risks",
- suite=["helm", "helm_general"],
- prompt_function=prompt.raft_neurips_impact_statement_risks,
- hf_repo="ought/raft",
- hf_subset="neurips_impact_statement_risks",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=30,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-raft_one_stop_english_helm = LightevalTaskConfig(
- name="raft:one_stop_english",
- suite=["helm", "helm_general"],
- prompt_function=prompt.raft_one_stop_english,
- hf_repo="ought/raft",
- hf_subset="one_stop_english",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=30,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-raft_overruling_helm = LightevalTaskConfig(
- name="raft:overruling",
- suite=["helm", "helm_general"],
- prompt_function=prompt.raft_overruling,
- hf_repo="ought/raft",
- hf_subset="overruling",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=30,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-raft_semiconductor_org_types_helm = LightevalTaskConfig(
- name="raft:semiconductor_org_types",
- suite=["helm", "helm_general"],
- prompt_function=prompt.raft_semiconductor_org_types,
- hf_repo="ought/raft",
- hf_subset="semiconductor_org_types",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=30,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-raft_systematic_review_inclusion_helm = LightevalTaskConfig(
- name="raft:systematic_review_inclusion",
- suite=["helm", "helm_general"],
- prompt_function=prompt.raft_systematic_review_inclusion,
- hf_repo="ought/raft",
- hf_subset="systematic_review_inclusion",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=30,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-raft_tai_safety_research_helm = LightevalTaskConfig(
- name="raft:tai_safety_research",
- suite=["helm", "helm_general"],
- prompt_function=prompt.raft_tai_safety_research,
- hf_repo="ought/raft",
- hf_subset="tai_safety_research",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=30,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-raft_terms_of_service_helm = LightevalTaskConfig(
- name="raft:terms_of_service",
- suite=["helm", "helm_general"],
- prompt_function=prompt.raft_terms_of_service,
- hf_repo="ought/raft",
- hf_subset="terms_of_service",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=30,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-raft_tweet_eval_hate_helm = LightevalTaskConfig(
- name="raft:tweet_eval_hate",
- suite=["helm", "helm_general"],
- prompt_function=prompt.raft_tweet_eval_hate,
- hf_repo="ought/raft",
- hf_subset="tweet_eval_hate",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=30,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-raft_twitter_complaints_helm = LightevalTaskConfig(
- name="raft:twitter_complaints",
- suite=["helm", "helm_general"],
- prompt_function=prompt.raft_twitter_complaints,
- hf_repo="ought/raft",
- hf_subset="twitter_complaints",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=30,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score_macro,
- Metrics.f1_score_micro,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-real_or_fake_text_bigbench = LightevalTaskConfig(
- name="real_or_fake_text",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="real_or_fake_text",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-real_toxicity_prompts_helm = LightevalTaskConfig(
- name="real_toxicity_prompts",
- suite=["helm"],
- prompt_function=prompt.real_toxicity_prompts,
- hf_repo="allenai/real-toxicity-prompts",
- hf_subset="default",
- hf_avail_splits=["train"],
- evaluation_splits=["train"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[Metrics.prediction_perplexity],
- stop_sequence=["\n"],
- version=0,
-)
-reasoning_about_colored_objects_bigbench = LightevalTaskConfig(
- name="reasoning_about_colored_objects",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="reasoning_about_colored_objects",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-repeat_copy_logic_bigbench_lite = LightevalTaskConfig(
- name="repeat_copy_logic",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="repeat_copy_logic",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-rephrase_bigbench = LightevalTaskConfig(
- name="rephrase",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="rephrase",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.rouge_t5,
- Metrics.bleu,
- Metrics.loglikelihood_acc,
- Metrics.exact_match(sample_params={"strip_strings": False}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-rhyming_bigbench = LightevalTaskConfig(
- name="rhyming",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="rhyming",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-riddle_sense_bigbench = LightevalTaskConfig(
- name="riddle_sense",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="riddle_sense",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-ruin_names_bigbench = LightevalTaskConfig(
- name="ruin_names",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="ruin_names",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-salient_translation_error_detection_bigbench = LightevalTaskConfig(
- name="salient_translation_error_detection",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="salient_translation_error_detection",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-scientific_press_release_bigbench = LightevalTaskConfig(
- name="scientific_press_release",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="scientific_press_release",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-sciq_lighteval = LightevalTaskConfig(
- name="sciq",
- suite=["lighteval"],
- prompt_function=prompt.sciq,
- hf_repo="sciq",
- hf_subset="default",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-semantic_parsing_in_context_sparc_bigbench = LightevalTaskConfig(
- name="semantic_parsing_in_context_sparc",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="semantic_parsing_in_context_sparc",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-semantic_parsing_spider_bigbench = LightevalTaskConfig(
- name="semantic_parsing_spider",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="semantic_parsing_spider",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-sentence_ambiguity_bigbench = LightevalTaskConfig(
- name="sentence_ambiguity",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="sentence_ambiguity",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-similarities_abstraction_bigbench = LightevalTaskConfig(
- name="similarities_abstraction",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="similarities_abstraction",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-simp_turing_concept_bigbench = LightevalTaskConfig(
- name="simp_turing_concept",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="simp_turing_concept",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-simpleqa = LightevalTaskConfig(
- name="simpleqa",
- suite=["lighteval"],
- prompt_function=prompt.simpleqa,
- hf_repo="lighteval/SimpleQA",
- hf_subset="default",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split="few_shot",
- few_shots_select=None,
- generation_size=2048,
- metrics=[Metrics.simpleqa_judge],
- stop_sequence=["\n"],
- version=0,
-)
-simple_arithmetic_json_bigbench = LightevalTaskConfig(
- name="simple_arithmetic_json",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="simple_arithmetic_json",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-simple_arithmetic_json_multiple_choice_bigbench = LightevalTaskConfig(
- name="simple_arithmetic_json_multiple_choice",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="simple_arithmetic_json_multiple_choice",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-simple_arithmetic_json_subtasks_bigbench = LightevalTaskConfig(
- name="simple_arithmetic_json_subtasks",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="simple_arithmetic_json_subtasks",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-simple_arithmetic_multiple_targets_json_bigbench = LightevalTaskConfig(
- name="simple_arithmetic_multiple_targets_json",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="simple_arithmetic_multiple_targets_json",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-simple_ethical_questions_bigbench = LightevalTaskConfig(
- name="simple_ethical_questions",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="simple_ethical_questions",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-simple_text_editing_bigbench = LightevalTaskConfig(
- name="simple_text_editing",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="simple_text_editing",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-siqa_helm = LightevalTaskConfig(
- name="siqa",
- suite=["helm", "commonsense_scenario"],
- prompt_function=prompt.siqa,
- hf_repo="allenai/social_i_qa",
- hf_subset="default",
- hf_avail_splits=["train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-snarks_bigbench = LightevalTaskConfig(
- name="snarks",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="snarks",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-social_iqa_bigbench = LightevalTaskConfig(
- name="social_iqa",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="social_iqa",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-social_support_bigbench = LightevalTaskConfig(
- name="social_support",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="social_support",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.f1_score_macro],
- stop_sequence=["\n"],
- version=0,
-)
-sports_understanding_bigbench = LightevalTaskConfig(
- name="sports_understanding",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="sports_understanding",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-squad_v2 = LightevalTaskConfig(
- name="squad_v2",
- prompt_function=get_qa_prompt_function(
- Language.ENGLISH,
- lambda line: {
- "question": line["question"],
- "context": line["context"],
- "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
- },
- ),
- suite=("lighteval",),
- hf_repo="rajpurkar/squad_v2",
- hf_subset="squad_v2",
- hf_filter=lambda line: any(ans for ans in line["answers"]["text"] if len(ans) > 0),
- evaluation_splits=("validation",),
- few_shots_split="train",
- stop_sequence=["\n", "Question:", "question:"],
- generation_size=200,
- metrics=(
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- ),
-)
-storycloze_2016_lighteval = LightevalTaskConfig(
- name="storycloze:2016",
- suite=["lighteval", "storycloze"],
- prompt_function=prompt.storycloze,
- hf_repo="MoE-UNC/story_cloze",
- hf_subset="2016",
- hf_avail_splits=["validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-storycloze_2018_lighteval = LightevalTaskConfig(
- name="storycloze:2018",
- suite=["lighteval", "storycloze"],
- prompt_function=prompt.storycloze,
- hf_repo="MoE-UNC/story_cloze",
- hf_subset="2018",
- hf_avail_splits=["validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-strange_stories_bigbench_lite = LightevalTaskConfig(
- name="strange_stories",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="strange_stories",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-strategyqa_bigbench_lite = LightevalTaskConfig(
- name="strategyqa",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="strategyqa",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-sufficient_information_bigbench = LightevalTaskConfig(
- name="sufficient_information",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="sufficient_information",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-suicide_risk_bigbench = LightevalTaskConfig(
- name="suicide_risk",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="suicide_risk",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-summarization_cnn_dm_helm = LightevalTaskConfig(
- name="summarization:cnn-dm",
- suite=["helm", "helm_general"],
- prompt_function=prompt.cnn_dm,
- hf_repo="lighteval/summarization",
- hf_subset="cnn-dm",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=128,
- metrics=[
- Metrics.rouge1,
- Metrics.rouge2,
- Metrics.rougeL,
- Metrics.faithfulness,
- Metrics.extractiveness,
- Metrics.bert_score,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-summarization_xsum_helm = LightevalTaskConfig(
- name="summarization:xsum",
- suite=["helm", "helm_general"],
- prompt_function=prompt.xsum,
- hf_repo="lighteval/summarization",
- hf_subset="xsum",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=64,
- metrics=[
- Metrics.rouge1,
- Metrics.rouge2,
- Metrics.rougeL,
- Metrics.faithfulness,
- Metrics.extractiveness,
- Metrics.bert_score,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-summarization_xsum_sampled_helm = LightevalTaskConfig(
- name="summarization:xsum-sampled",
- suite=["helm"],
- prompt_function=prompt.xsum,
- hf_repo="lighteval/summarization",
- hf_subset="xsum-sampled",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=64,
- metrics=[
- Metrics.rouge1,
- Metrics.rouge2,
- Metrics.rougeL,
- Metrics.faithfulness,
- Metrics.extractiveness,
- Metrics.bert_score,
- ],
- stop_sequence=["\n"],
- version=0,
-)
-super_glue_boolq_lighteval = LightevalTaskConfig(
- name="super_glue:boolq",
- suite=["lighteval", "superglue"],
- prompt_function=prompt.boolq_harness,
- hf_repo="super_glue",
- hf_subset="boolq",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-super_glue_cb_lighteval = LightevalTaskConfig(
- name="super_glue:cb",
- suite=["lighteval", "superglue"],
- prompt_function=prompt.cb,
- hf_repo="super_glue",
- hf_subset="cb",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc, Metrics.multi_f1_numeric],
- stop_sequence=["\n"],
- version=0,
-)
-super_glue_copa_lighteval = LightevalTaskConfig(
- name="super_glue:copa",
- suite=["lighteval", "superglue"],
- prompt_function=prompt.copa,
- hf_repo="super_glue",
- hf_subset="copa",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-super_glue_rte_lighteval = LightevalTaskConfig(
- name="super_glue:rte",
- suite=["lighteval", "superglue"],
- prompt_function=prompt.rte,
- hf_repo="super_glue",
- hf_subset="rte",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-super_glue_multirc_lighteval = LightevalTaskConfig(
- name="super_glue:multirc",
- suite=["lighteval", "superglue"],
- prompt_function=prompt.multirc,
- hf_repo="super_glue",
- hf_subset="multirc",
- hf_avail_splits=["train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-super_glue_wic_lighteval = LightevalTaskConfig(
- name="super_glue:wic",
- suite=["lighteval", "superglue"],
- prompt_function=prompt.wic,
- hf_repo="super_glue",
- hf_subset="wic",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-super_glue_wsc_lighteval = LightevalTaskConfig(
- name="super_glue:wsc",
- suite=["lighteval", "superglue"],
- prompt_function=prompt.wsc,
- hf_repo="super_glue",
- hf_subset="wsc",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-swahili_english_proverbs_bigbench = LightevalTaskConfig(
- name="swahili_english_proverbs",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="swahili_english_proverbs",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-swag_lighteval = LightevalTaskConfig(
- name="swag",
- suite=["lighteval"],
- prompt_function=prompt.swag,
- hf_repo="swag",
- hf_subset="regular",
- hf_avail_splits=["train", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-swedish_to_german_proverbs_bigbench = LightevalTaskConfig(
- name="swedish_to_german_proverbs",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="swedish_to_german_proverbs",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-symbol_interpretation_bigbench_lite = LightevalTaskConfig(
- name="symbol_interpretation",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="symbol_interpretation",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-synthetic_reasoning_induction_helm = LightevalTaskConfig(
- name="synthetic_reasoning:induction",
- suite=["helm"],
- prompt_function=prompt.synthetic_reasoning,
- hf_repo="lighteval/synthetic_reasoning",
- hf_subset="induction",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=50,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-synthetic_reasoning_natural_easy_helm = LightevalTaskConfig(
- name="synthetic_reasoning:natural_easy",
- suite=["helm"],
- prompt_function=prompt.synthetic_reasoning_natural,
- hf_repo="lighteval/synthetic_reasoning_natural",
- hf_subset="easy",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[Metrics.exact_match, Metrics.f1_score],
- stop_sequence=["\n"],
- version=0,
-)
-synthetic_reasoning_natural_hard_helm = LightevalTaskConfig(
- name="synthetic_reasoning:natural_hard",
- suite=["helm"],
- prompt_function=prompt.synthetic_reasoning_natural,
- hf_repo="lighteval/synthetic_reasoning_natural",
- hf_subset="hard",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[Metrics.exact_match, Metrics.f1_score],
- stop_sequence=["\n"],
- version=0,
-)
-synthetic_reasoning_pattern_match_helm = LightevalTaskConfig(
- name="synthetic_reasoning:pattern_match",
- suite=["helm"],
- prompt_function=prompt.synthetic_reasoning,
- hf_repo="lighteval/synthetic_reasoning",
- hf_subset="pattern_match",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=50,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-synthetic_reasoning_variable_substitution_helm = LightevalTaskConfig(
- name="synthetic_reasoning:variable_substitution",
- suite=["helm"],
- prompt_function=prompt.synthetic_reasoning,
- hf_repo="lighteval/synthetic_reasoning",
- hf_subset="variable_substitution",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=50,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-tellmewhy_bigbench = LightevalTaskConfig(
- name="tellmewhy",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="tellmewhy",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.bleu, Metrics.rouge_t5],
- stop_sequence=["\n"],
- version=0,
-)
-temporal_sequences_bigbench = LightevalTaskConfig(
- name="temporal_sequences",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="temporal_sequences",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-tense_bigbench = LightevalTaskConfig(
- name="tense",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="tense",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_arxiv_helm = LightevalTaskConfig(
- name="the_pile:arxiv",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="arxiv",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_bibliotik_helm = LightevalTaskConfig(
- name="the_pile:bibliotik",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="bibliotik",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_commoncrawl_helm = LightevalTaskConfig(
- name="the_pile:commoncrawl",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="commoncrawl",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_dm_mathematics_helm = LightevalTaskConfig(
- name="the_pile:dm-mathematics",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="dm-mathematics",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_enron_helm = LightevalTaskConfig(
- name="the_pile:enron",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="enron",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_europarl_helm = LightevalTaskConfig(
- name="the_pile:europarl",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="europarl",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_freelaw_helm = LightevalTaskConfig(
- name="the_pile:freelaw",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="freelaw",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_github_helm = LightevalTaskConfig(
- name="the_pile:github",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="github",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_gutenberg_helm = LightevalTaskConfig(
- name="the_pile:gutenberg",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="gutenberg",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_hackernews_helm = LightevalTaskConfig(
- name="the_pile:hackernews",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="hackernews",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_nih_exporter_helm = LightevalTaskConfig(
- name="the_pile:nih-exporter",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="nih-exporter",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_opensubtitles_helm = LightevalTaskConfig(
- name="the_pile:opensubtitles",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="opensubtitles",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_openwebtext2_helm = LightevalTaskConfig(
- name="the_pile:openwebtext2",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="openwebtext2",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_pubmed_abstracts_helm = LightevalTaskConfig(
- name="the_pile:pubmed-abstracts",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="pubmed-abstracts",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_pubmed_central_helm = LightevalTaskConfig(
- name="the_pile:pubmed-central",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="pubmed-central",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_stackexchange_helm = LightevalTaskConfig(
- name="the_pile:stackexchange",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="stackexchange",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_uspto_helm = LightevalTaskConfig(
-    name="the_pile:uspto",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="uspto",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_wikipedia_helm = LightevalTaskConfig(
- name="the_pile:wikipedia",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="wikipedia",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-the_pile_youtubesubtitles_helm = LightevalTaskConfig(
- name="the_pile:youtubesubtitles",
- suite=["helm"],
- prompt_function=prompt.the_pile,
- hf_repo="lighteval/pile_helm",
- hf_subset="youtubesubtitles",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-timedial_bigbench = LightevalTaskConfig(
- name="timedial",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="timedial",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-toxigen_lighteval = LightevalTaskConfig(
- name="toxigen",
- suite=["lighteval"],
- prompt_function=prompt.toxigen,
- hf_repo="skg/toxigen-data",
- hf_subset="annotated",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-topical_chat_bigbench = LightevalTaskConfig(
- name="topical_chat",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="topical_chat",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.loglikelihood_acc, Metrics.bleurt],
- stop_sequence=["\n"],
- version=0,
-)
-tracking_shuffled_objects_bigbench = LightevalTaskConfig(
- name="tracking_shuffled_objects",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="tracking_shuffled_objects",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-triviaqa_lighteval = LightevalTaskConfig(
- name="triviaqa",
- suite=["lighteval"],
- prompt_function=prompt.triviaqa,
- hf_repo="trivia_qa",
- hf_subset="rc.nocontext",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=20,
- metrics=[
- Metrics.exact_match(sample_params={"strip_strings": True, "normalize_pred": harness_triviaqa_normalizer})
- ],
- stop_sequence=["\n", ".", ","],
- version=0,
-)
-truthfulqa_gen_lighteval = LightevalTaskConfig(
- name="truthfulqa:gen",
- suite=["lighteval"],
- prompt_function=prompt.truthful_qa_generative,
- hf_repo="truthful_qa",
- hf_subset="generation",
- hf_avail_splits=["validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=200,
- metrics=[Metrics.bleu, Metrics.rouge_t5],
- stop_sequence=["\n"],
- version=0,
-)
-truthfulqa_mc_leaderboard = LightevalTaskConfig(
- name="truthfulqa:mc",
- suite=["leaderboard"],
- prompt_function=prompt.truthful_qa_multiple_choice,
- hf_repo="truthful_qa",
- hf_subset="multiple_choice",
- hf_avail_splits=["validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.truthfulqa_mc_metrics],
- stop_sequence=["\n"],
- version=0,
-)
-truthfulqa_helm = LightevalTaskConfig(
- name="truthfulqa",
- suite=["helm", "helm_general"],
- prompt_function=prompt.truthful_qa_helm,
- hf_repo="lighteval/truthfulqa_helm",
- hf_subset="default",
- hf_avail_splits=["train", "valid"],
- evaluation_splits=["valid"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-twitterAAE_aa_helm = LightevalTaskConfig(
- name="twitterAAE:aa",
- suite=["helm"],
- prompt_function=prompt.twitter_aae,
- hf_repo="lighteval/twitterAAE",
- hf_subset="aa",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-twitterAAE_white_helm = LightevalTaskConfig(
- name="twitterAAE:white",
- suite=["helm"],
- prompt_function=prompt.twitter_aae,
- hf_repo="lighteval/twitterAAE",
- hf_subset="white",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-understanding_fables_bigbench = LightevalTaskConfig(
- name="understanding_fables",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="understanding_fables",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-undo_permutation_bigbench = LightevalTaskConfig(
- name="undo_permutation",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="undo_permutation",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-unit_conversion_bigbench = LightevalTaskConfig(
- name="unit_conversion",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="unit_conversion",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-unit_interpretation_bigbench = LightevalTaskConfig(
- name="unit_interpretation",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="unit_interpretation",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-unnatural_in_context_learning_bigbench = LightevalTaskConfig(
- name="unnatural_in_context_learning",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="unnatural_in_context_learning",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-unscramble_anagrams1_lighteval = LightevalTaskConfig(
- name="unscramble:anagrams1",
- suite=["lighteval", "unscramble"],
- prompt_function=prompt.unscramble,
- hf_repo="lighteval/GPT3_unscramble",
- hf_subset="default",
- hf_avail_splits=["mid_word_1_anagrams"],
- evaluation_splits=["mid_word_1_anagrams"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-unscramble_anagrams2_lighteval = LightevalTaskConfig(
- name="unscramble:anagrams2",
- suite=["lighteval", "unscramble"],
- prompt_function=prompt.unscramble,
- hf_repo="lighteval/GPT3_unscramble",
- hf_subset="default",
- hf_avail_splits=["mid_word_2_anagrams"],
- evaluation_splits=["mid_word_2_anagrams"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-unscramble_cycle_letters_lighteval = LightevalTaskConfig(
- name="unscramble:cycle_letters",
- suite=["lighteval", "unscramble"],
- prompt_function=prompt.unscramble,
- hf_repo="lighteval/GPT3_unscramble",
- hf_subset="default",
- hf_avail_splits=["cycle_letters_in_word"],
- evaluation_splits=["cycle_letters_in_word"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-unscramble_random_insertion_lighteval = LightevalTaskConfig(
- name="unscramble:random_insertion",
- suite=["lighteval", "unscramble"],
- prompt_function=prompt.unscramble,
- hf_repo="lighteval/GPT3_unscramble",
- hf_subset="default",
- hf_avail_splits=["random_insertion_in_word"],
- evaluation_splits=["random_insertion_in_word"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-unscramble_reversed_words_lighteval = LightevalTaskConfig(
- name="unscramble:reversed_words",
- suite=["lighteval", "unscramble"],
- prompt_function=prompt.unscramble,
- hf_repo="lighteval/GPT3_unscramble",
- hf_subset="default",
- hf_avail_splits=["reversed_words"],
- evaluation_splits=["reversed_words"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=5,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-vitaminc_fact_verification_bigbench_lite = LightevalTaskConfig(
- name="vitaminc_fact_verification",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="vitaminc_fact_verification",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-webqs_lighteval = LightevalTaskConfig(
- name="webqs",
- suite=["lighteval"],
- prompt_function=prompt.webqs,
- hf_repo="web_questions",
- hf_subset="default",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.acc_golds_likelihood],
- stop_sequence=["\n"],
- version=0,
-)
-what_is_the_tao_bigbench = LightevalTaskConfig(
- name="what_is_the_tao",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="what_is_the_tao",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-which_wiki_edit_bigbench = LightevalTaskConfig(
- name="which_wiki_edit",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="which_wiki_edit",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_applies_to_jurisdiction_helm = LightevalTaskConfig(
- name="wikifact:applies_to_jurisdiction",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="applies_to_jurisdiction",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_atomic_number_helm = LightevalTaskConfig(
- name="wikifact:atomic_number",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="atomic_number",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_author_helm = LightevalTaskConfig(
- name="wikifact:author",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="author",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_award_received_helm = LightevalTaskConfig(
- name="wikifact:award_received",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="award_received",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_basic_form_of_government_helm = LightevalTaskConfig(
- name="wikifact:basic_form_of_government",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="basic_form_of_government",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_capital_helm = LightevalTaskConfig(
- name="wikifact:capital",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="capital",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_capital_of_helm = LightevalTaskConfig(
- name="wikifact:capital_of",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="capital_of",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_central_bank_helm = LightevalTaskConfig(
- name="wikifact:central_bank",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="central_bank",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_composer_helm = LightevalTaskConfig(
- name="wikifact:composer",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="composer",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_continent_helm = LightevalTaskConfig(
- name="wikifact:continent",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="continent",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_country_helm = LightevalTaskConfig(
- name="wikifact:country",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="country",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_country_of_citizenship_helm = LightevalTaskConfig(
- name="wikifact:country_of_citizenship",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="country_of_citizenship",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_country_of_origin_helm = LightevalTaskConfig(
- name="wikifact:country_of_origin",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="country_of_origin",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_creator_helm = LightevalTaskConfig(
- name="wikifact:creator",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="creator",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_currency_helm = LightevalTaskConfig(
- name="wikifact:currency",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="currency",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_defendant_helm = LightevalTaskConfig(
- name="wikifact:defendant",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="defendant",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_developer_helm = LightevalTaskConfig(
- name="wikifact:developer",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="developer",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_diplomatic_relation_helm = LightevalTaskConfig(
- name="wikifact:diplomatic_relation",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="diplomatic_relation",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_director_helm = LightevalTaskConfig(
- name="wikifact:director",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="director",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_discoverer_or_inventor_helm = LightevalTaskConfig(
- name="wikifact:discoverer_or_inventor",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="discoverer_or_inventor",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_drug_or_therapy_used_for_treatment_helm = LightevalTaskConfig(
- name="wikifact:drug_or_therapy_used_for_treatment",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="drug_or_therapy_used_for_treatment",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_educated_at_helm = LightevalTaskConfig(
- name="wikifact:educated_at",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="educated_at",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_electron_configuration_helm = LightevalTaskConfig(
- name="wikifact:electron_configuration",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="electron_configuration",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_employer_helm = LightevalTaskConfig(
- name="wikifact:employer",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="employer",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_field_of_work_helm = LightevalTaskConfig(
- name="wikifact:field_of_work",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="field_of_work",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_file_extension_helm = LightevalTaskConfig(
- name="wikifact:file_extension",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="file_extension",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_genetic_association_helm = LightevalTaskConfig(
- name="wikifact:genetic_association",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="genetic_association",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_genre_helm = LightevalTaskConfig(
- name="wikifact:genre",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="genre",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_has_part_helm = LightevalTaskConfig(
- name="wikifact:has_part",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="has_part",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_head_of_government_helm = LightevalTaskConfig(
- name="wikifact:head_of_government",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="head_of_government",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_head_of_state_helm = LightevalTaskConfig(
- name="wikifact:head_of_state",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="head_of_state",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_headquarters_location_helm = LightevalTaskConfig(
- name="wikifact:headquarters_location",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="headquarters_location",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_industry_helm = LightevalTaskConfig(
- name="wikifact:industry",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="industry",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_influenced_by_helm = LightevalTaskConfig(
- name="wikifact:influenced_by",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="influenced_by",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_instance_of_helm = LightevalTaskConfig(
- name="wikifact:instance_of",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="instance_of",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_instrument_helm = LightevalTaskConfig(
- name="wikifact:instrument",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="instrument",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_language_of_work_or_name_helm = LightevalTaskConfig(
- name="wikifact:language_of_work_or_name",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="language_of_work_or_name",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_languages_spoken_written_or_signed_helm = LightevalTaskConfig(
- name="wikifact:languages_spoken_written_or_signed",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="languages_spoken_written_or_signed",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_laws_applied_helm = LightevalTaskConfig(
- name="wikifact:laws_applied",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="laws_applied",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_located_in_the_administrative_territorial_entity_helm = LightevalTaskConfig(
- name="wikifact:located_in_the_administrative_territorial_entity",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="located_in_the_administrative_territorial_entity",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_location_helm = LightevalTaskConfig(
- name="wikifact:location",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="location",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_location_of_discovery_helm = LightevalTaskConfig(
- name="wikifact:location_of_discovery",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="location_of_discovery",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_location_of_formation_helm = LightevalTaskConfig(
- name="wikifact:location_of_formation",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="location_of_formation",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_majority_opinion_by_helm = LightevalTaskConfig(
- name="wikifact:majority_opinion_by",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="majority_opinion_by",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_manufacturer_helm = LightevalTaskConfig(
- name="wikifact:manufacturer",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="manufacturer",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_measured_physical_quantity_helm = LightevalTaskConfig(
- name="wikifact:measured_physical_quantity",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="measured_physical_quantity",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_medical_condition_treated_helm = LightevalTaskConfig(
- name="wikifact:medical_condition_treated",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="medical_condition_treated",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_member_of_helm = LightevalTaskConfig(
- name="wikifact:member_of",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="member_of",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_member_of_political_party_helm = LightevalTaskConfig(
- name="wikifact:member_of_political_party",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="member_of_political_party",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_member_of_sports_team_helm = LightevalTaskConfig(
- name="wikifact:member_of_sports_team",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="member_of_sports_team",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_movement_helm = LightevalTaskConfig(
- name="wikifact:movement",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="movement",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_named_after_helm = LightevalTaskConfig(
- name="wikifact:named_after",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="named_after",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_native_language_helm = LightevalTaskConfig(
- name="wikifact:native_language",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="native_language",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_number_of_processor_cores_helm = LightevalTaskConfig(
- name="wikifact:number_of_processor_cores",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="number_of_processor_cores",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_occupation_helm = LightevalTaskConfig(
- name="wikifact:occupation",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="occupation",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_office_held_by_head_of_government_helm = LightevalTaskConfig(
- name="wikifact:office_held_by_head_of_government",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="office_held_by_head_of_government",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_office_held_by_head_of_state_helm = LightevalTaskConfig(
- name="wikifact:office_held_by_head_of_state",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="office_held_by_head_of_state",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_official_language_helm = LightevalTaskConfig(
- name="wikifact:official_language",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="official_language",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_operating_system_helm = LightevalTaskConfig(
- name="wikifact:operating_system",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="operating_system",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_original_language_of_film_or_TV_show_helm = LightevalTaskConfig(
- name="wikifact:original_language_of_film_or_TV_show",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="original_language_of_film_or_TV_show",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_original_network_helm = LightevalTaskConfig(
- name="wikifact:original_network",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="original_network",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_overrules_helm = LightevalTaskConfig(
- name="wikifact:overrules",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="overrules",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_owned_by_helm = LightevalTaskConfig(
- name="wikifact:owned_by",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="owned_by",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_part_of_helm = LightevalTaskConfig(
- name="wikifact:part_of",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="part_of",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_participating_team_helm = LightevalTaskConfig(
- name="wikifact:participating_team",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="participating_team",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_place_of_birth_helm = LightevalTaskConfig(
- name="wikifact:place_of_birth",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="place_of_birth",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_place_of_death_helm = LightevalTaskConfig(
- name="wikifact:place_of_death",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="place_of_death",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_plaintiff_helm = LightevalTaskConfig(
- name="wikifact:plaintiff",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="plaintiff",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_position_held_helm = LightevalTaskConfig(
- name="wikifact:position_held",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="position_held",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_position_played_on_team_helm = LightevalTaskConfig(
- name="wikifact:position_played_on_team",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="position_played_on_team",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_programming_language_helm = LightevalTaskConfig(
- name="wikifact:programming_language",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="programming_language",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_recommended_unit_of_measurement_helm = LightevalTaskConfig(
- name="wikifact:recommended_unit_of_measurement",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="recommended_unit_of_measurement",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_record_label_helm = LightevalTaskConfig(
- name="wikifact:record_label",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="record_label",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_religion_helm = LightevalTaskConfig(
- name="wikifact:religion",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="religion",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_repealed_by_helm = LightevalTaskConfig(
- name="wikifact:repealed_by",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="repealed_by",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_shares_border_with_helm = LightevalTaskConfig(
- name="wikifact:shares_border_with",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="shares_border_with",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_solved_by_helm = LightevalTaskConfig(
- name="wikifact:solved_by",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="solved_by",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_statement_describes_helm = LightevalTaskConfig(
- name="wikifact:statement_describes",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="statement_describes",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_stock_exchange_helm = LightevalTaskConfig(
- name="wikifact:stock_exchange",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="stock_exchange",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_subclass_of_helm = LightevalTaskConfig(
- name="wikifact:subclass_of",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="subclass_of",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_subsidiary_helm = LightevalTaskConfig(
- name="wikifact:subsidiary",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="subsidiary",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_symptoms_and_signs_helm = LightevalTaskConfig(
- name="wikifact:symptoms_and_signs",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="symptoms_and_signs",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_therapeutic_area_helm = LightevalTaskConfig(
- name="wikifact:therapeutic_area",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="therapeutic_area",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_time_of_discovery_or_invention_helm = LightevalTaskConfig(
- name="wikifact:time_of_discovery_or_invention",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="time_of_discovery_or_invention",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_twinned_administrative_body_helm = LightevalTaskConfig(
- name="wikifact:twinned_administrative_body",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="twinned_administrative_body",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikifact_work_location_helm = LightevalTaskConfig(
- name="wikifact:work_location",
- suite=["helm"],
- prompt_function=prompt.wikifact,
- hf_repo="lighteval/wikifact",
- hf_subset="work_location",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=8,
- metrics=[
- Metrics.exact_match,
- Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
- Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
- Metrics.exact_match(
- sample_params={
- "normalize_gold": helm_normalizer,
- "normalize_pred": helm_normalizer,
- "type_exact_match": "prefix",
- }
- ),
- ],
- stop_sequence=["\n"],
- version=0,
-)
-wikitext_2_lighteval = LightevalTaskConfig(
- name="wikitext:2",
- suite=["lighteval"],
- prompt_function=prompt.wikitext,
- hf_repo="wikitext",
- hf_subset="wikitext-2-raw-v1",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-wikitext_103_document_level_harness = LightevalTaskConfig(
- name="wikitext:103:document_level",
- suite=["harness"],
- prompt_function=prompt.wikitext_harness,
- hf_repo="EleutherAI/wikitext_document_level",
- hf_subset="wikitext-103-raw-v1",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-wikitext_103_document_level_helm = LightevalTaskConfig(
- name="wikitext:103:document_level",
- suite=["helm"],
- prompt_function=prompt.wikitext_helm,
- hf_repo="EleutherAI/wikitext_document_level",
- hf_subset="wikitext-103-raw-v1",
- hf_avail_splits=["train", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
- stop_sequence=["\n"],
- version=0,
-)
-wino_x_german_bigbench = LightevalTaskConfig(
- name="wino_x_german",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="wino_x_german",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-winogrande_leaderboard = LightevalTaskConfig(
- name="winogrande",
- suite=["leaderboard"],
- prompt_function=prompt.winogrande,
- hf_repo="winogrande",
- hf_subset="winogrande_xl",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation"],
- few_shots_split=None,
- few_shots_select="random_sampling",
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-winowhy_bigbench_lite = LightevalTaskConfig(
- name="winowhy",
- suite=["bigbench_lite", "bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench_whitespace_after_query,
- hf_repo="tasksource/bigbench",
- hf_subset="winowhy",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-wmt08_cs_en_lighteval = LightevalTaskConfig(
- name="wmt08:cs-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt08_cs-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt08_de_en_lighteval = LightevalTaskConfig(
- name="wmt08:de-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt08_de-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt08_en_cs_lighteval = LightevalTaskConfig(
- name="wmt08:en-cs",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt08_en-cs",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt08_en_de_lighteval = LightevalTaskConfig(
- name="wmt08:en-de",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt08_en-de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt08_en_es_lighteval = LightevalTaskConfig(
- name="wmt08:en-es",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt08_en-es",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt08_en_fr_lighteval = LightevalTaskConfig(
- name="wmt08:en-fr",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt08_en-fr",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt08_en_hu_lighteval = LightevalTaskConfig(
- name="wmt08:en-hu",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt08_en-hu",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt08_es_en_lighteval = LightevalTaskConfig(
- name="wmt08:es-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt08_es-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt08_fr_en_lighteval = LightevalTaskConfig(
- name="wmt08:fr-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt08_fr-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt08_hu_en_lighteval = LightevalTaskConfig(
- name="wmt08:hu-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt08_hu-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt09_cs_en_lighteval = LightevalTaskConfig(
- name="wmt09:cs-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt09_cs-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt09_de_en_lighteval = LightevalTaskConfig(
- name="wmt09:de-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt09_de-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt09_en_cs_lighteval = LightevalTaskConfig(
- name="wmt09:en-cs",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt09_en-cs",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt09_en_de_lighteval = LightevalTaskConfig(
- name="wmt09:en-de",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt09_en-de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt09_en_es_lighteval = LightevalTaskConfig(
- name="wmt09:en-es",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt09_en-es",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt09_en_fr_lighteval = LightevalTaskConfig(
- name="wmt09:en-fr",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt09_en-fr",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt09_en_hu_lighteval = LightevalTaskConfig(
- name="wmt09:en-hu",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt09_en-hu",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt09_en_it_lighteval = LightevalTaskConfig(
- name="wmt09:en-it",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt09_en-it",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt09_es_en_lighteval = LightevalTaskConfig(
- name="wmt09:es-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt09_es-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt09_fr_en_lighteval = LightevalTaskConfig(
- name="wmt09:fr-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt09_fr-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt09_hu_en_lighteval = LightevalTaskConfig(
- name="wmt09:hu-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt09_hu-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt09_it_en_lighteval = LightevalTaskConfig(
- name="wmt09:it-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt09_it-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt10_cs_en_lighteval = LightevalTaskConfig(
- name="wmt10:cs-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt10_cs-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt10_de_en_lighteval = LightevalTaskConfig(
- name="wmt10:de-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt10_de-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt10_en_cs_lighteval = LightevalTaskConfig(
- name="wmt10:en-cs",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt10_en-cs",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt10_en_de_lighteval = LightevalTaskConfig(
- name="wmt10:en-de",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt10_en-de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt10_en_es_lighteval = LightevalTaskConfig(
- name="wmt10:en-es",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt10_en-es",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt10_en_fr_lighteval = LightevalTaskConfig(
- name="wmt10:en-fr",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt10_en-fr",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt10_es_en_lighteval = LightevalTaskConfig(
- name="wmt10:es-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt10_es-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt10_fr_en_lighteval = LightevalTaskConfig(
- name="wmt10:fr-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt10_fr-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt11_cs_en_lighteval = LightevalTaskConfig(
- name="wmt11:cs-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt11_cs-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt11_de_en_lighteval = LightevalTaskConfig(
- name="wmt11:de-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt11_de-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt11_en_cs_lighteval = LightevalTaskConfig(
- name="wmt11:en-cs",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt11_en-cs",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt11_en_de_lighteval = LightevalTaskConfig(
- name="wmt11:en-de",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt11_en-de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt11_en_es_lighteval = LightevalTaskConfig(
- name="wmt11:en-es",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt11_en-es",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt11_en_fr_lighteval = LightevalTaskConfig(
- name="wmt11:en-fr",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt11_en-fr",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt11_es_en_lighteval = LightevalTaskConfig(
- name="wmt11:es-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt11_es-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt11_fr_en_lighteval = LightevalTaskConfig(
- name="wmt11:fr-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt11_fr-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt12_cs_en_lighteval = LightevalTaskConfig(
- name="wmt12:cs-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt12_cs-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt12_de_en_lighteval = LightevalTaskConfig(
- name="wmt12:de-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt12_de-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt12_en_cs_lighteval = LightevalTaskConfig(
- name="wmt12:en-cs",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt12_en-cs",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt12_en_de_lighteval = LightevalTaskConfig(
- name="wmt12:en-de",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt12_en-de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt12_en_es_lighteval = LightevalTaskConfig(
- name="wmt12:en-es",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt12_en-es",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt12_en_fr_lighteval = LightevalTaskConfig(
- name="wmt12:en-fr",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt12_en-fr",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt12_es_en_lighteval = LightevalTaskConfig(
- name="wmt12:es-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt12_es-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt12_fr_en_lighteval = LightevalTaskConfig(
- name="wmt12:fr-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt12_fr-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt13_cs_en_lighteval = LightevalTaskConfig(
- name="wmt13:cs-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt13_cs-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt13_de_en_lighteval = LightevalTaskConfig(
- name="wmt13:de-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt13_de-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt13_en_cs_lighteval = LightevalTaskConfig(
- name="wmt13:en-cs",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt13_en-cs",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt13_en_de_lighteval = LightevalTaskConfig(
- name="wmt13:en-de",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt13_en-de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt13_en_es_lighteval = LightevalTaskConfig(
- name="wmt13:en-es",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt13_en-es",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt13_en_fr_lighteval = LightevalTaskConfig(
- name="wmt13:en-fr",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt13_en-fr",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt13_en_ru_lighteval = LightevalTaskConfig(
- name="wmt13:en-ru",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt13_en-ru",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt13_es_en_lighteval = LightevalTaskConfig(
- name="wmt13:es-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt13_es-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt13_fr_en_lighteval = LightevalTaskConfig(
- name="wmt13:fr-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt13_fr-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt13_ru_en_lighteval = LightevalTaskConfig(
- name="wmt13:ru-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt13_ru-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt14_cs_en_lighteval = LightevalTaskConfig(
- name="wmt14:cs-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt14_cs-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt14_de_en_lighteval = LightevalTaskConfig(
- name="wmt14:de-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt14_de-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt14_en_cs_lighteval = LightevalTaskConfig(
- name="wmt14:en-cs",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt14_en-cs",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt14_en_de_lighteval = LightevalTaskConfig(
- name="wmt14:en-de",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt14_en-de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt14_en_fr_lighteval = LightevalTaskConfig(
- name="wmt14:en-fr",
- suite=["lighteval", "gpt3_benchmarks"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="wmt14",
- hf_subset="fr-en",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt14_en_fr_lighteval = LightevalTaskConfig(
- name="wmt14:en-fr",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt14_en-fr",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt14_en_hi_lighteval = LightevalTaskConfig(
- name="wmt14:en-hi",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt14_en-hi",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt14_en_ru_lighteval = LightevalTaskConfig(
- name="wmt14:en-ru",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt14_en-ru",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt14_fr_en_lighteval = LightevalTaskConfig(
- name="wmt14:fr-en",
- suite=["lighteval", "gpt3_benchmarks"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="wmt14",
- hf_subset="fr-en",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt14_fr_en_lighteval = LightevalTaskConfig(
- name="wmt14:fr-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt14_fr-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt14_hi_en_lighteval = LightevalTaskConfig(
- name="wmt14:hi-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt14_hi-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt14_ru_en_lighteval = LightevalTaskConfig(
- name="wmt14:ru-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt14_ru-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt14_cs_en_helm = LightevalTaskConfig(
- name="wmt14:cs-en",
- suite=["helm"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/wmt_14",
- hf_subset="cs-en",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.bleu],
- stop_sequence=["\n"],
- version=0,
-)
-wmt14_de_en_helm = LightevalTaskConfig(
- name="wmt14:de-en",
- suite=["helm"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/wmt_14",
- hf_subset="de-en",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.bleu],
- stop_sequence=["\n"],
- version=0,
-)
-wmt14_fr_en_helm = LightevalTaskConfig(
- name="wmt14:fr-en",
- suite=["helm"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/wmt_14",
- hf_subset="fr-en",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.bleu],
- stop_sequence=["\n"],
- version=0,
-)
-wmt14_hi_en_helm = LightevalTaskConfig(
- name="wmt14:hi-en",
- suite=["helm"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/wmt_14",
- hf_subset="hi-en",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.bleu],
- stop_sequence=["\n"],
- version=0,
-)
-wmt14_ru_en_helm = LightevalTaskConfig(
- name="wmt14:ru-en",
- suite=["helm"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/wmt_14",
- hf_subset="ru-en",
- hf_avail_splits=["train", "test", "validation"],
- evaluation_splits=["validation", "test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=100,
- metrics=[Metrics.bleu],
- stop_sequence=["\n"],
- version=0,
-)
-wmt15_cs_en_lighteval = LightevalTaskConfig(
- name="wmt15:cs-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt15_cs-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt15_de_en_lighteval = LightevalTaskConfig(
- name="wmt15:de-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt15_de-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt15_en_cs_lighteval = LightevalTaskConfig(
- name="wmt15:en-cs",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt15_en-cs",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt15_en_de_lighteval = LightevalTaskConfig(
- name="wmt15:en-de",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt15_en-de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt15_en_fi_lighteval = LightevalTaskConfig(
- name="wmt15:en-fi",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt15_en-fi",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt15_en_fr_lighteval = LightevalTaskConfig(
- name="wmt15:en-fr",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt15_en-fr",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt15_en_ru_lighteval = LightevalTaskConfig(
- name="wmt15:en-ru",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt15_en-ru",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt15_fi_en_lighteval = LightevalTaskConfig(
- name="wmt15:fi-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt15_fi-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt15_fr_en_lighteval = LightevalTaskConfig(
- name="wmt15:fr-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt15_fr-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt15_ru_en_lighteval = LightevalTaskConfig(
- name="wmt15:ru-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt15_ru-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt16_cs_en_lighteval = LightevalTaskConfig(
- name="wmt16:cs-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt16_cs-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt16_de_en_lighteval = LightevalTaskConfig(
- name="wmt16:de-en",
- suite=["lighteval", "gpt3_benchmarks"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="wmt16",
- hf_subset="de-en",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt16_de_en_lighteval = LightevalTaskConfig(
- name="wmt16:de-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt16_de-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt16_en_cs_lighteval = LightevalTaskConfig(
- name="wmt16:en-cs",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt16_en-cs",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt16_en_de_lighteval = LightevalTaskConfig(
- name="wmt16:en-de",
- suite=["lighteval", "gpt3_benchmarks"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="wmt16",
- hf_subset="de-en",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt16_en_de_lighteval = LightevalTaskConfig(
- name="wmt16:en-de",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt16_en-de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt16_en_fi_lighteval = LightevalTaskConfig(
- name="wmt16:en-fi",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt16_en-fi",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt16_en_ro_lighteval = LightevalTaskConfig(
- name="wmt16:en-ro",
- suite=["lighteval", "gpt3_benchmarks"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="wmt16",
- hf_subset="ro-en",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt16_en_ro_lighteval = LightevalTaskConfig(
- name="wmt16:en-ro",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt16_en-ro",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt16_en_ru_lighteval = LightevalTaskConfig(
- name="wmt16:en-ru",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt16_en-ru",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt16_en_tr_lighteval = LightevalTaskConfig(
- name="wmt16:en-tr",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt16_en-tr",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt16_fi_en_lighteval = LightevalTaskConfig(
- name="wmt16:fi-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt16_fi-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt16_ro_en_lighteval = LightevalTaskConfig(
- name="wmt16:ro-en",
- suite=["lighteval", "gpt3_benchmarks"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="wmt16",
- hf_subset="ro-en",
- hf_avail_splits=["train", "validation", "test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt16_ro_en_lighteval = LightevalTaskConfig(
- name="wmt16:ro-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt16_ro-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt16_ru_en_lighteval = LightevalTaskConfig(
- name="wmt16:ru-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt16_ru-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt16_tr_en_lighteval = LightevalTaskConfig(
- name="wmt16:tr-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt16_tr-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt17_cs_en_lighteval = LightevalTaskConfig(
- name="wmt17:cs-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt17_cs-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt17_de_en_lighteval = LightevalTaskConfig(
- name="wmt17:de-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt17_de-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt17_en_cs_lighteval = LightevalTaskConfig(
- name="wmt17:en-cs",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt17_en-cs",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt17_en_de_lighteval = LightevalTaskConfig(
- name="wmt17:en-de",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt17_en-de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt17_en_fi_lighteval = LightevalTaskConfig(
- name="wmt17:en-fi",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt17_en-fi",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt17_en_lv_lighteval = LightevalTaskConfig(
- name="wmt17:en-lv",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt17_en-lv",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt17_en_ru_lighteval = LightevalTaskConfig(
- name="wmt17:en-ru",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt17_en-ru",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt17_en_tr_lighteval = LightevalTaskConfig(
- name="wmt17:en-tr",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt17_en-tr",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt17_en_zh_lighteval = LightevalTaskConfig(
- name="wmt17:en-zh",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt17_en-zh",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt17_fi_en_lighteval = LightevalTaskConfig(
- name="wmt17:fi-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt17_fi-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt17_lv_en_lighteval = LightevalTaskConfig(
- name="wmt17:lv-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt17_lv-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt17_ru_en_lighteval = LightevalTaskConfig(
- name="wmt17:ru-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt17_ru-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt17_tr_en_lighteval = LightevalTaskConfig(
- name="wmt17:tr-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt17_tr-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt17_zh_en_lighteval = LightevalTaskConfig(
- name="wmt17:zh-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt17_zh-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt18_cs_en_lighteval = LightevalTaskConfig(
- name="wmt18:cs-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt18_cs-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt18_de_en_lighteval = LightevalTaskConfig(
- name="wmt18:de-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt18_de-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt18_en_cs_lighteval = LightevalTaskConfig(
- name="wmt18:en-cs",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt18_en-cs",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt18_en_de_lighteval = LightevalTaskConfig(
- name="wmt18:en-de",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt18_en-de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt18_en_et_lighteval = LightevalTaskConfig(
- name="wmt18:en-et",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt18_en-et",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt18_en_fi_lighteval = LightevalTaskConfig(
- name="wmt18:en-fi",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt18_en-fi",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt18_en_ru_lighteval = LightevalTaskConfig(
- name="wmt18:en-ru",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt18_en-ru",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt18_en_tr_lighteval = LightevalTaskConfig(
- name="wmt18:en-tr",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt18_en-tr",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt18_en_zh_lighteval = LightevalTaskConfig(
- name="wmt18:en-zh",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt18_en-zh",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt18_et_en_lighteval = LightevalTaskConfig(
- name="wmt18:et-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt18_et-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt18_fi_en_lighteval = LightevalTaskConfig(
- name="wmt18:fi-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt18_fi-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt18_ru_en_lighteval = LightevalTaskConfig(
- name="wmt18:ru-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt18_ru-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt18_tr_en_lighteval = LightevalTaskConfig(
- name="wmt18:tr-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt18_tr-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt18_zh_en_lighteval = LightevalTaskConfig(
- name="wmt18:zh-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt18_zh-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_cs_de_lighteval = LightevalTaskConfig(
- name="wmt19:cs-de",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_cs-de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_de_cs_lighteval = LightevalTaskConfig(
- name="wmt19:de-cs",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_de-cs",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_de_en_lighteval = LightevalTaskConfig(
- name="wmt19:de-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_de-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_de_fr_lighteval = LightevalTaskConfig(
- name="wmt19:de-fr",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_de-fr",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_en_cs_lighteval = LightevalTaskConfig(
- name="wmt19:en-cs",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_en-cs",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_en_de_lighteval = LightevalTaskConfig(
- name="wmt19:en-de",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_en-de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_en_fi_lighteval = LightevalTaskConfig(
- name="wmt19:en-fi",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_en-fi",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_en_gu_lighteval = LightevalTaskConfig(
- name="wmt19:en-gu",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_en-gu",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_en_kk_lighteval = LightevalTaskConfig(
- name="wmt19:en-kk",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_en-kk",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_en_lt_lighteval = LightevalTaskConfig(
- name="wmt19:en-lt",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_en-lt",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_en_ru_lighteval = LightevalTaskConfig(
- name="wmt19:en-ru",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_en-ru",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_en_zh_lighteval = LightevalTaskConfig(
- name="wmt19:en-zh",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_en-zh",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_fi_en_lighteval = LightevalTaskConfig(
- name="wmt19:fi-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_fi-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_fr_de_lighteval = LightevalTaskConfig(
- name="wmt19:fr-de",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_fr-de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_gu_en_lighteval = LightevalTaskConfig(
- name="wmt19:gu-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_gu-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_kk_en_lighteval = LightevalTaskConfig(
- name="wmt19:kk-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_kk-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_lt_en_lighteval = LightevalTaskConfig(
- name="wmt19:lt-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_lt-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_ru_en_lighteval = LightevalTaskConfig(
- name="wmt19:ru-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_ru-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt19_zh_en_lighteval = LightevalTaskConfig(
- name="wmt19:zh-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt19_zh-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_cs_en_lighteval = LightevalTaskConfig(
- name="wmt20:cs-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_cs-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_de_en_lighteval = LightevalTaskConfig(
- name="wmt20:de-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_de-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_de_fr_lighteval = LightevalTaskConfig(
- name="wmt20:de-fr",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_de-fr",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_en_cs_lighteval = LightevalTaskConfig(
- name="wmt20:en-cs",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_en-cs",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_en_de_lighteval = LightevalTaskConfig(
- name="wmt20:en-de",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_en-de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_en_iu_lighteval = LightevalTaskConfig(
- name="wmt20:en-iu",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_en-iu",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_en_ja_lighteval = LightevalTaskConfig(
- name="wmt20:en-ja",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_en-ja",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_en_km_lighteval = LightevalTaskConfig(
- name="wmt20:en-km",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_en-km",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_en_pl_lighteval = LightevalTaskConfig(
- name="wmt20:en-pl",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_en-pl",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_en_ps_lighteval = LightevalTaskConfig(
- name="wmt20:en-ps",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_en-ps",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_en_ru_lighteval = LightevalTaskConfig(
- name="wmt20:en-ru",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_en-ru",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_en_ta_lighteval = LightevalTaskConfig(
- name="wmt20:en-ta",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_en-ta",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_en_zh_lighteval = LightevalTaskConfig(
- name="wmt20:en-zh",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_en-zh",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_fr_de_lighteval = LightevalTaskConfig(
- name="wmt20:fr-de",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_fr-de",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_iu_en_lighteval = LightevalTaskConfig(
- name="wmt20:iu-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_iu-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_ja_en_lighteval = LightevalTaskConfig(
- name="wmt20:ja-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_ja-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_km_en_lighteval = LightevalTaskConfig(
- name="wmt20:km-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_km-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_pl_en_lighteval = LightevalTaskConfig(
- name="wmt20:pl-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_pl-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_ps_en_lighteval = LightevalTaskConfig(
- name="wmt20:ps-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_ps-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_ru_en_lighteval = LightevalTaskConfig(
- name="wmt20:ru-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_ru-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_ta_en_lighteval = LightevalTaskConfig(
- name="wmt20:ta-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_ta-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-wmt20_zh_en_lighteval = LightevalTaskConfig(
- name="wmt20:zh-en",
- suite=["lighteval", "sacrebleu"],
- prompt_function=prompt.wmt_reverse_alphabetical,
- hf_repo="lighteval/sacrebleu_manual",
- hf_subset="wmt20_zh-en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=None,
- metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
- stop_sequence=["\n"],
- version=0,
-)
-word_sorting_bigbench = LightevalTaskConfig(
- name="word_sorting",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="word_sorting",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-word_unscrambling_bigbench = LightevalTaskConfig(
- name="word_unscrambling",
- suite=["bigbench", "bigbench_json"],
- prompt_function=prompt.bigbench,
- hf_repo="tasksource/bigbench",
- hf_subset="word_unscrambling",
- hf_avail_splits=["default", "train", "validation"],
- evaluation_splits=["default"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
- stop_sequence=["\n"],
- version=0,
-)
-wsc273_lighteval = LightevalTaskConfig(
- name="wsc273",
- suite=["lighteval"],
- prompt_function=prompt.wsc273,
- hf_repo="lighteval/winograd_wsc",
- hf_subset="wsc273",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xcopa_en_lighteval = LightevalTaskConfig(
- name="xcopa:en",
- suite=["lighteval"],
- prompt_function=prompt.xcopa_en,
- hf_repo="cambridgeltl/xcopa",
- hf_subset="default",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xcopa_et_lighteval = LightevalTaskConfig(
- name="xcopa:et",
- suite=["lighteval"],
- prompt_function=prompt.xcopa_et,
- hf_repo="cambridgeltl/xcopa",
- hf_subset="et",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xcopa_ht_lighteval = LightevalTaskConfig(
- name="xcopa:ht",
- suite=["lighteval"],
- prompt_function=prompt.xcopa_ht,
- hf_repo="cambridgeltl/xcopa",
- hf_subset="ht",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xcopa_it_lighteval = LightevalTaskConfig(
- name="xcopa:it",
- suite=["lighteval"],
- prompt_function=prompt.xcopa_it,
- hf_repo="cambridgeltl/xcopa",
- hf_subset="it",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xcopa_id_lighteval = LightevalTaskConfig(
- name="xcopa:id",
- suite=["lighteval"],
- prompt_function=prompt.xcopa_id,
- hf_repo="cambridgeltl/xcopa",
- hf_subset="id",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xcopa_qu_lighteval = LightevalTaskConfig(
- name="xcopa:qu",
- suite=["lighteval"],
- prompt_function=prompt.xcopa_qu,
- hf_repo="cambridgeltl/xcopa",
- hf_subset="qu",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xcopa_sw_lighteval = LightevalTaskConfig(
- name="xcopa:sw",
- suite=["lighteval"],
- prompt_function=prompt.xcopa_sw,
- hf_repo="cambridgeltl/xcopa",
- hf_subset="sw",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xcopa_zh_lighteval = LightevalTaskConfig(
- name="xcopa:zh",
- suite=["lighteval"],
- prompt_function=prompt.xcopa_zh,
- hf_repo="cambridgeltl/xcopa",
- hf_subset="zh",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xcopa_ta_lighteval = LightevalTaskConfig(
- name="xcopa:ta",
- suite=["lighteval"],
- prompt_function=prompt.xcopa_ta,
- hf_repo="cambridgeltl/xcopa",
- hf_subset="ta",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xcopa_th_lighteval = LightevalTaskConfig(
- name="xcopa:th",
- suite=["lighteval"],
- prompt_function=prompt.xcopa_th,
- hf_repo="cambridgeltl/xcopa",
- hf_subset="th",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xcopa_tr_lighteval = LightevalTaskConfig(
- name="xcopa:tr",
- suite=["lighteval"],
- prompt_function=prompt.xcopa_tr,
- hf_repo="cambridgeltl/xcopa",
- hf_subset="tr",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xcopa_vi_lighteval = LightevalTaskConfig(
- name="xcopa:vi",
- suite=["lighteval"],
- prompt_function=prompt.xcopa_vi,
- hf_repo="cambridgeltl/xcopa",
- hf_subset="vi",
- hf_avail_splits=["test", "train", "validation"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xstory_cloze_en_lighteval = LightevalTaskConfig(
- name="xstory_cloze:en",
- suite=["lighteval"],
- prompt_function=prompt.storycloze,
- hf_repo="juletxara/xstory_cloze",
- hf_subset="en",
- hf_avail_splits=["training", "eval"],
- evaluation_splits=["eval"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xstory_cloze_ru_lighteval = LightevalTaskConfig(
- name="xstory_cloze:ru",
- suite=["lighteval"],
- prompt_function=prompt.storycloze,
- hf_repo="juletxara/xstory_cloze",
- hf_subset="ru",
- hf_avail_splits=["training", "eval"],
- evaluation_splits=["eval"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xstory_cloze_zh_lighteval = LightevalTaskConfig(
- name="xstory_cloze:zh",
- suite=["lighteval"],
- prompt_function=prompt.storycloze,
- hf_repo="juletxara/xstory_cloze",
- hf_subset="zh",
- hf_avail_splits=["training", "eval"],
- evaluation_splits=["eval"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xstory_cloze_es_lighteval = LightevalTaskConfig(
- name="xstory_cloze:es",
- suite=["lighteval"],
- prompt_function=prompt.storycloze,
- hf_repo="juletxara/xstory_cloze",
- hf_subset="es",
- hf_avail_splits=["training", "eval"],
- evaluation_splits=["eval"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xstory_cloze_ar_lighteval = LightevalTaskConfig(
- name="xstory_cloze:ar",
- suite=["lighteval"],
- prompt_function=prompt.storycloze,
- hf_repo="juletxara/xstory_cloze",
- hf_subset="ar",
- hf_avail_splits=["training", "eval"],
- evaluation_splits=["eval"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xstory_cloze_hi_lighteval = LightevalTaskConfig(
- name="xstory_cloze:hi",
- suite=["lighteval"],
- prompt_function=prompt.storycloze,
- hf_repo="juletxara/xstory_cloze",
- hf_subset="hi",
- hf_avail_splits=["training", "eval"],
- evaluation_splits=["eval"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xstory_cloze_id_lighteval = LightevalTaskConfig(
- name="xstory_cloze:id",
- suite=["lighteval"],
- prompt_function=prompt.storycloze,
- hf_repo="juletxara/xstory_cloze",
- hf_subset="id",
- hf_avail_splits=["training", "eval"],
- evaluation_splits=["eval"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xstory_cloze_te_lighteval = LightevalTaskConfig(
- name="xstory_cloze:te",
- suite=["lighteval"],
- prompt_function=prompt.storycloze,
- hf_repo="juletxara/xstory_cloze",
- hf_subset="te",
- hf_avail_splits=["training", "eval"],
- evaluation_splits=["eval"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xstory_cloze_sw_lighteval = LightevalTaskConfig(
- name="xstory_cloze:sw",
- suite=["lighteval"],
- prompt_function=prompt.storycloze,
- hf_repo="juletxara/xstory_cloze",
- hf_subset="sw",
- hf_avail_splits=["training", "eval"],
- evaluation_splits=["eval"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xstory_cloze_eu_lighteval = LightevalTaskConfig(
- name="xstory_cloze:eu",
- suite=["lighteval"],
- prompt_function=prompt.storycloze,
- hf_repo="juletxara/xstory_cloze",
- hf_subset="eu",
- hf_avail_splits=["training", "eval"],
- evaluation_splits=["eval"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xstory_cloze_my_lighteval = LightevalTaskConfig(
- name="xstory_cloze:my",
- suite=["lighteval"],
- prompt_function=prompt.storycloze,
- hf_repo="juletxara/xstory_cloze",
- hf_subset="my",
- hf_avail_splits=["training", "eval"],
- evaluation_splits=["eval"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xwinograd_en_lighteval = LightevalTaskConfig(
- name="xwinograd:en",
- suite=["lighteval"],
- prompt_function=prompt.winogrande,
- hf_repo="Muennighoff/xwinograd",
- hf_subset="en",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xwinograd_fr_lighteval = LightevalTaskConfig(
- name="xwinograd:fr",
- suite=["lighteval"],
- prompt_function=prompt.winogrande,
- hf_repo="Muennighoff/xwinograd",
- hf_subset="fr",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xwinograd_jp_lighteval = LightevalTaskConfig(
- name="xwinograd:jp",
- suite=["lighteval"],
- prompt_function=prompt.winogrande,
- hf_repo="Muennighoff/xwinograd",
- hf_subset="jp",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xwinograd_pt_lighteval = LightevalTaskConfig(
- name="xwinograd:pt",
- suite=["lighteval"],
- prompt_function=prompt.winogrande,
- hf_repo="Muennighoff/xwinograd",
- hf_subset="pt",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xwinograd_ru_lighteval = LightevalTaskConfig(
- name="xwinograd:ru",
- suite=["lighteval"],
- prompt_function=prompt.winogrande,
- hf_repo="Muennighoff/xwinograd",
- hf_subset="ru",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-xwinograd_zh_lighteval = LightevalTaskConfig(
- name="xwinograd:zh",
- suite=["lighteval"],
- prompt_function=prompt.winogrande,
- hf_repo="Muennighoff/xwinograd",
- hf_subset="zh",
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=-1,
- metrics=[Metrics.loglikelihood_acc],
- stop_sequence=["\n"],
- version=0,
-)
-
-# MMLU-Redux-2 Tasks
-_MMLU_REDUX_2_SUBSETS = [
- "abstract_algebra",
- "anatomy",
- "astronomy",
- "business_ethics",
- "clinical_knowledge",
- "college_biology",
- "college_chemistry",
- "college_computer_science",
- "college_mathematics",
- "college_medicine",
- "college_physics",
- "computer_security",
- "conceptual_physics",
- "econometrics",
- "electrical_engineering",
- "elementary_mathematics",
- "formal_logic",
- "global_facts",
- "high_school_biology",
- "high_school_chemistry",
- "high_school_computer_science",
- "high_school_european_history",
- "high_school_geography",
- "high_school_government_and_politics",
- "high_school_macroeconomics",
- "high_school_mathematics",
- "high_school_microeconomics",
- "high_school_physics",
- "high_school_psychology",
- "high_school_statistics",
- "high_school_us_history",
- "high_school_world_history",
- "human_aging",
- "human_sexuality",
- "international_law",
- "jurisprudence",
- "logical_fallacies",
- "machine_learning",
- "management",
- "marketing",
- "medical_genetics",
- "miscellaneous",
- "moral_disputes",
- "moral_scenarios",
- "nutrition",
- "philosophy",
- "prehistory",
- "professional_accounting",
- "professional_law",
- "professional_medicine",
- "professional_psychology",
- "public_relations",
- "security_studies",
- "sociology",
- "us_foreign_policy",
- "virology",
- "world_religions",
-]
-
-
-_mmlu_redux_2_tasks = {
- subset: LightevalTaskConfig(
- name=f"mmlu_redux_2:{subset}",
- suite=["lighteval"],
- prompt_function=lambda line, task_name=None, s=subset: prompt.mmlu_redux_2(line, s, task_name),
- hf_repo="edinburgh-dawg/mmlu-redux-2.0",
- hf_subset=subset,
- hf_avail_splits=["test"],
- evaluation_splits=["test"],
- few_shots_split=None,
- few_shots_select=None,
- generation_size=1,
- metrics=[
- Metrics.loglikelihood_acc,
- Metrics.pass_at_k_letters(sample_params={"k": 1}),
- ],
- stop_sequence=["\n"],
- version=0,
- )
- for subset in _MMLU_REDUX_2_SUBSETS
-}
-
-mmlu_redux_2_abstract_algebra = _mmlu_redux_2_tasks["abstract_algebra"]
-mmlu_redux_2_anatomy = _mmlu_redux_2_tasks["anatomy"]
-mmlu_redux_2_astronomy = _mmlu_redux_2_tasks["astronomy"]
-mmlu_redux_2_business_ethics = _mmlu_redux_2_tasks["business_ethics"]
-mmlu_redux_2_clinical_knowledge = _mmlu_redux_2_tasks["clinical_knowledge"]
-mmlu_redux_2_college_biology = _mmlu_redux_2_tasks["college_biology"]
-mmlu_redux_2_college_chemistry = _mmlu_redux_2_tasks["college_chemistry"]
-mmlu_redux_2_college_computer_science = _mmlu_redux_2_tasks["college_computer_science"]
-mmlu_redux_2_college_mathematics = _mmlu_redux_2_tasks["college_mathematics"]
-mmlu_redux_2_college_medicine = _mmlu_redux_2_tasks["college_medicine"]
-mmlu_redux_2_college_physics = _mmlu_redux_2_tasks["college_physics"]
-mmlu_redux_2_computer_security = _mmlu_redux_2_tasks["computer_security"]
-mmlu_redux_2_conceptual_physics = _mmlu_redux_2_tasks["conceptual_physics"]
-mmlu_redux_2_econometrics = _mmlu_redux_2_tasks["econometrics"]
-mmlu_redux_2_electrical_engineering = _mmlu_redux_2_tasks["electrical_engineering"]
-mmlu_redux_2_elementary_mathematics = _mmlu_redux_2_tasks["elementary_mathematics"]
-mmlu_redux_2_formal_logic = _mmlu_redux_2_tasks["formal_logic"]
-mmlu_redux_2_global_facts = _mmlu_redux_2_tasks["global_facts"]
-mmlu_redux_2_high_school_biology = _mmlu_redux_2_tasks["high_school_biology"]
-mmlu_redux_2_high_school_chemistry = _mmlu_redux_2_tasks["high_school_chemistry"]
-mmlu_redux_2_high_school_computer_science = _mmlu_redux_2_tasks["high_school_computer_science"]
-mmlu_redux_2_high_school_european_history = _mmlu_redux_2_tasks["high_school_european_history"]
-mmlu_redux_2_high_school_geography = _mmlu_redux_2_tasks["high_school_geography"]
-mmlu_redux_2_high_school_government_and_politics = _mmlu_redux_2_tasks["high_school_government_and_politics"]
-mmlu_redux_2_high_school_macroeconomics = _mmlu_redux_2_tasks["high_school_macroeconomics"]
-mmlu_redux_2_high_school_mathematics = _mmlu_redux_2_tasks["high_school_mathematics"]
-mmlu_redux_2_high_school_microeconomics = _mmlu_redux_2_tasks["high_school_microeconomics"]
-mmlu_redux_2_high_school_physics = _mmlu_redux_2_tasks["high_school_physics"]
-mmlu_redux_2_high_school_psychology = _mmlu_redux_2_tasks["high_school_psychology"]
-mmlu_redux_2_high_school_statistics = _mmlu_redux_2_tasks["high_school_statistics"]
-mmlu_redux_2_high_school_us_history = _mmlu_redux_2_tasks["high_school_us_history"]
-mmlu_redux_2_high_school_world_history = _mmlu_redux_2_tasks["high_school_world_history"]
-mmlu_redux_2_human_aging = _mmlu_redux_2_tasks["human_aging"]
-mmlu_redux_2_human_sexuality = _mmlu_redux_2_tasks["human_sexuality"]
-mmlu_redux_2_international_law = _mmlu_redux_2_tasks["international_law"]
-mmlu_redux_2_jurisprudence = _mmlu_redux_2_tasks["jurisprudence"]
-mmlu_redux_2_logical_fallacies = _mmlu_redux_2_tasks["logical_fallacies"]
-mmlu_redux_2_machine_learning = _mmlu_redux_2_tasks["machine_learning"]
-mmlu_redux_2_management = _mmlu_redux_2_tasks["management"]
-mmlu_redux_2_marketing = _mmlu_redux_2_tasks["marketing"]
-mmlu_redux_2_medical_genetics = _mmlu_redux_2_tasks["medical_genetics"]
-mmlu_redux_2_miscellaneous = _mmlu_redux_2_tasks["miscellaneous"]
-mmlu_redux_2_moral_disputes = _mmlu_redux_2_tasks["moral_disputes"]
-mmlu_redux_2_moral_scenarios = _mmlu_redux_2_tasks["moral_scenarios"]
-mmlu_redux_2_nutrition = _mmlu_redux_2_tasks["nutrition"]
-mmlu_redux_2_philosophy = _mmlu_redux_2_tasks["philosophy"]
-mmlu_redux_2_prehistory = _mmlu_redux_2_tasks["prehistory"]
-mmlu_redux_2_professional_accounting = _mmlu_redux_2_tasks["professional_accounting"]
-mmlu_redux_2_professional_law = _mmlu_redux_2_tasks["professional_law"]
-mmlu_redux_2_professional_medicine = _mmlu_redux_2_tasks["professional_medicine"]
-mmlu_redux_2_professional_psychology = _mmlu_redux_2_tasks["professional_psychology"]
-mmlu_redux_2_public_relations = _mmlu_redux_2_tasks["public_relations"]
-mmlu_redux_2_security_studies = _mmlu_redux_2_tasks["security_studies"]
-mmlu_redux_2_sociology = _mmlu_redux_2_tasks["sociology"]
-mmlu_redux_2_us_foreign_policy = _mmlu_redux_2_tasks["us_foreign_policy"]
-mmlu_redux_2_virology = _mmlu_redux_2_tasks["virology"]
-mmlu_redux_2_world_religions = _mmlu_redux_2_tasks["world_religions"]
diff --git a/src/lighteval/tasks/extended/__init__.py b/src/lighteval/tasks/extended/__init__.py
deleted file mode 100644
index 247a0c3a2..000000000
--- a/src/lighteval/tasks/extended/__init__.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-
-import lighteval.tasks.extended.hle.main as hle
-import lighteval.tasks.extended.ifbench.main as ifbench
-import lighteval.tasks.extended.ifeval.main as ifeval
-import lighteval.tasks.extended.lcb.main as lcb
-import lighteval.tasks.extended.mix_eval.main as mix_eval
-import lighteval.tasks.extended.mt_bench.main as mt_bench
-import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench
-import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks
-
-
-AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, ifbench, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb]
diff --git a/src/lighteval/tasks/multilingual/tasks.py b/src/lighteval/tasks/multilingual/tasks.py
deleted file mode 100644
index 5d6c107bc..000000000
--- a/src/lighteval/tasks/multilingual/tasks.py
+++ /dev/null
@@ -1,4368 +0,0 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-from functools import partial
-from itertools import permutations
-
-from langcodes import Language as LangCodeLanguage
-from langcodes import standardize_tag
-
-from lighteval.metrics.dynamic_metrics import (
- LogLikelihoodAccMetric,
- MultilingualQuasiExactMatchMetric,
- MultilingualQuasiF1ScoreMetric,
-)
-from lighteval.metrics.metrics import Metrics
-from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
-from lighteval.tasks.default_prompts import LETTER_INDICES
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.multilingual.adapters import (
- agieval_adapter,
- alghafa_adapter,
- ceval_adapter,
- enem_adapter,
- get_m3exam_adapter,
- get_mkqa_adapter,
- sciqa_adapter,
- thai_exams_adapter,
- winogrand_adapter,
- xcodah_adapter,
-)
-from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation, normalize_subset
-from lighteval.tasks.templates.boolq import get_boolq_prompt_function
-from lighteval.tasks.templates.continuation import get_continuation_prompt_function
-from lighteval.tasks.templates.copa import get_copa_prompt_function
-from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function
-from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
-from lighteval.tasks.templates.nli import get_nli_prompt_function
-from lighteval.tasks.templates.qa import get_qa_prompt_function
-from lighteval.tasks.templates.translation import get_translation_prompt_function
-from lighteval.tasks.templates.utils.formulation import (
- CFFormulation,
- HybridFormulation,
- MCFFormulation,
-)
-from lighteval.tasks.templates.utils.translation_literals import TRANSLATION_LITERALS
-from lighteval.utils.language import Language, iso_639_3_ind_to_iso_639_3_macro, manage_duplicate_language_codes
-
-
-TASKS_TABLE = []
-# ------------------------------- NLI Tasks ------------------------------- #
-# NLI (Natural Language Inference) tasks involve determining the logical relationship
-# between two given sentences: a premise and a hypothesis. The goal is to classify
-# whether the hypothesis is entailed by, contradicts, or is neutral with respect to
-# the premise. After our inspection we found the neutral label to be quite ambiguous
-# and decided to exclude it. But you can easily add it by modifying the adapters
-
-
-# The XNLI dataset is a multilingual variant of MultiNLI
-# https://aclanthology.org/D18-1269/
-xnli_tasks = [
- LightevalTaskConfig(
- name=f"xnli_{language.value}_{formulation.name.lower()}",
- suite=["lighteval"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=None),
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- prompt_function=get_nli_prompt_function(
- language=language,
- adapter=lambda line: {
- "premise": line["premise"],
- "hypothesis": line["hypothesis"],
- # Since we ignore the neutral label
- "gold_idx": {0: 0, 2: 1}[line["label"]],
- },
- relations=["entailment", "contradiction"],
- formulation=formulation,
- ),
- hf_filter=lambda line: line["label"] in [0, 2],
- hf_repo="facebook/xnli",
- hf_subset=standardize_tag(language.value),
- evaluation_splits=["validation"],
- few_shots_split="train",
- )
- for language in [
- Language.ARABIC,
- Language.ENGLISH,
- Language.FRENCH,
- Language.SPANISH,
- Language.BULGARIAN,
- Language.GERMAN,
- Language.GREEK,
- Language.ENGLISH,
- Language.FRENCH,
- Language.HINDI,
- Language.RUSSIAN,
- Language.SWAHILI,
- Language.THAI,
- Language.TURKISH,
- Language.URDU,
- Language.VIETNAMESE,
- Language.CHINESE,
- ]
- for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
-]
-
-
-# Improvement on XNLI with better translation, from our experience models tend to
-# perform better on XNLI2.0 than XNLI
-# https://arxiv.org/abs/2301.06527
-xnli2_tasks = [
- LightevalTaskConfig(
- name=f"xnli2.0_{language.value}_{formulation.name.lower()}",
- suite=["lighteval"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=None),
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- prompt_function=get_nli_prompt_function(
- language=language,
- adapter=lambda line: {
- "premise": line["premise"],
- "hypothesis": line["hypothesis"],
- # Since we ignore the neutral label
- "gold_idx": {0: 0, 2: 1}[line["label"]],
- },
- relations=["entailment", "contradiction"],
- formulation=formulation,
- ),
- hf_filter=lambda line: line["label"] in [0, 2]
- and line["premise"] is not None
- and line["hypothesis"] is not None,
- hf_repo=f"Harsit/xnli2.0_train_{LangCodeLanguage(standardize_tag(language.value)).language_name().lower()}",
- hf_subset="default",
- evaluation_splits=["train"],
- hf_avail_splits=["train"],
- )
- for language in [
- Language.ENGLISH,
- Language.FRENCH,
- Language.PUNJABI,
- Language.GUJARATI,
- Language.KANNADA,
- Language.ASSAMESE,
- Language.BENGALI,
- Language.MARATHI,
- Language.SANSKRIT,
- Language.TAMIL,
- Language.GERMAN,
- Language.ENGLISH,
- Language.URDU,
- Language.VIETNAMESE,
- Language.TURKISH,
- Language.THAI,
- Language.SWAHILI,
- Language.SPANISH,
- Language.RUSSIAN,
- Language.HINDI,
- Language.GREEK,
- Language.CHINESE,
- Language.BULGARIAN,
- Language.ARABIC,
- # Theoretically also: Bhojpuri, Gujarati, Odiya
- ]
- for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
-]
-
-# Another variant of XNLI, with emphasis on Indic languages
-# https://arxiv.org/abs/2204.08776
-xnli_indic_tasks = [
- LightevalTaskConfig(
- name=f"indicnxnli_{language.value}_{formulation.name.lower()}",
- suite=["lighteval"],
- prompt_function=get_nli_prompt_function(
- language=language,
- adapter=lambda line: {
- "premise": line["premise"],
- "hypothesis": line["hypothesis"],
- # Since we ignore the neutral label
- "gold_idx": {0: 0, 2: 1}[line["label"]],
- },
- relations=["entailment", "contradiction"],
- formulation=formulation,
- ),
- hf_repo="Divyanshu/indicxnli",
- hf_subset=standardize_tag(language.value),
- # Ignore neutral
- hf_filter=lambda x: int(x["label"]) in [0, 2],
- evaluation_splits=["validation"],
- few_shots_split="train",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=None),
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for language in [
- Language.ASSAMESE,
- Language.BENGALI,
- Language.GUJARATI,
- Language.HINDI,
- Language.KANNADA,
- Language.MALAYALAM,
- Language.MARATHI,
- Language.ORIYA,
- Language.PUNJABI,
- Language.TAMIL,
- Language.TELUGU,
- ]
- for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
-]
-
- # AfriXNLI: African XNLI
- # From https://arxiv.org/abs/2406.03368. Human translated XNLI.
-afri_xnli_tasks = [
- LightevalTaskConfig(
- name=f"afri_xnli_{language.value}_{formulation.name.lower()}",
- suite=("lighteval",),
- prompt_function=get_nli_prompt_function(
- language=language,
- adapter=lambda line: {
- "premise": line["premise"],
- "hypothesis": line["hypothesis"],
- # Since we ignore the neutral label
- "gold_idx": {0: 0, 2: 1}[line["label"]],
- },
- relations=["entailment", "contradiction"],
- formulation=formulation,
- ),
- hf_repo="masakhane/afrixnli",
- hf_subset=language.value,
- hf_filter=lambda x: int(x["label"]) in [0, 2],
- evaluation_splits=("test",),
- few_shots_split="validation",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=None),
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for language in [
- Language.AMHARIC,
- # Language.EWE,
- Language.FRENCH,
- # Language.HAUSA,
- # Language.IGBO,
- # Language.KINYARWANDA,
- # Language.LINGALA,
- # Language.LUGANDA,
- # Language.OROMO,
- # Language.SHONA,
- # Language.SOTHO,
- Language.SWAHILI,
- # Language.TWI,
- # Language.WOLOF,
- # Language.XHOSA,
- Language.YORUBA,
- # Language.ZULU,
- ]
- for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
-]
-
-# PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification
-# This dataset contains paraphrase identification pairs in multiple languages.
- # It's derived from PAWS (Paraphrase Adversaries from Word Scrambling).
- # We treat paraphrase as entailment and non-paraphrase as contradiction.
-# https://arxiv.org/abs/1908.11828
-
-paws_x_tasks = [
- LightevalTaskConfig(
- name=f"pawsx_{language.value}_{formulation.name.lower()}",
- suite=("lighteval",),
- prompt_function=get_nli_prompt_function(
- language=language,
- adapter=lambda line: {
- "premise": line["sentence1"],
- "hypothesis": line["sentence2"],
- # PAWS-X labels are binary (paraphrase / not paraphrase), so there is no neutral label to filter
- "gold_idx": int(line["label"]),
- },
- relations=["entailment", "contradiction"],
- formulation=formulation,
- ),
- hf_repo="google-research-datasets/paws-x",
- hf_subset=standardize_tag(language.value),
- evaluation_splits=("test",),
- few_shots_split="train",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=None),
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for language in [
- Language.GERMAN,
- Language.ENGLISH,
- Language.SPANISH,
- Language.FRENCH,
- Language.JAPANESE,
- Language.KOREAN,
- Language.CHINESE,
- ]
- for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
-]
-
-# Russian Commitment Bank (RCB) is a large-scale NLI dataset with Russian sentences,
-# collected from the web and crowdsourcing.
-# https://arxiv.org/abs/2401.04531
-rcb_tasks = [
- LightevalTaskConfig(
- name=f"rcb_{Language.RUSSIAN.value}_{formulation.name.lower()}",
- prompt_function=get_nli_prompt_function(
- language=Language.RUSSIAN,
- adapter=lambda line: {
- "premise": line["inputs"]["premise"],
- "hypothesis": line["inputs"]["hypothesis"],
- # Since we ignore the neutral label
- "gold_idx": int(line["outputs"]) - 1,
- },
- relations=["entailment", "contradiction"],
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="ai-forever/MERA",
- hf_subset="rcb",
- # Ignore neutral label
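- # "outputs" can be an empty string; treat it as 0 so those rows are filtered out as well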
- hf_filter=lambda x: int(x["outputs"] or "0") in [1, 2],
- evaluation_splits=("train",),
- few_shots_split="validation",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=None),
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
-]
-
- # OCNLI: Original Chinese Natural Language Inference, a natively collected (not translated) Chinese NLI dataset.
- # https://arxiv.org/pdf/2010.05444
- # We find this benchmark to have really good signal compared to other Chinese NLI benchmarks
-ocnli_tasks = [
- LightevalTaskConfig(
- name=f"ocnli_{Language.CHINESE.value}_{formulation.name.lower()}",
- prompt_function=get_nli_prompt_function(
- language=Language.CHINESE,
- adapter=lambda line: {
- "premise": line["sentence1"],
- "hypothesis": line["sentence2"],
- # Since we ignore the neutral label
- "gold_idx": {1: 0, 2: 1}[line["label"]],
- },
- relations=["entailment", "contradiction"],
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="clue/clue",
- hf_subset="ocnli",
- # Only keep the positive and negative examples
- hf_filter=lambda x: int(x["label"]) in [1, 2],
- evaluation_splits=("validation",),
- few_shots_split="train",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=None),
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
-]
-
-# https://arxiv.org/abs/2004.05986
- # CMNLI: Chinese NLI dataset built following the MNLI approach (machine translated)
-cmnli_tasks = [
- LightevalTaskConfig(
- name=f"cmnli_{Language.CHINESE.value}_{formulation.name.lower()}",
- prompt_function=get_nli_prompt_function(
- language=Language.CHINESE,
- adapter=lambda line: {
- "premise": line["sentence1"],
- "hypothesis": line["sentence2"],
- # Since we ignore the neutral label
- "gold_idx": {"entailment": 0, "contradiction": 1}[line["label"]],
- },
- relations=["entailment", "contradiction"],
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="fenffef/cmnli",
- hf_subset="default",
- hf_filter=lambda x: x["label"] in ["entailment", "contradiction"],
- # Only keep the positive and negative examples
- evaluation_splits=("validation",),
- few_shots_split="train",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=None),
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
-]
-
-TASKS_TABLE.extend(
- [
- *xnli_tasks,
- *xnli2_tasks,
- *xnli_indic_tasks,
- *paws_x_tasks,
- *rcb_tasks,
- *ocnli_tasks,
- *cmnli_tasks,
- *afri_xnli_tasks,
- ]
-)
-# ------------------------------- Copa Tasks ------------------------------- #
-# COPA (Choice of Plausible Alternatives) tasks involve determining the most plausible cause or effect
-# for a given premise. These tasks test common sense reasoning and causal inference abilities.
-
-# XCOPA: Cross-lingual Choice of Plausible Alternatives
-# Paper: https://aclanthology.org/2020.emnlp-main.185/
-# XCOPA extends the original English COPA task to 11 typologically diverse languages.
-xcopa_tasks = [
- LightevalTaskConfig(
- name=f"xcopa_{language.value}_{formulation.name.lower()}",
- suite=["lighteval"],
- prompt_function=get_copa_prompt_function(
- language,
- adapter=lambda line: {
- "context": line["premise"],
- "cause_effect": line["question"],
- "continuations": [line["choice1"], line["choice2"]],
- "gold_idx": int(line["label"]),
- },
- formulation=formulation,
- ),
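- # For Arabic we use the AlGhafa translated COPA subset instead of the original xcopa repo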
- hf_repo=("OALL/AlGhafa-Arabic-LLM-Benchmark-Translated" if language == Language.ARABIC else "xcopa"),
- hf_subset=("copa_ext_ar" if language == Language.ARABIC else standardize_tag(language.value)),
- evaluation_splits=["test"],
- few_shots_split="validation",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for language in [
- Language.ARABIC,
- Language.ESTONIAN,
- Language.INDONESIAN,
- Language.ITALIAN,
- Language.SWAHILI,
- Language.TAMIL,
- Language.THAI,
- Language.TURKISH,
- Language.VIETNAMESE,
- Language.CHINESE,
- Language.HAITIAN,
- Language.QUECHUA,
- ]
- for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
-]
-
-# IndicCOPA: COPA for Indic Languages
-# Paper: https://arxiv.org/pdf/2212.05409
-# IndicCOPA extends COPA to 15 Indic languages, providing a valuable resource for
-# evaluating common sense reasoning in these languages.
-copa_indic_tasks = [
- LightevalTaskConfig(
- name=f"indicxcopa_{language.value}_{formulation.name.lower()}",
- suite=["lighteval"],
- prompt_function=get_copa_prompt_function(
- language,
- adapter=lambda line: {
- "context": line["premise"],
- "cause_effect": line["question"],
- "continuations": [line["choice1"], line["choice2"]],
- "gold_idx": int(line["label"]),
- },
- formulation=formulation,
- ),
- hf_repo="ai4bharat/IndicCOPA",
- hf_subset=f"translation-{standardize_tag(language.value)}",
- hf_revision="d356ef19a4eb287e88a51d07a56b73ba88c7f188",
- evaluation_splits=["test"],
- hf_avail_splits=["test"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for language in [
- Language.ASSAMESE,
- Language.BENGALI,
- Language.GUJARATI,
- Language.HINDI,
- Language.KANNADA,
- Language.MALAYALAM,
- Language.MARATHI,
- Language.NEPALI,
- Language.ORIYA,
- Language.PUNJABI,
- Language.SANSKRIT,
- Language.SINDHI,
- Language.TAMIL,
- Language.TELUGU,
- Language.URDU,
- # Optionally: Maithili, Santali, Sindhi, Konkani
- ]
- for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
-]
-
-# PARus: Plausible Alternatives for Russian
-# Paper: https://russiansuperglue.com/tasks/task_info/PARus
-# PARus is the Russian adaptation of the COPA task, part of the Russian SuperGLUE benchmark.
-# It evaluates common sense reasoning and causal inference abilities in Russian language models.
-parus_tasks = [
- LightevalTaskConfig(
- name=f"parus_{Language.RUSSIAN.value}_{formulation.name.lower()}",
- suite=["lighteval"],
- prompt_function=get_copa_prompt_function(
- language=Language.RUSSIAN,
- adapter=lambda line: {
- "context": line["inputs"]["premise"],
- "cause_effect": line["meta"]["task"],
- "continuations": [line["inputs"]["choice1"], line["inputs"]["choice2"]],
- "gold_idx": int(line["outputs"]) - 1,
- },
- formulation=formulation,
- ),
- hf_repo="ai-forever/MERA",
- hf_subset="parus",
- evaluation_splits=["train"],
- few_shots_split="validation",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
-]
-
-
-TASKS_TABLE.extend([*xcopa_tasks, *copa_indic_tasks, *parus_tasks])
-# ------------------------------- Hellaswag Tasks ------------------------------- #
-# Hellaswag is a commonsense reasoning task that requires models to complete a given scenario
-# with the most plausible ending. It tests the model's ability to understand and reason about
-# everyday situations and human behavior.
-
-# MLMM-Hellaswag: Multilingual adaptation of Hellaswag
-# Paper: https://arxiv.org/abs/2306.07610
-# This is a multilingual version of Hellaswag, part of the MLMM (Massive Language Model Meta-Evaluation) benchmark.
-# It evaluates commonsense reasoning abilities across multiple languages.
-mlmm_hellaswag_tasks = [
- LightevalTaskConfig(
- name=f"mlmm_hellaswag_{lang.value}_{formulation.name.lower()}",
- suite=["lighteval"],
- prompt_function=get_hellaswag_prompt_function(
- language=lang,
- adapter=lambda line: {
- # We don't use activity_label as it is not available
- "ctx_a": line["ctx_a"],
- "ctx_b": line["ctx_b"],
- "continuations": line["endings"],
- "gold_idx": int(line["label"]),
- },
- formulation=formulation,
- ),
- hf_repo="jon-tow/okapi_hellaswag",
- hf_subset=standardize_tag(lang.value),
- hf_revision="96ed8e0dfc6172dad1d3df338d7b8ba6c1ff9d83",
- evaluation_splits=["validation"],
- hf_avail_splits=["validation"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for lang in [
- Language.ARABIC,
- Language.BENGALI,
- Language.CATALAN,
- Language.DANISH,
- Language.GERMAN,
- Language.SPANISH,
- Language.BASQUE,
- Language.FRENCH,
- Language.GUJARATI,
- Language.HINDI,
- Language.CROATIAN,
- Language.HUNGARIAN,
- Language.ARMENIAN,
- Language.INDONESIAN,
- Language.ICELANDIC,
- Language.ITALIAN,
- Language.KANNADA,
- Language.MALAYALAM,
- Language.MARATHI,
- Language.NORWEGIAN,
- Language.NEPALI,
- Language.DUTCH,
- Language.PORTUGUESE,
- Language.ROMANIAN,
- Language.RUSSIAN,
- Language.SLOVAK,
- Language.SERBIAN,
- Language.SWEDISH,
- Language.TAMIL,
- Language.TELUGU,
- Language.UKRAINIAN,
- Language.VIETNAMESE,
- Language.CHINESE,
- ]
- for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
-]
-
-# Hellaswag Turkish
-# This is a Turkish adaptation of the Hellaswag task.
-# While there's no specific paper for this version, it has been found to work well for evaluating
-# Turkish language models on commonsense reasoning tasks.
-
- # We don't handle these in a single task, as there are quite a few differences (dataset/subset, dot replacement, etc.)
- # which would make it hard to read
-hellaswag_tur_tasks = [
- LightevalTaskConfig(
- name=f"community_hellaswag_{Language.TURKISH.value}_{formulation.name.lower()}",
- suite=["lighteval"],
- prompt_function=get_hellaswag_prompt_function(
- language=Language.TURKISH,
- adapter=lambda line: {
- "ctx_a": line["ctx_a"],
- "ctx_b": line["ctx_b"],
- "continuations": line["endings"],
- "gold_idx": int(line["label"]),
- },
- formulation=formulation,
- # https://github.com/malhajar17/lm-evaluation-harness_turkish/blob/main/lm_eval/tasks/hellaswag_tr-v0.2/utils.py
- wikihow_artifacts=[" [title]", " [başlık]", " [adım]", " [header]"],
- ),
- hf_repo="malhajar/hellaswag_tr-v0.2",
- hf_subset="default",
- evaluation_splits=["validation"],
- hf_avail_splits=["validation"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
-]
-
-# Hellaswag Thai
-# This is a Thai adaptation of the Hellaswag task.
-# Similar to the Turkish version, there's no specific paper, but it has been found to be effective
-# for evaluating Thai language models on commonsense reasoning tasks.
-hellaswag_tha_tasks = [
- LightevalTaskConfig(
- name=f"community_hellaswag_{Language.THAI.value}_{formulation.name.lower()}",
- suite=["lighteval"],
- prompt_function=get_hellaswag_prompt_function(
- language=Language.THAI,
- adapter=lambda line: {
- "ctx_a": line["ctx_a"],
- "ctx_b": line["ctx_b"],
- "continuations": line["endings"],
- "gold_idx": int(line["label"]),
- },
- formulation=formulation,
- wikihow_artifacts=[" [ชื่อ]", " [ส่วนหัว]", " [ขั้นตอน]", " [header]", " [Header]"],
- ),
- hf_repo="lighteval/hellaswag_thai",
- hf_subset="default",
- evaluation_splits=["validation"],
- few_shots_split="train",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
-]
-
-hellaswag_hin_tasks = [
- LightevalTaskConfig(
- name=f"community_hellaswag_{Language.HINDI.value}_{formulation.name.lower()}",
- suite=["lighteval"],
- prompt_function=get_hellaswag_prompt_function(
- language=Language.HINDI,
- adapter=lambda line: {
- "ctx_a": line["ctx_a"],
- "continuations": line["endings"],
- "gold_idx": int(line["label"]),
- },
- formulation=formulation,
- ),
- hf_repo="ai4bharat/hellaswag-hi",
- hf_filter=lambda line: all(len(choice.strip()) > 0 for choice in line["endings"]),
- hf_subset="hi",
- evaluation_splits=("validation",),
- few_shots_split="validation",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- ],
- ),
- )
- for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
-]
-
-hellaswag_tel_tasks = [
- LightevalTaskConfig(
- name=f"community_hellaswag_{Language.TELUGU.value}_{formulation.name.lower()}",
- suite=["lighteval"],
- prompt_function=get_hellaswag_prompt_function(
- language=Language.TELUGU,
- adapter=lambda line: {
- "ctx_a": line["ctx_a"],
- "continuations": line["endings"],
- "gold_idx": int(line["label"]),
- },
- formulation=formulation,
- ),
- hf_repo="LightFury9/hellaswag-telugu",
- hf_subset="default",
- evaluation_splits=("valid",),
- few_shots_split="train",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
-]
-
-TASKS_TABLE.extend(
- [
- *mlmm_hellaswag_tasks,
- *hellaswag_tur_tasks,
- *hellaswag_tha_tasks,
- *hellaswag_hin_tasks,
- *hellaswag_tel_tasks,
- ]
-)
-# ------------------------------- RC Tasks ------------------------------- #
-# Reading Comprehension (RC) tasks evaluate a model's ability to understand and extract information from text passages.
-# These tasks typically involve answering questions based on given contexts, spanning multiple languages and formats.
- # These RC tasks cover about 130 unique languages/scripts.
-
- # SQuAD-like tasks
-
-# XQuAD: Cross-lingual Question Answering Dataset, extending SQuAD to 11 languages.
-# https://arxiv.org/abs/1910.11856
-xquad_tasks = [
- LightevalTaskConfig(
- name=f"xquad_{language.value}",
- prompt_function=get_qa_prompt_function(
- language,
- lambda line: {
- "question": line["question"],
- "context": line["context"],
- "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
- },
- ),
- suite=("lighteval",),
- hf_repo="google/xquad",
- hf_subset=f"xquad.{standardize_tag(language.value)}",
- evaluation_splits=("validation",),
- few_shots_split="validation",
- generation_size=400,
- stop_sequence=("\n",),
- metrics=(
- MultilingualQuasiExactMatchMetric(language, "prefix"),
- MultilingualQuasiF1ScoreMetric(language),
- ),
- )
- for language in [
- Language.ARABIC,
- Language.GERMAN,
- Language.GREEK,
- Language.ENGLISH,
- Language.SPANISH,
- Language.HINDI,
- Language.ROMANIAN,
- Language.RUSSIAN,
- Language.THAI,
- Language.TURKISH,
- Language.VIETNAMESE,
- Language.CHINESE,
- ]
-]
-
-# GermanQuAD: High-quality German QA dataset with 13,722 questions
-# https://arxiv.org/abs/2104.12741
-germanquad_tasks = [
- LightevalTaskConfig(
- name=f"germanquad_{Language.GERMAN.value}",
- prompt_function=get_qa_prompt_function(
- Language.GERMAN,
- lambda line: {
- "question": line["question"],
- "context": line["context"],
- "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
- },
- ),
- suite=("lighteval",),
- hf_repo="deepset/germanquad",
- hf_subset="plain_text",
- hf_revision="fff05ceaf2ffbe5b65c7e0c57e678f7b7e1a0581",
- hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
- evaluation_splits=("test",),
- few_shots_split="train",
- generation_size=400,
- stop_sequence=("\n",),
- metrics=(
- MultilingualQuasiExactMatchMetric(Language.GERMAN, "prefix"),
- MultilingualQuasiF1ScoreMetric(Language.GERMAN),
- ),
- )
-]
-
-
-# SQuAD-it: Italian translation of the SQuAD dataset
-# https://github.com/crux82/squad-it
-squad_it_tasks = [
- LightevalTaskConfig(
- name=f"squad_{Language.ITALIAN.value}",
- prompt_function=get_qa_prompt_function(
- Language.ITALIAN,
- lambda line: {
- "question": line["question"],
- "context": line["context"],
- "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
- },
- ),
- suite=("lighteval",),
- hf_repo="crux82/squad_it",
- hf_subset="default",
- hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
- evaluation_splits=("test",),
- few_shots_split="train",
- generation_size=400,
- stop_sequence=("\n",),
- metrics=(
- MultilingualQuasiExactMatchMetric(Language.ITALIAN, "prefix"),
- MultilingualQuasiF1ScoreMetric(Language.ITALIAN),
- ),
- )
-]
-
-
-# ThaiQA: A question answering dataset for the Thai language.
-thaiqa_tasks = [
- LightevalTaskConfig(
- name=f"thaiqa_{Language.THAI.value}",
- prompt_function=get_qa_prompt_function(
- Language.THAI,
- lambda line: {
- "question": line["question"],
- "context": line["context"],
- "choices": [ans for ans in line["answers"]["answer"] if len(ans) > 0],
- },
- ),
- suite=("lighteval",),
- hf_repo="lighteval/thaiqa_squad_fixed",
- hf_subset="default",
- evaluation_splits=("train",),
- few_shots_split="validation",
- generation_size=400,
- stop_sequence=("\n",),
- metrics=(
- MultilingualQuasiExactMatchMetric(Language.THAI, "prefix"),
- MultilingualQuasiF1ScoreMetric(Language.THAI),
- ),
- )
-]
-
-# SberQuAD: A large-scale Russian reading comprehension dataset.
-# https://arxiv.org/abs/1912.09723
-sber_squad_tasks = [
- LightevalTaskConfig(
- name=f"sber_squad_{Language.RUSSIAN.value}",
- prompt_function=get_qa_prompt_function(
- Language.RUSSIAN,
- lambda line: {
- "question": line["question"],
- "context": line["context"],
- "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
- },
- ),
- suite=("lighteval",),
- hf_repo="kuznetsoffandrey/sberquad",
- hf_subset="sberquad",
- evaluation_splits=("validation",),
- few_shots_split="train",
- metrics=(
- MultilingualQuasiExactMatchMetric(Language.RUSSIAN, "prefix"),
- MultilingualQuasiF1ScoreMetric(Language.RUSSIAN),
- ),
- generation_size=400,
- stop_sequence=("\n",),
- )
-]
-
-# FaQuAD: A Portuguese Reading Comprehension Dataset
-# https://arxiv.org/abs/2007.15671
-faquad_tasks = [
- LightevalTaskConfig(
- name=f"faquad_{Language.PORTUGUESE.value}",
- prompt_function=get_qa_prompt_function(
- Language.PORTUGUESE,
- lambda line: {
- "question": line["question"],
- "context": line["context"],
- "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
- },
- ),
- suite=("lighteval",),
- hf_repo="eraldoluis/faquad",
- hf_subset="plain_text",
- hf_revision="205ba826a2282a4a5aa9bd3651e55ee4f2da1546",
- hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
- evaluation_splits=("validation",),
- few_shots_split="train",
- metrics=(
- MultilingualQuasiExactMatchMetric(Language.PORTUGUESE, "prefix"),
- MultilingualQuasiF1ScoreMetric(Language.PORTUGUESE),
- ),
- generation_size=400,
- stop_sequence=("\n",),
- )
-]
-
-
-# SQuAD-es: Spanish translation of the Stanford Question Answering Dataset
-# https://huggingface.co/datasets/ccasimiro/squad_es
-squad_es_tasks = [
- LightevalTaskConfig(
- name=f"squad_{Language.SPANISH.value}",
- prompt_function=get_qa_prompt_function(
- Language.SPANISH,
- lambda line: {
- "question": line["question"],
- "context": line["context"],
- "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
- },
- ),
- suite=("lighteval",),
- hf_repo="ccasimiro/squad_es",
- hf_subset="v2.0.0",
- hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
- evaluation_splits=("validation",),
- few_shots_split="train",
- metrics=(
- MultilingualQuasiExactMatchMetric(Language.SPANISH, "prefix"),
- MultilingualQuasiF1ScoreMetric(Language.SPANISH),
- ),
- generation_size=400,
- stop_sequence=("\n",),
- )
-]
-
-
-# ARCD: Arabic Reading Comprehension Dataset.
-# https://arxiv.org/pdf/1906.05394
-arcd_tasks = [
- LightevalTaskConfig(
- name=f"arcd_{Language.ARABIC.value}",
- prompt_function=get_qa_prompt_function(
- Language.ARABIC,
- lambda line: {
- "question": line["question"],
- "context": line["context"],
- "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
- },
- ),
- suite=("lighteval",),
- hf_repo="hsseinmz/arcd",
- hf_subset="plain_text",
- evaluation_splits=("validation",),
- few_shots_split="train",
- metrics=(
- MultilingualQuasiExactMatchMetric(Language.ARABIC, "prefix"),
- MultilingualQuasiF1ScoreMetric(Language.ARABIC),
- ),
- generation_size=400,
- stop_sequence=("\n",),
- )
-]
-
-# KenSwQuAD: A question answering dataset for Kenyan Swahili.
-# https://arxiv.org/abs/2205.02364
-kenswquad_tasks = [
- LightevalTaskConfig(
- name=f"kenswquad_{Language.SWAHILI.value}",
- prompt_function=get_qa_prompt_function(
- Language.SWAHILI,
- lambda line: {
- "question": line["question"],
- "context": line["context"],
- "choices": [line["answer"]],
- },
- ),
- suite=("lighteval",),
- hf_repo="lighteval/KenSwQuAD",
- hf_subset="default",
- evaluation_splits=("test",),
- few_shots_split="validation",
- metrics=(
- MultilingualQuasiExactMatchMetric(Language.SWAHILI, "prefix"),
- MultilingualQuasiF1ScoreMetric(Language.SWAHILI),
- ),
- generation_size=400,
- stop_sequence=("\n",),
- )
-]
-
-# ChineseSquad: A reading comprehension dataset for Chinese.
-# https://github.com/pluto-junzeng/ChineseSquad
-chinese_squad_tasks = [
- LightevalTaskConfig(
- name=f"chinese_squad_{Language.CHINESE.value}",
- prompt_function=get_qa_prompt_function(
- Language.CHINESE,
- lambda line: {
- "question": line["question"],
- "context": line["context"],
- "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
- },
- ),
- suite=("lighteval",),
- hf_repo="lighteval/ChineseSquad",
- hf_subset="default",
- evaluation_splits=("validation",),
- few_shots_split="train",
- metrics=(
- MultilingualQuasiExactMatchMetric(Language.CHINESE, "prefix"),
- MultilingualQuasiF1ScoreMetric(Language.CHINESE),
- ),
- generation_size=400,
- stop_sequence=("\n",),
- )
-]
-
-# CMRC 2018: A span-extraction machine reading comprehension dataset for Chinese.
-# https://arxiv.org/abs/1810.07366
-cmrc2018_tasks = [
- LightevalTaskConfig(
- name=f"cmrc2018_{Language.CHINESE.value}",
- prompt_function=get_qa_prompt_function(
- Language.CHINESE,
- lambda line: {
- "question": line["question"],
- "context": line["context"],
- "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
- },
- ),
- suite=("lighteval",),
- hf_repo="clue/clue",
- hf_subset="cmrc2018",
- evaluation_splits=("trial",),
- few_shots_split="train",
- generation_size=400,
- metrics=(
- MultilingualQuasiExactMatchMetric(Language.CHINESE, "prefix"),
- MultilingualQuasiF1ScoreMetric(Language.CHINESE),
- ),
- stop_sequence=("\n",),
- )
-]
-
-# IndicQA: A reading comprehension dataset for 11 Indian languages.
-# https://arxiv.org/abs/2407.13522
-indicqa_tasks = [
- LightevalTaskConfig(
- name=f"indicqa_{language.value}",
- prompt_function=get_qa_prompt_function(
- language,
- lambda line: {
- "question": line["question"],
- "context": line["context"],
- "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
- },
- ),
- suite=("lighteval",),
- hf_repo="ai4bharat/IndicQA",
- hf_subset=f"indicqa.{LangCodeLanguage.get(language.value).language}",
- hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
- hf_revision="92d96092ae229950973dac3b9998f8b3a8949b0a",
- evaluation_splits=("test",),
- hf_avail_splits=("test",),
- generation_size=400,
- metrics=(
- MultilingualQuasiExactMatchMetric(language, "prefix"),
- MultilingualQuasiF1ScoreMetric(language),
- ),
- stop_sequence=("\n",),
- )
- for language in [
- Language.ASSAMESE,
- Language.BENGALI,
- Language.GUJARATI,
- Language.HINDI,
- Language.KANNADA,
- Language.MALAYALAM,
- Language.MARATHI,
- Language.ORIYA,
- Language.PUNJABI,
- Language.TAMIL,
- Language.TELUGU,
- ]
-]
-
-# FQuAD v2: French Question Answering Dataset version 2.
-# https://arxiv.org/abs/2002.06071
-fquad_v2_tasks = [
- LightevalTaskConfig(
- name=f"fquadv2_{Language.FRENCH.value}",
- prompt_function=get_qa_prompt_function(
- Language.FRENCH,
- lambda line: {
- "question": line["question"],
- "context": line["context"],
- "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
- },
- ),
- suite=("lighteval",),
- hf_repo="manu/fquad2_test",
- hf_subset="default",
- evaluation_splits=("test_hasAns",),
- few_shots_split="valid_hasAns",
- generation_size=400,
- stop_sequence=("\n",),
- metrics=(
- MultilingualQuasiExactMatchMetric(Language.FRENCH, "prefix"),
- MultilingualQuasiF1ScoreMetric(Language.FRENCH),
- ),
- )
-]
-
-# TQuAD v2: Turkish Question Answering Dataset version 2.
-tquad_v2_tasks = [
- LightevalTaskConfig(
- name=f"tquadv2_{Language.TURKISH.value}",
- prompt_function=get_qa_prompt_function(
- Language.TURKISH,
- lambda line: {
- "question": line["question"],
- "context": line["context"],
- "choices": [a["text"] for a in line["answers"]],
- },
- ),
- suite=("lighteval",),
- hf_repo="erdometo/tquad2",
- hf_subset="default",
- evaluation_splits=("validation",),
- few_shots_split="train",
- generation_size=400,
- stop_sequence=("\n",),
- metrics=(
- MultilingualQuasiExactMatchMetric(Language.TURKISH, "prefix"),
- MultilingualQuasiF1ScoreMetric(Language.TURKISH),
- ),
- )
-]
-
-# Other QA tasks for RC
-
-# TyDi QA: A benchmark for information-seeking question answering in typologically diverse languages.
-# https://arxiv.org/abs/2003.05002
-tydiqa_tasks = [
- LightevalTaskConfig(
- name=f"tydiqa_{language.value}",
- prompt_function=get_qa_prompt_function(
- language,
- lambda line: {
- "question": line["question"],
- "context": line["context"],
- "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
- },
- ),
- suite=("lighteval",),
- hf_repo="google-research-datasets/tydiqa",
- hf_subset="secondary_task",
- evaluation_splits=("validation",),
- few_shots_split="train",
- generation_size=400,
- stop_sequence=("\n",),
- metrics=(
- MultilingualQuasiExactMatchMetric(language, "prefix"),
- MultilingualQuasiF1ScoreMetric(language),
- ),
- )
- for language in [
- Language.ENGLISH,
- Language.ARABIC,
- Language.BENGALI,
- Language.FINNISH,
- Language.INDONESIAN,
- Language.JAPANESE,
- Language.KOREAN,
- Language.SWAHILI,
- Language.RUSSIAN,
- Language.TELUGU,
- Language.THAI,
- ]
-]
-
- # C3: A multiple-choice Chinese machine reading comprehension dataset.
- # Reading comprehension task, part of CLUE
-# Paper: https://arxiv.org/abs/2004.05986
-c3_tasks = [
- LightevalTaskConfig(
- name=f"c3_{Language.CHINESE.value}_{formulation.name.lower()}",
- suite=("lighteval",),
- prompt_function=get_mcq_prompt_function(
- Language.CHINESE,
- lambda line: {
- "question": line["question"],
- "choices": line["choice"],
- "gold_idx": line["choice"].index(line["answer"]),
- "context": " ".join(line["context"]),
- },
- formulation=formulation,
- ),
- hf_repo="clue/clue",
- hf_subset="c3",
- evaluation_splits=("validation",),
- few_shots_split="train",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-# Other MCF tasks for RC
-# RACE: Reading Comprehension from Examinations
-# RACE is a large-scale reading comprehension dataset collected from English exams for middle and high school Chinese students.
-# This Arabic version is a translation of the original RACE dataset, adapted for Arabic language evaluation.
-# Paper: https://aclanthology.org/2023.arabicnlp-1.21/
-race_ar_task = [
- LightevalTaskConfig(
- name=f"alghafa_race_{Language.ARABIC.value}_{formulation.name.lower()}",
- prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation),
- suite=["lighteval"],
- hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
- hf_subset="race_ar",
- hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff",
- hf_avail_splits=["test", "validation"],
- evaluation_splits=["test"],
- few_shots_split="validation",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-# SOQAL: A large-scale Arabic reading comprehension dataset.
-# https://arxiv.org/abs/1906.05394
-soqal_tasks = [
- LightevalTaskConfig(
- name=f"soqal_{Language.ARABIC.value}_{formulation.name.lower()}",
- hf_subset="multiple_choice_grounded_statement_soqal_task",
- prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation),
- evaluation_splits=["test"],
- few_shots_split="validation",
- suite=["lighteval"],
- hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-# MLQA (MultiLingual Question Answering) is a benchmark dataset for evaluating cross-lingual question answering performance.
-# It consists of QA instances in 7 languages: English, Arabic, German, Spanish, Hindi, Vietnamese, and Chinese.
-# The dataset is derived from the SQuAD v1.1 dataset, with questions and contexts translated by professional translators.
-# Paper: https://arxiv.org/abs/1910.07475
-mlqa_tasks = [
- LightevalTaskConfig(
- name=f"mlqa_{lang.value}",
- prompt_function=get_qa_prompt_function(
- lang,
- lambda line: {
- "context": line["context"],
- "question": line["question"],
- "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
- },
- ),
- suite=("lighteval",),
- hf_repo="facebook/mlqa",
- hf_subset=f"mlqa.{standardize_tag(lang.value)}.{standardize_tag(lang.value)}",
- hf_revision="397ed406c1a7902140303e7faf60fff35b58d285",
- evaluation_splits=("test",),
- hf_avail_splits=["test"],
- generation_size=400,
- stop_sequence=("\n",),
- metrics=[
- MultilingualQuasiExactMatchMetric(lang, "prefix"),
- MultilingualQuasiF1ScoreMetric(lang),
- ],
- )
- for lang in [
- Language.ARABIC,
- Language.GERMAN,
- Language.SPANISH,
- Language.CHINESE,
- Language.HINDI,
- Language.VIETNAMESE,
- ]
-]
-
-# Belebele: A large-scale reading comprehension dataset covering 122 languages.
-# https://arxiv.org/abs/2308.16884
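- # Subsets are named with an individual ISO 639-3 code plus script (e.g. "eng_Latn"); we map them to macrolanguages for the prompt templates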
-belebele_tasks = [
- LightevalTaskConfig(
- name=f"belebele_{language}_{formulation.name.lower()}",
- prompt_function=get_mcq_prompt_function(
- iso_639_3_ind_to_iso_639_3_macro[LangCodeLanguage.get(language).to_alpha3()],
- lambda line: {
- "question": line["question"],
- "context": line["flores_passage"],
- "choices": [line[f"mc_answer{i}"] for i in range(1, 5)],
- "gold_idx": int(line["correct_answer_num"]) - 1,
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="facebook/belebele",
- hf_subset=language,
- evaluation_splits=("test",),
- hf_avail_splits=["test"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
- for language in [
- "acm_Arab",
- "arz_Arab",
- "ceb_Latn",
- "fin_Latn",
- "hin_Deva",
- "ita_Latn",
- "khm_Khmr",
- "lvs_Latn",
- "npi_Deva",
- "pol_Latn",
- "slv_Latn",
- "swe_Latn",
- # "tso_Latn",
- # "xho_Latn",
- "afr_Latn",
- "asm_Beng",
- "ces_Latn",
- "fra_Latn",
- "hin_Latn",
- "jav_Latn",
- # "kin_Latn",
- "mal_Mlym",
- "npi_Latn",
- "por_Latn",
- # "sna_Latn",
- "swh_Latn",
- "tur_Latn",
- "yor_Latn",
- "als_Latn",
- "azj_Latn",
- "ckb_Arab",
- # "fuv_Latn",
- "hrv_Latn",
- "jpn_Jpan",
- "kir_Cyrl",
- "mar_Deva",
- # "nso_Latn",
- "snd_Arab",
- "tam_Taml",
- "ukr_Cyrl",
- "zho_Hans",
- "amh_Ethi",
- # "bam_Latn",
- "dan_Latn",
- # "gaz_Latn",
- "hun_Latn",
- # "kac_Latn",
- "kor_Hang",
- "mkd_Cyrl",
- # "nya_Latn",
- "ron_Latn",
- "som_Latn",
- "tel_Telu",
- "urd_Arab",
- "zho_Hant",
- "apc_Arab",
- "ben_Beng",
- "deu_Latn",
- # "grn_Latn",
- "hye_Armn",
- "kan_Knda",
- "lao_Laoo",
- "mlt_Latn",
- "ory_Orya",
- "rus_Cyrl",
- # "sot_Latn",
- "tgk_Cyrl",
- "urd_Latn",
- "zsm_Latn",
- "arb_Arab",
- "ben_Latn",
- "ell_Grek",
- "guj_Gujr",
- # "ibo_Latn",
- "kat_Geor",
- # "lin_Latn",
- # "mri_Latn",
- "pan_Guru",
- # "shn_Mymr",
- "spa_Latn",
- "tgl_Latn",
- "uzn_Latn",
- # "zul_Latn",
- "arb_Latn",
- # "bod_Tibt",
- "eng_Latn",
- # "hat_Latn",
- # "ilo_Latn",
- "kaz_Cyrl",
- "lit_Latn",
- "mya_Mymr",
- "pbt_Arab",
- "sin_Latn",
- "srp_Cyrl",
- "tha_Thai",
- "vie_Latn",
- "ars_Arab",
- "bul_Cyrl",
- "est_Latn",
- # "hau_Latn",
- "ind_Latn",
- # "kea_Latn",
- # "lug_Latn",
- "nld_Latn",
- "pes_Arab",
- "sin_Sinh",
- # "ssw_Latn",
- # "tir_Ethi",
- "war_Latn",
- "ary_Arab",
- "cat_Latn",
- "eus_Latn",
- "heb_Hebr",
- "isl_Latn",
- # "khk_Cyrl",
- # "luo_Latn",
- "nob_Latn",
- "plt_Latn",
- "slk_Latn",
- # "sun_Latn",
- # "tsn_Latn",
- # "wol_Latn",
- ]
-]
-
-TASKS_TABLE.extend(
- [
- *xquad_tasks,
- *thaiqa_tasks,
- *sber_squad_tasks,
- *arcd_tasks,
- *kenswquad_tasks,
- *chinese_squad_tasks,
- *cmrc2018_tasks,
- *indicqa_tasks,
- *fquad_v2_tasks,
- *tquad_v2_tasks,
- *tydiqa_tasks,
- *soqal_tasks,
- *race_ar_task,
- *belebele_tasks,
- *c3_tasks,
- *squad_it_tasks,
- *squad_es_tasks,
- *faquad_tasks,
- *germanquad_tasks,
- ]
-)
-
-# ------------------------------- GK Tasks ------------------------------- #
-# General Knowledge (GK) tasks evaluate a model's broad understanding across various domains.
-# These tasks typically involve answering questions on diverse subjects, testing the model's ability to recall and apply general information.
-
-
-# -------------------------------- MMLU -------------------------------- #
-# MMLU (Massive Multitask Language Understanding)
-# A comprehensive test of world knowledge, covering 57 subjects across STEM, humanities, social sciences, and more.
- # Note that all MMLU tasks use PMI normalization; this makes the computation about 2x slower, but we found this metric to be less noisy and to yield better results than the others.
-# Paper: https://arxiv.org/abs/2009.03300
-MMLU_SUBSETS = [
- "abstract_algebra",
- "anatomy",
- "astronomy",
- "business_ethics",
- "clinical_knowledge",
- "college_biology",
- "college_chemistry",
- "college_computer_science",
- "college_mathematics",
- "college_medicine",
- "college_physics",
- "computer_security",
- "conceptual_physics",
- "econometrics",
- "electrical_engineering",
- "elementary_mathematics",
- "formal_logic",
- "global_facts",
- "high_school_biology",
- "high_school_chemistry",
- "high_school_computer_science",
- "high_school_european_history",
- "high_school_geography",
- "high_school_government_and_politics",
- "high_school_macroeconomics",
- "high_school_mathematics",
- "high_school_microeconomics",
- "high_school_physics",
- "high_school_psychology",
- "high_school_statistics",
- "high_school_us_history",
- "high_school_world_history",
- "human_aging",
- "human_sexuality",
- "international_law",
- "jurisprudence",
- "logical_fallacies",
- "machine_learning",
- "management",
- "marketing",
- "medical_genetics",
- "miscellaneous",
- "moral_disputes",
- "moral_scenarios",
- "nutrition",
- "philosophy",
- "prehistory",
- "professional_accounting",
- "professional_law",
- "professional_medicine",
- "professional_psychology",
- "public_relations",
- "security_studies",
- "sociology",
- "us_foreign_policy",
- "virology",
- "world_religions",
-]
-
- # Meta MMLU: A multilingual version of MMLU (using Google translation)
-# Paper: https://arxiv.org/abs/2407.21783
-meta_mmlu_tasks = [
- LightevalTaskConfig(
- name=f"meta_mmlu_{language.value}_{formulation.name.lower()}:{subset}",
- prompt_function=get_mcq_prompt_function(
- language,
- lambda line: {
- "question": line["input_question"],
- "choices": [v for _, v in sorted(line["input_choice_list"].items(), key=lambda x: x[0])],
- "gold_idx": LETTER_INDICES.index(line["input_correct_responses"][0]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="meta-llama/Meta-Llama-3.1-8B-Instruct-evals",
- hf_subset=f"Llama-3.1-8B-Instruct-evals__multilingual_mmlu_{standardize_tag(language.value)}__details",
- hf_filter=partial(
- lambda language, subset, line: line["subtask_name"]
- == f"mmlu_{standardize_tag(language.value)}_chat.{subset}",
- language,
- subset,
- ),
- evaluation_splits=("latest",),
- hf_avail_splits=["latest"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
- ],
- ),
- )
- for subset in MMLU_SUBSETS
- for language in [
- Language.GERMAN,
- Language.SPANISH,
- Language.FRENCH,
- Language.HINDI,
- Language.ITALIAN,
- Language.PORTUGUESE,
- Language.THAI,
- ]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-# MLMM MMLU: Another multilingual version of MMLU
- # GitHub: https://github.com/nlp-uoregon/mlmm-evaluation
-mlmm_mmlu_tasks = [
- LightevalTaskConfig(
- name=f"mlmm_mmlu_{language.value}_{formulation.name.lower()}:{subset}",
- prompt_function=get_mcq_prompt_function(
- language,
- lambda line: {
- "question": line["question"],
- "choices": line["choices"],
- "gold_idx": LETTER_INDICES.index(line["answer"]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="jon-tow/okapi_mmlu",
- hf_subset=standardize_tag(language.value),
- hf_revision="refs/pr/1",
- hf_filter=partial(lambda subset, line: line["id"].split("/")[0] == subset, subset),
- evaluation_splits=("test",),
- few_shots_split="dev",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
- ],
- ),
- )
- for subset in MMLU_SUBSETS
- for language in [
- Language.RUSSIAN,
- Language.GERMAN,
- Language.CHINESE,
- Language.FRENCH,
- Language.SPANISH,
- Language.ITALIAN,
- Language.DUTCH,
- Language.VIETNAMESE,
- Language.INDONESIAN,
- Language.ARABIC,
- Language.HUNGARIAN,
- Language.ROMANIAN,
- Language.DANISH,
- Language.SLOVAK,
- Language.UKRAINIAN,
- Language.CATALAN,
- Language.SERBIAN,
- Language.CROATIAN,
- Language.HINDI,
- Language.BENGALI,
- Language.TAMIL,
- Language.NEPALI,
- Language.MALAYALAM,
- Language.MARATHI,
- Language.TELUGU,
- Language.KANNADA,
- ]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
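- # OpenAI MMMLU: multilingual translations of MMLU released by OpenAI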
-openai_mmlu_tasks = [
- LightevalTaskConfig(
- name=f"openai_mmlu_{language[0].value}_{formulation.name.lower()}:{subset}",
- prompt_function=get_mcq_prompt_function(
- language[0],
- lambda line: {
- "question": line["Question"],
- "choices": [line["A"], line["B"], line["C"], line["D"]],
- "gold_idx": LETTER_INDICES.index(line["Answer"]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="openai/MMMLU",
- hf_subset=language[1],
- evaluation_splits=("test",),
- hf_avail_splits=["test"],
- hf_filter=partial(lambda subset, x: x["Subject"].lower() == subset, subset),
- hf_revision="038c7808122969ead7456361af05cb8f47d247f8",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
- ],
- ),
- )
- for subset in MMLU_SUBSETS
- for language in [
- (Language.ARABIC, "AR_XY"),
- (Language.BENGALI, "BN_BD"),
- (Language.GERMAN, "DE_DE"),
- (Language.SPANISH, "ES_LA"),
- (Language.FRENCH, "FR_FR"),
- (Language.HINDI, "HI_IN"),
- (Language.INDONESIAN, "ID_ID"),
- (Language.ITALIAN, "IT_IT"),
- (Language.JAPANESE, "JA_JP"),
- (Language.KOREAN, "KO_KR"),
- (Language.PORTUGUESE, "PT_BR"),
- (Language.SWAHILI, "SW_KE"),
- (Language.YORUBA, "YO_NG"),
- (Language.CHINESE, "ZH_CN"),
- ]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-# Translated MMLU using both professional and non-professional translators. Contains tags for cultural sensitivity.
-# CA: Cultural Agnostic
-# CS: Cultural Specific
-# UNK: Not annotated
-# ALL: All of the above
-# https://huggingface.co/papers/2412.03304
-global_mmlu_tasks = [
- LightevalTaskConfig(
- name=f"global_mmlu_{sensitivity_label.lower()}_{language.value}_{formulation.name.lower()}:{subset}",
- prompt_function=get_mcq_prompt_function(
- language,
- lambda line: {
- "question": line["question"],
- "choices": [line["option_a"], line["option_b"], line["option_c"], line["option_d"]],
- "gold_idx": LETTER_INDICES.index(line["answer"]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="CohereForAI/Global-MMLU",
- hf_subset=standardize_tag(language.value),
- evaluation_splits=("test",),
- few_shots_split="dev",
- hf_filter=partial(
- lambda subset, sensitivity_label, x: x["subject"].lower() == subset
- and (
- sensitivity_label == "ALL" or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK")
- )
- and all(x[f"option_{opt}"] is not None and x[f"option_{opt}"].strip() for opt in "abcd"),
- subset,
- sensitivity_label,
- ),
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
- ],
- ),
- )
- for subset in MMLU_SUBSETS
- for language in [
- Language.AMHARIC,
- Language.ARABIC,
- Language.BENGALI,
- Language.CHINESE,
- Language.CZECH,
- Language.GERMAN,
- Language.ENGLISH,
- Language.SPANISH,
- Language.FRENCH,
- Language.HEBREW,
- Language.HINDI,
- Language.INDONESIAN,
- Language.ITALIAN,
- Language.JAPANESE,
- Language.KOREAN,
- Language.MALAY,
- Language.DUTCH,
- Language.NORWEGIAN,
- Language.POLISH,
- Language.PORTUGUESE,
- Language.ROMANIAN,
- Language.RUSSIAN,
- Language.SERBIAN,
- Language.SWEDISH,
- Language.SWAHILI,
- Language.TAMIL,
- Language.TELUGU,
- Language.THAI,
- Language.TURKISH,
- Language.UKRAINIAN,
- Language.URDU,
- Language.VIETNAMESE,
- Language.YORUBA,
- Language.ZULU,
- ]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
- for sensitivity_label in ["ALL", "CA", "CS", "UNK"]
-]
-
-
- # Only these subsets are available in the African MMLU
-AFRI_MMLU_SUBSETS = [
- "elementary_mathematics",
- "high_school_mathematics",
- "high_school_geography",
- "high_school_microeconomics",
- "international_law",
- "global_facts",
-]
-# African MMLU: African Massive Multitask Language Understanding
-# From https://arxiv.org/abs/2406.03368. Human translated MMLU.
-afri_mmlu_tasks = [
- LightevalTaskConfig(
- name=f"afri_mmlu_{language.value}_{formulation.name.lower()}:{subset}",
- prompt_function=get_mcq_prompt_function(
- language,
- lambda line: {
- "question": line["question"],
- "choices": line["choices"],
- "gold_idx": LETTER_INDICES.index(line["answer"]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="masakhane/afrimmlu",
- # Temporary until the PR is merged
- hf_revision="refs/pr/1",
- hf_subset=language.value,
- hf_filter=partial(lambda subset, line: line["subject"] == subset, subset),
- evaluation_splits=("test",),
- few_shots_split="dev",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
- ],
- ),
- )
- for subset in AFRI_MMLU_SUBSETS
- for language in [
- Language.AMHARIC,
- # Language.EWE,
- Language.FRENCH,
- # Language.HAUSA,
- # Language.IGBO,
- # Language.KINYARWANDA,
- # Language.LINGALA,
- # Language.LUGANDA,
- # Language.OROMO,
- # Language.SHONA,
- # Language.SOTHO,
- Language.SWAHILI,
- # Language.TWI,
- # Language.WOLOF,
- # Language.XHOSA,
- Language.YORUBA,
- # Language.ZULU,
- ]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-# RUMMLU: Russian Massive Multitask Language Understanding
-# Paper: https://arxiv.org/html/2401.04531v2
-rummlu = [
- LightevalTaskConfig(
- name=f"rummlu_{Language.RUSSIAN.value}_{formulation.name.lower()}:{subset}",
- prompt_function=get_mcq_prompt_function(
- Language.RUSSIAN,
- lambda line: {
- "question": line["inputs"]["text"],
- "choices": [line["inputs"][f"option_{i.lower()}"] for i in LETTER_INDICES[:4]],
- "gold_idx": LETTER_INDICES.index(line["outputs"]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="ai-forever/MERA",
- hf_subset="rummlu",
- hf_filter=partial(lambda subset, x: x["meta"]["domain"] == subset, subset),
- evaluation_splits=("public_test",),
- hf_avail_splits=["public_test"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
- ],
- ),
- )
- for subset in MMLU_SUBSETS
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-# MMLU Turkish: Turkish version of MMLU
- # Translated using OpenAI GPT
-mmlu_turkish = [
- LightevalTaskConfig(
- name=f"community_mmlu_{Language.TURKISH.value}_{formulation.name.lower()}:{subset}",
- prompt_function=get_mcq_prompt_function(
- Language.TURKISH,
- lambda line: {"question": line["question"], "choices": line["choices"], "gold_idx": int(line["answer"])},
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="malhajar/mmlu_tr-v0.2",
- hf_subset=subset,
- evaluation_splits=("test",),
- few_shots_split="dev",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
- ],
- ),
- )
- for subset in MMLU_SUBSETS
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-# CMMLU: Chinese Massive Multitask Language Understanding
- # Natively constructed (not a translation of MMLU), with some new categories
-# Paper: https://arxiv.org/abs/2306.09212
-CMMLU_SUBSETS = [
- "agronomy",
- "anatomy",
- "ancient_chinese",
- "arts",
- "astronomy",
- "business_ethics",
- "chinese_civil_service_exam",
- "chinese_driving_rule",
- "chinese_food_culture",
- "chinese_foreign_policy",
- "chinese_history",
- "chinese_literature",
- "chinese_teacher_qualification",
- "clinical_knowledge",
- "college_actuarial_science",
- "college_education",
- "college_engineering_hydrology",
- "college_law",
- "college_mathematics",
- "college_medical_statistics",
- "college_medicine",
- "computer_science",
- "computer_security",
- "conceptual_physics",
- "construction_project_management",
- "economics",
- "education",
- "electrical_engineering",
- "elementary_chinese",
- "elementary_commonsense",
- "elementary_information_and_technology",
- "elementary_mathematics",
- "ethnology",
- "food_science",
- "genetics",
- "global_facts",
- "high_school_biology",
- "high_school_chemistry",
- "high_school_geography",
- "high_school_mathematics",
- "high_school_physics",
- "high_school_politics",
- "human_sexuality",
- "international_law",
- "journalism",
- "jurisprudence",
- "legal_and_moral_basis",
- "logical",
- "machine_learning",
- "management",
- "marketing",
- "marxist_theory",
- "modern_chinese",
- "nutrition",
- "philosophy",
- "professional_accounting",
- "professional_law",
- "professional_medicine",
- "professional_psychology",
- "public_relations",
- "security_study",
- "sociology",
- "sports_science",
- "traditional_chinese_medicine",
- "virology",
- "world_history",
- "world_religions",
-]
-
-cmmlu_tasks = [
- LightevalTaskConfig(
- name=f"cmmlu_{Language.CHINESE.value}_{formulation.name.lower()}:{subset}",
- prompt_function=get_mcq_prompt_function(
- Language.CHINESE,
- lambda line: {
- "question": line["Question"],
- "choices": [line["A"], line["B"], line["C"], line["D"]],
- "gold_idx": LETTER_INDICES.index(line["Answer"]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="haonan-li/cmmlu",
- hf_subset=subset,
- evaluation_splits=("test",),
- few_shots_split="dev",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
- ],
- ),
- )
- for subset in CMMLU_SUBSETS
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-# Arabic MMLU: Arabic version of MMLU
- # Natively constructed (not a translation of MMLU), with some new categories
-# Paper: https://arxiv.org/html/2402.12840v1
-ARABIC_MMLU_SUBSETS = [
- "Islamic Studies",
- "Islamic Studies (Middle School)",
- "Islamic Studies (Primary School)",
- "Islamic Studies (High School)",
- "Driving Test",
- "Natural Science (Middle School)",
- "Natural Science (Primary School)",
- "History (Middle School)",
- "History (Primary School)",
- "History (High School)",
- "General Knowledge",
- "General Knowledge (Middle School)",
- "General Knowledge (Primary School)",
- "Law (Professional)",
- "Physics (High School)",
- "Social Science (Middle School)",
- "Social Science (Primary School)",
- "Management (University)",
- "Arabic Language (Middle School)",
- "Arabic Language (Primary School)",
- "Arabic Language (High School)",
- "Political Science (University)",
- "Philosophy (High School)",
- "Accounting (University)",
- "Computer Science (Middle School)",
- "Computer Science (Primary School)",
- "Computer Science (High School)",
- "Computer Science (University)",
- "Geography (Middle School)",
- "Geography (Primary School)",
- "Geography (High School)",
- "Math (Primary School)",
- "Biology (High School)",
- "Economics (Middle School)",
- "Economics (High School)",
- "Economics (University)",
- "Arabic Language (General)",
- "Arabic Language (Grammar)",
- "Civics (Middle School)",
- "Civics (High School)",
-]
-
-arabic_mmlu_tasks = [
- LightevalTaskConfig(
- name=f"mmlu_{Language.ARABIC.value}_{formulation.name.lower()}:{normalize_subset(subset)}",
- prompt_function=get_mcq_prompt_function(
- Language.ARABIC,
- lambda line: {
- "context": line["Context"],
- "question": line["Question"],
- "choices": [str(o) for o in [line[f"Option {i}"] for i in range(1, 6)] if o],
- "gold_idx": LETTER_INDICES.index(line["Answer Key"]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="MBZUAI/ArabicMMLU",
- hf_subset=subset,
- evaluation_splits=("test",),
- hf_avail_splits=["dev"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
- ],
- ),
- )
- for subset in ARABIC_MMLU_SUBSETS
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-
-TURKISH_MMLU_SUBSET = [
- "Biology",
- "Chemistry",
- "Geography",
- "History",
- "Mathematics",
- "Philosophy",
- "Physics",
- "Religion_and_Ethics",
- "Turkish_Language_and_Literature",
-]
-
-turkish_mmlu_tasks = [
- LightevalTaskConfig(
- name=f"mmlu_{Language.TURKISH.value}_{formulation.name.lower()}:{normalize_subset(subset)}",
- prompt_function=get_mcq_prompt_function(
- Language.TURKISH,
- lambda line: {
- "question": line["question"],
- "choices": line["choices"],
- "gold_idx": LETTER_INDICES.index(line["answer"]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="AYueksel/TurkishMMLU",
- hf_subset=subset,
- evaluation_splits=("test",),
- few_shots_split="dev",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
- ],
- ),
- )
- for subset in TURKISH_MMLU_SUBSET
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-TASKS_TABLE.extend(
- [
- *meta_mmlu_tasks,
- *mlmm_mmlu_tasks,
- *rummlu,
- *mmlu_turkish,
- *cmmlu_tasks,
- *openai_mmlu_tasks,
- *arabic_mmlu_tasks,
- *turkish_mmlu_tasks,
- *afri_mmlu_tasks,
- *global_mmlu_tasks,
- ]
-)
-
-
-# ---------------------------- ARC ---------------------------- #
-# ARC (AI2 Reasoning Challenge) is a dataset for question answering that requires reasoning.
-# It consists of multiple-choice science questions from 3rd to 9th grade exams.
-# The dataset is split into two parts: ARC-Easy and ARC-Challenge.
-# ARC-Easy contains questions that simple retrieval-based and word co-occurrence baselines answer correctly.
-# ARC-Challenge contains questions that both of those baseline methods answer incorrectly.
-
-# Similar to MMLU, ARC tasks use PMI normalization by default, but only for the challenge set.
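-# (Roughly, PMI normalization scores each choice by log P(choice | question) minus log P(choice | an
-# uninformative context), reducing the bias towards answer strings that are a priori likely.)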
-
-
-# github: https://github.com/nlp-uoregon/mlmm-evaluation
-mlmm_arc_challenge_tasks = [
- LightevalTaskConfig(
- name=f"mlmm_arc_{language.value}_{formulation.name.lower()}:challenge",
- prompt_function=get_mcq_prompt_function(
- language,
- lambda line: {
- "question": line["question"],
- "choices": line["choices"]["text"],
- "gold_idx": int(line["answerKey"]) - 1
- if line["answerKey"].isdigit()
- else LETTER_INDICES.index(line["answerKey"]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="jon-tow/okapi_arc_challenge",
- hf_subset=standardize_tag(language.value),
- hf_revision="823d5d7bfaf8974a3ab52a825b6cf4903b35dbc4",
- evaluation_splits=("test",),
- few_shots_split="train",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
- ],
- ),
- )
- for language in [
- Language.RUSSIAN,
- Language.GERMAN,
- Language.CHINESE,
- Language.FRENCH,
- Language.SPANISH,
- Language.ITALIAN,
- Language.DUTCH,
- Language.VIETNAMESE,
- Language.INDONESIAN,
- Language.ARABIC,
- Language.HUNGARIAN,
- Language.ROMANIAN,
- Language.DANISH,
- Language.SLOVAK,
- Language.UKRAINIAN,
- Language.CATALAN,
- Language.SERBIAN,
- Language.CROATIAN,
- Language.HINDI,
- Language.BENGALI,
- Language.TAMIL,
- Language.NEPALI,
- Language.MALAYALAM,
- Language.MARATHI,
- Language.TELUGU,
- Language.KANNADA,
- ]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-# Arabic ARC Easy
-# It's based on the community Arabic leaderboard task but uses
-# the multilingual template
-# Paper: https://aclanthology.org/2023.arabicnlp-1.21/
-arabic_ledarboard_arc_easy = [
- LightevalTaskConfig(
- name=f"alghafa_arc_{Language.ARABIC.value}_{formulation.name.lower()}:easy",
- prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation),
- suite=["lighteval"],
- hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
- hf_subset="arc_easy_ar",
- hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff",
- evaluation_splits=["test"],
- few_shots_split="validation",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-
-lumi_arc = [
- LightevalTaskConfig(
- name=f"lumi_arc_{language.value}_{formulation.name.lower()}:challenge",
- prompt_function=get_mcq_prompt_function(
- language,
- lambda line: {
- "question": line["question"],
- "choices": line["choices"]["text"],
- "gold_idx": int(line["answerKey"]) - 1
- if line["answerKey"].isdigit()
- else LETTER_INDICES.index(line["answerKey"]),
- },
- formulation=formulation,
- ),
- suite=["lighteval"],
- hf_repo="LumiOpen/arc_challenge_mt",
- hf_subset=standardize_tag(language.value),
- evaluation_splits=["test"],
- few_shots_split="validation",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
- ],
- ),
- )
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
- for language in [
- Language.DANISH,
- Language.GERMAN,
- Language.GREEK,
- Language.SPANISH,
- Language.FINNISH,
- Language.HUNGARIAN,
- Language.ITALIAN,
- # Language.NORWEGIAN_BOKMAL,
- Language.POLISH,
- Language.PORTUGUESE,
- Language.SWEDISH,
- ]
-]
-
-# Turkish ARC
-# Comes from the Turkish leaderboard
-turkish_arc_tasks = [
- LightevalTaskConfig(
- name=f"community_arc_{Language.TURKISH.value}_{formulation.name.lower()}:{subset}",
- prompt_function=get_mcq_prompt_function(
- Language.TURKISH,
- lambda line: {
- "question": line["question"],
- "choices": line["choices"]["text"],
- "gold_idx": int(line["answerKey"]) - 1
- if line["answerKey"].isdigit()
- else LETTER_INDICES.index(line["answerKey"]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="malhajar/arc-tr",
- hf_subset=f"ARC-{subset.capitalize()}",
- evaluation_splits=("test",),
- hf_avail_splits=["train"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ]
- + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore
- ),
- )
- for subset in ["easy", "challenge"]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-hindi_arc_tasks = [
- LightevalTaskConfig(
- name=f"community_arc_{Language.HINDI.value}_{formulation.name.lower()}:{subset}",
- prompt_function=get_mcq_prompt_function(
- Language.HINDI,
- lambda line: {
- "question": line["question"],
- "choices": line["choices"]["text"],
- "gold_idx": int(line["answerKey"]) - 1
- if line["answerKey"].isdigit()
- else LETTER_INDICES.index(line["answerKey"]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="ai4bharat/ai2_arc-hi",
- hf_subset=f"ARC-{subset.capitalize()}",
- evaluation_splits=("test",),
- few_shots_split="validation",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ]
- + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore
- ),
- )
- for subset in ["easy", "challenge"]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-arabic_arc_tasks = [
- LightevalTaskConfig(
- name=f"alghafa_arc_{Language.ARABIC.value}_{formulation.name.lower()}:easy",
- prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation),
- suite=["lighteval"],
- hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
- hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff",
- hf_subset="arc_easy_ar",
- evaluation_splits=["test"],
- few_shots_split="validation",
- few_shots_select="sequential",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-swahili_arc_tasks = [
- LightevalTaskConfig(
- name=f"community_arc_{Language.SWAHILI.value}_{formulation.name.lower()}:{subset}",
- prompt_function=get_mcq_prompt_function(
- Language.SWAHILI,
- lambda line: {
- "question": line["question"],
- "choices": line["choices"]["text"],
- "gold_idx": int(line["answerKey"]) - 1
- if line["answerKey"].isdigit()
- else LETTER_INDICES.index(line["answerKey"]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo=f"Mollel/ARC_{subset.capitalize()}_SWH",
- hf_subset="default",
- hf_revision="5347439d3193c8a0dabaab3819914bf076dc94d4"
- if subset == "easy"
- else "dc1df9df632d14c251594d9129fb833d2ca4429c",
- evaluation_splits=("test",),
- few_shots_split="train",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ]
- + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore
- ),
- )
- for subset in ["easy", "challenge"]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-
-TASKS_TABLE.extend(
- [
- *mlmm_arc_challenge_tasks,
- *arabic_ledarboard_arc_easy,
- *lumi_arc,
- *turkish_arc_tasks,
- *hindi_arc_tasks,
- *swahili_arc_tasks,
- *arabic_arc_tasks,
- ]
-)
-
-# ---------------------------- TruthfulQA ---------------------------- #
-# TruthfulQA: Measuring How Models Mimic Human Falsehoods
-# Paper: https://arxiv.org/abs/2109.07958
-# TruthfulQA is a benchmark dataset designed to measure the truthfulness of language models.
-# It consists of questions that humans might answer incorrectly due to false beliefs or misconceptions.
-# The task evaluates a model's ability to provide truthful answers and avoid common human biases.
-
-# github: https://github.com/nlp-uoregon/mlmm-evaluation
-mlmm_truthfulqa_tasks = [
- LightevalTaskConfig(
- name=f"mlmm_truthfulqa_{language.value}_{formulation.name.lower()}:{subset}",
- prompt_function=get_mcq_prompt_function(
- language,
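-            # partial(...) binds the current `subset` eagerly; a plain lambda would late-bind
-            # the comprehension variable and every task would end up using the last subset.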
- partial(
- lambda subset, line: {
- "question": line["question"],
- "choices": line[f"{subset}_targets"]["choices"],
- "gold_idx": [ix for ix, label in enumerate(line[f"{subset}_targets"]["labels"]) if label == 1], # type: ignore
- },
- subset,
- ),
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="jon-tow/okapi_truthfulqa",
- hf_subset=standardize_tag(language.value),
- hf_revision="cdd5db1a66fd04105622109d1c2a5cbc8cde7586",
- evaluation_splits=("validation",),
- hf_avail_splits=["validation"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for subset in ["mc1", "mc2"]
- for language in [
- Language.ARABIC,
- Language.BENGALI,
- Language.CATALAN,
- Language.DANISH,
- Language.GERMAN,
- Language.SPANISH,
- Language.BASQUE,
- Language.FRENCH,
- Language.GUJARATI,
- Language.HINDI,
- Language.CROATIAN,
- Language.HUNGARIAN,
- Language.ARMENIAN,
- Language.INDONESIAN,
- Language.ICELANDIC,
- Language.ITALIAN,
- Language.KANNADA,
- Language.MALAYALAM,
- Language.MARATHI,
- Language.NORWEGIAN,
- Language.NEPALI,
- Language.DUTCH,
- Language.PORTUGUESE,
- Language.ROMANIAN,
- Language.RUSSIAN,
- Language.SLOVAK,
- Language.SERBIAN,
- Language.SWEDISH,
- Language.TAMIL,
- Language.TELUGU,
- Language.UKRAINIAN,
- Language.VIETNAMESE,
- Language.CHINESE,
- ]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-# Turkish TruthfulQA
-# Based on the Turkish leaderboard
-turkish_truthfulqa = [
- LightevalTaskConfig(
- name=f"community_truthfulqa_{Language.TURKISH.value}_{formulation.name.lower()}:{subset}",
- prompt_function=get_mcq_prompt_function(
- Language.TURKISH,
- partial(
- lambda subset, line: {
- "question": line["question"],
- "choices": line[f"{subset}_targets"]["choices"],
- "gold_idx": [ix for ix, label in enumerate(line[f"{subset}_targets"]["labels"]) if label == 1], # type: ignore
- },
- subset,
- ),
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="malhajar/truthful_qa-tr-v0.2",
- hf_subset="default",
- evaluation_splits=("validation",),
- hf_avail_splits=["validation"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for subset in ["mc1", "mc2"]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-TASKS_TABLE.extend(
- [
- *mlmm_truthfulqa_tasks,
- *turkish_truthfulqa,
- ]
-)
-
-# ---------------------------- Exam-like tasks ---------------------------- #
-
-# Exams: A collection of exam questions from various countries and subjects
-# Paper: https://arxiv.org/abs/2011.03080
-exams_subjects_by_lang: dict[Language, set[str]] = {
- Language.ARABIC: {"Biology", "Islamic Studies", "Physics", "Science", "Social"},
- Language.BULGARIAN: {"Biology", "Chemistry", "Geography", "History", "Philosophy", "Physics"},
- Language.CROATIAN: {
- "Biology",
- "Chemistry",
- "Ethics",
- "Fine Arts",
- "Geography",
- "Geology",
- "History",
- "Informatics",
- "Philosophy",
- "Physics",
- "Politics",
- "Psychology",
- "Religion",
- "Sociology",
- },
- Language.HUNGARIAN: {
- "Agriculture",
- "Agriculture (Mechanical knowledge)",
- "Biology",
- "Chemistry",
- "Economics",
- "Economics & Marketing",
- "Economics Basics (Business)",
- "Economics Basics (Theoretical)",
- "Forestry",
- "Geography",
- "Landscaping",
- "Physics",
- "Politics",
- "Tourism",
- },
- Language.ITALIAN: {
- "Biology",
- "Chemistry",
- "Ethics",
- "Geography",
- "Geology",
- "History",
- "Informatics",
- "Philosophy",
- "Physics",
- "Politics",
- "Psychology",
- "Sociology",
- },
- Language.SERBIAN: {
- "Biology",
- "Chemistry",
- "Ethics",
- "Geography",
- "Geology",
- "History",
- "Informatics",
- "Philosophy",
- "Physics",
- "Politics",
- "Psychology",
- "Religion",
- "Sociology",
- },
- Language.FRENCH: {"Economics", "Economics & Marketing", "Economics Basics (Theoretical)", "Geography", "Physics"},
- Language.GERMAN: {
- "Chemistry",
- "Economics",
- "Economics & Marketing",
- "Economics Basics (Theoretical)",
- "Geography",
- "Physics",
- "Tourism",
- },
- Language.SPANISH: {"Geography", "Physics"},
- Language.LITHUANIAN: {"Geology", "History"},
- Language.ALBANIAN: {
- "Biology",
- "Business",
- "Chemistry",
- "Fine Arts",
- "History",
- "Philosophy",
- "Physics",
- "Sociology",
- },
- Language.MACEDONIAN: {
- "Biology",
- "Business",
- "Chemistry",
- "Fine Arts",
- "History",
- "Philosophy",
- "Physics",
- "Sociology",
- },
- Language.TURKISH: {
- "Biology",
- "Business",
- "Chemistry",
- "Geography",
- "History",
- "Philosophy",
- "Physics",
- "Sociology",
- },
- Language.POLISH: {"Professional"},
- Language.PORTUGUESE: {"Biology", "Economics", "Geology", "Philosophy"},
- Language.VIETNAMESE: {"Biology", "Chemistry", "Citizenship", "Geography", "History", "Physics"},
-}
-
-exams_tasks = [
- LightevalTaskConfig(
- name=f"exams_{language.value}_{formulation.name.lower()}:{normalize_subset(subject)}",
- prompt_function=get_mcq_prompt_function(
- language,
- lambda line: {
- "question": line["question"]["stem"],
- "choices": line["question"]["choices"]["text"],
- "gold_idx": line["question"]["choices"]["label"].index(line["answerKey"]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="mhardalov/exams",
- hf_subset="multilingual",
-        # Some rows have answerKey == "@" (dataset bug); filter them out and keep only this language and subject
- hf_filter=partial(
- lambda language, subject, line: line["answerKey"] != "@"
- and line["info"]["language"] == LangCodeLanguage(standardize_tag(language.value)).language_name()
- and line["info"]["subject"] == subject,
- language,
- subject,
- ),
- evaluation_splits=("test",),
- few_shots_split="train",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for language in exams_subjects_by_lang.keys()
- for subject in exams_subjects_by_lang[language]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-# M3Exam: Multitask Multilingual Multimodal Evaluation Benchmark
-# It also contains a multimodal version but we don't support that
-# Paper: https://arxiv.org/abs/2306.05179
-m3exams_tasks = [
- LightevalTaskConfig(
- name=f"m3exams_{language.value}_{formulation.name.lower()}",
- suite=("lighteval",),
- prompt_function=get_mcq_prompt_function(
- language,
- partial(get_m3exam_adapter, language),
- formulation=formulation,
- ),
- hf_repo="chiayewken/m3exam",
- hf_subset=LangCodeLanguage(standardize_tag(language.value)).language_name().lower(),
- evaluation_splits=("test",),
- few_shots_split="dev",
- generation_size=-1,
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for language in [
- Language.AFRIKAANS,
- Language.CHINESE,
- Language.ENGLISH,
- Language.ITALIAN,
- Language.JAVANESE,
- Language.PORTUGUESE,
- Language.SWAHILI,
- Language.THAI,
- Language.VIETNAMESE,
- ]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-# Thai Exams
-# We noticed very poor model performance on this dataset;
-# however, this may simply reflect the quality of the models themselves
-# Paper: https://arxiv.org/abs/2312.13951
-
-THAI_EXAMS_SUBSETS = ["a_level", "ic", "onet", "tgat", "tpat1"]
-
-thai_exams_tasks = [
- LightevalTaskConfig(
- name=f"thai_exams_{Language.THAI.value}_{formulation.name.lower()}:{subset}",
- prompt_function=get_mcq_prompt_function(Language.THAI, thai_exams_adapter, formulation=formulation),
- suite=("lighteval",),
- hf_repo="scb10x/thai_exam",
- hf_subset=subset,
- evaluation_splits=("test",),
- few_shots_split="train",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for subset in THAI_EXAMS_SUBSETS
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-TASKS_TABLE.extend(
- [
- *exams_tasks,
- *m3exams_tasks,
- *thai_exams_tasks,
- ]
-)
-
-# ------------------------------- XCSQA ------------------------------- #
-# XCSQA (Cross-lingual Commonsense QA) is part of the XCSR (Cross-lingual Commonsense Reasoning) benchmark
-# It is a multilingual extension of the CommonsenseQA dataset, covering 16 languages
-# The task involves answering multiple-choice questions that require commonsense reasoning
-# Uses PMI normalization
-# Paper: https://arxiv.org/abs/2110.08462
-xcsqa_tasks = [
- LightevalTaskConfig(
- name=f"xcsqa_{language.value}_{formulation.name.lower()}",
- prompt_function=get_mcq_prompt_function(
- language,
- lambda line: {
- "question": line["question"]["stem"],
- "choices": line["question"]["choices"]["text"],
- "gold_idx": line["question"]["choices"]["label"].index(line["answerKey"]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="INK-USC/xcsr",
- hf_subset=f"X-CSQA-{standardize_tag(language.value) if language != Language.JAPANESE else 'jap'}",
- hf_filter=lambda x: all(
- len(x["question"]["choices"]["text"][i].strip()) > 0 for i in range(len(x["question"]["choices"]["text"]))
- ),
- evaluation_splits=("validation",),
- hf_avail_splits=["validation"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
- ],
- ),
- )
- for language in [
- Language.ARABIC,
- Language.GERMAN,
- Language.ENGLISH,
- Language.SPANISH,
- Language.FRENCH,
- Language.HINDI,
- Language.ITALIAN,
- Language.JAPANESE,
- Language.DUTCH,
- Language.POLISH,
- Language.PORTUGUESE,
- Language.RUSSIAN,
- Language.SWAHILI,
- Language.URDU,
- Language.VIETNAMESE,
- Language.CHINESE,
- ]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-TASKS_TABLE.extend(
- [
- *xcsqa_tasks,
- ]
-)
-
-# ------------------------------- PIQA ------------------------------- #
-# PIQA: Physical Interaction Question Answering
-# PIQA is a benchmark for testing physical commonsense reasoning.
-# This Arabic version is a translation of the original PIQA dataset, adapted for Arabic language evaluation.
-# It tests the ability to reason about physical interactions in everyday situations.
-# Paper: https://arxiv.org/abs/1911.11641
-# Arabic version: https://aclanthology.org/2023.arabicnlp-1.21/
-piqa_ar_tasks = [
- LightevalTaskConfig(
- name=f"alghafa_piqa_{Language.ARABIC.value}_{formulation.name.lower()}",
- prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation),
- suite=["lighteval"],
- hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
- hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff",
- hf_subset="piqa_ar",
- hf_avail_splits=["test", "validation"],
- evaluation_splits=["test"],
- few_shots_split="validation",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-TASKS_TABLE.extend(
- [
- *piqa_ar_tasks,
- ]
-)
-
-# ------------------------------- OpenBookQA ------------------------------- #
-# OpenBookQA: A Question-Answering Dataset for Open-Book Exams
-# OpenBookQA is a question-answering dataset modeled after open-book exams for assessing human understanding of a subject.
-# It consists of multiple-choice questions that require combining facts from a given open book with broad common knowledge.
-# The task tests language models' ability to leverage provided information and apply common sense reasoning.
-# Original paper: https://arxiv.org/abs/1809.02789
-# Arabic version: https://aclanthology.org/2023.arabicnlp-1.21/
-openbook_ara_tasks = [
- LightevalTaskConfig(
- name=f"alghafa_openbookqa_{Language.ARABIC.value}_{formulation.name.lower()}",
- prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation),
- suite=["lighteval"],
- hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
- hf_subset="openbook_qa_ext_ar",
- hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff",
- evaluation_splits=["test"],
- few_shots_split="validation",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-# Spanish version of OpenBookQA from BSC Language Technology group
-# Dataset: https://huggingface.co/datasets/BSC-LT/openbookqa-es
-openbook_es_tasks = [
- LightevalTaskConfig(
- name=f"openbookqa_{Language.SPANISH.value}_{formulation.name.lower()}",
- prompt_function=get_mcq_prompt_function(
- Language.SPANISH,
- lambda line: {
- "question": line["question_stem"],
- "choices": line["choices"]["text"],
- "gold_idx": LETTER_INDICES.index(line["answerKey"]),
- },
- formulation=formulation,
- ),
- suite=["lighteval"],
- hf_repo="BSC-LT/openbookqa-es",
- hf_subset="default",
- evaluation_splits=("test",),
- few_shots_split="validation",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-
-# The Russian version is part of the MERA benchmark suite for Russian-language models.
-# Paper: https://arxiv.org/abs/2401.04531
-openbook_rus_tasks = [
- LightevalTaskConfig(
- name=f"mera_openbookqa_{Language.RUSSIAN.value}_{formulation.name.lower()}",
- prompt_function=get_mcq_prompt_function(
- Language.RUSSIAN,
- lambda line: {
- "question": line["inputs"]["question"],
- "choices": [line["inputs"][f"option_{i.lower()}"] for i in LETTER_INDICES[:4]],
- "gold_idx": LETTER_INDICES.index(line["outputs"]),
- },
- formulation=formulation,
- ),
- suite=["lighteval"],
- hf_repo="ai-forever/MERA",
- hf_subset="ruopenbookqa",
- evaluation_splits=("train",),
- hf_avail_splits=["train"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-TASKS_TABLE.extend(
- [
- *openbook_rus_tasks,
- *openbook_ara_tasks,
- *openbook_es_tasks,
- ]
-)
-
-# ------------------------------- SciQ ------------------------------- #
-# SciQ: Science Question Answering
-# SciQ is a question-answering dataset designed to evaluate the ability of language models to answer science questions.
-# It consists of multiple-choice questions that require scientific reasoning and factual knowledge.
-
-# The Arabic version is part of the AlGhafa Arabic LLM Benchmark, a translation and adaptation of various English datasets.
-# Paper: https://aclanthology.org/2023.arabicnlp-1.21/
-sciqa_ar_task = [
- LightevalTaskConfig(
- name=f"alghafa_sciqa_{Language.ARABIC.value}_{formulation.name.lower()}",
- prompt_function=get_mcq_prompt_function(
- Language.ARABIC,
- sciqa_adapter,
- formulation=formulation,
- ),
- suite=["lighteval"],
- hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
- hf_subset="sciq_ar",
- hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff",
- hf_avail_splits=["test", "validation"],
- evaluation_splits=["test"],
- few_shots_split="validation",
- few_shots_select="sequential",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-TASKS_TABLE.extend(
- [
- *sciqa_ar_task,
- ]
-)
-
-# ------------------------------- Math Tasks ------------------------------- #
-
-# MathLogicQA is a dataset for evaluating mathematical reasoning in language models.
-# It consists of multiple-choice questions that require logical reasoning and mathematical problem-solving.
-# This Russian version is part of the MERA benchmark suite for Russian-language models.
-# MERA: https://github.com/ai-forever/MERA
-mathlogicqa_rus_tasks = [
- LightevalTaskConfig(
- name=f"mathlogic_qa_{Language.RUSSIAN.value}_{formulation.name.lower()}",
- prompt_function=get_mcq_prompt_function(
- Language.RUSSIAN,
- lambda line: {
- "question": line["inputs"]["text"],
- "choices": [line["inputs"][f"option_{i.lower()}"] for i in LETTER_INDICES[:4]],
- "gold_idx": LETTER_INDICES.index(line["outputs"]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="ai-forever/MERA",
- hf_subset="mathlogicqa",
- evaluation_splits=("train",),
- hf_avail_splits=["train"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [
- CFFormulation(),
- MCFFormulation(),
- HybridFormulation(),
- ]
-]
-
-cmath_tasks = [
- LightevalTaskConfig(
- name=f"cmath_{Language.CHINESE.value}",
- prompt_function=get_qa_prompt_function(
- Language.CHINESE,
- lambda line: {
- "question": line["question"],
- "choices": [line["golden"]],
- },
- ),
- suite=("lighteval",),
- hf_repo="weitianwen/cmath",
- hf_subset="default",
- evaluation_splits=("test",),
- few_shots_split="validation",
- generation_size=25,
- metrics=[
- MultilingualQuasiExactMatchMetric(Language.CHINESE, "full"),
- ],
- stop_sequence=("\n",),
- )
-]
-
-mgsm_tasks = [
- LightevalTaskConfig(
- name=f"mgsm_{language.value}",
- prompt_function=get_qa_prompt_function(
- language,
- lambda line: {
- "question": line["question"],
-                # A chain-of-thought is available in line["answer"], but we only score the final number
- "choices": [str(line["answer_number"])],
- },
- ),
- suite=("lighteval",),
- hf_repo="juletxara/mgsm",
- hf_subset=standardize_tag(language.value),
- evaluation_splits=("test",),
- few_shots_split="train",
- generation_size=25,
- metrics=[
- MultilingualQuasiExactMatchMetric(language, "full"),
- ],
- stop_sequence=("\n",),
- )
- for language in [
- Language.ENGLISH,
- Language.SPANISH,
- Language.FRENCH,
- Language.GERMAN,
- Language.RUSSIAN,
- Language.CHINESE,
- Language.JAPANESE,
- Language.THAI,
- Language.SWAHILI,
- Language.BENGALI,
- Language.TELUGU,
- ]
-]
-# African MGSM: MGSM for African Languages
-# From https://arxiv.org/abs/2406.03368. Human translated MGSM.
-afri_mgsm_tasks = [
- LightevalTaskConfig(
- name=f"afri_mgsm_{language.value}",
- prompt_function=get_qa_prompt_function(
- language,
- lambda line: {
- "question": line["question"],
-                # A chain-of-thought is available in line["answer"], but we only score the final number
- "choices": [str(line["answer_number"])],
- },
- ),
- suite=("lighteval",),
- hf_repo="masakhane/afrimgsm",
- hf_subset=language.value,
- evaluation_splits=("test",),
- few_shots_split="train",
- generation_size=25,
- metrics=[
- MultilingualQuasiExactMatchMetric(language, "full"),
- ],
- stop_sequence=("\n",),
- )
- for language in [
- Language.AMHARIC,
- # Language.EWE,
- Language.FRENCH,
- # Language.HAUSA,
- # Language.IGBO,
- # Language.KINYARWANDA,
- # Language.LINGALA,
- # Language.LUGANDA,
- # Language.OROMO,
- # Language.SHONA,
- # Language.SOTHO,
- Language.SWAHILI,
- # Language.TWI,
- # Language.WOLOF,
- # Language.XHOSA,
- Language.YORUBA,
- # Language.ZULU,
- ]
-]
-TASKS_TABLE.extend(
- [
- *cmath_tasks,
- *mathlogicqa_rus_tasks,
- *mgsm_tasks,
- *afri_mgsm_tasks,
- ]
-)
-
-# ------------------------------- Misc ------------------------------- #
-
-# AGIEval: Chinese AGI Evaluation suite (excluding the English subsets)
-# Uses PMI normalization
-# Paper: https://arxiv.org/abs/2304.06364
-CHINESE_AGIEVAL_SUBSET = [
- "gaokao-biology",
- "gaokao-chinese",
- "gaokao-chemistry",
- "gaokao-geography",
- "gaokao-history",
- "gaokao-mathqa",
- "gaokao-physics",
- "logiqa-zh",
- "jec-qa-kd",
- "jec-qa-ca",
-]
-
-agieval_tasks_zh = [
- LightevalTaskConfig(
- name=f"agieval_{Language.CHINESE.value}_{formulation.name.lower()}:{subset}",
- prompt_function=get_mcq_prompt_function(
- Language.CHINESE,
- partial(
- agieval_adapter,
- Language.CHINESE,
- formulation,
- ),
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo=f"hails/agieval-{subset}",
- hf_subset="default",
- evaluation_splits=("test",),
- hf_avail_splits=["test"],
- few_shots_split=None,
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
- ],
- ),
- )
- for subset in CHINESE_AGIEVAL_SUBSET
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-# C-Eval: Chinese Evaluation suite
-# Similar to MMLU but with different categories
-# Paper: https://arxiv.org/abs/2305.08322
-CEVAL_SUBSET = [
- "computer_network",
- "operating_system",
- "computer_architecture",
- "college_programming",
- "college_physics",
- "college_chemistry",
- "advanced_mathematics",
- "probability_and_statistics",
- "discrete_mathematics",
- "electrical_engineer",
- "metrology_engineer",
- "high_school_mathematics",
- "high_school_physics",
- "high_school_chemistry",
- "high_school_biology",
- "middle_school_mathematics",
- "middle_school_biology",
- "middle_school_physics",
- "middle_school_chemistry",
- "veterinary_medicine",
- "college_economics",
- "business_administration",
- "marxism",
- "mao_zedong_thought",
- "education_science",
- "teacher_qualification",
- "high_school_politics",
- "high_school_geography",
- "middle_school_politics",
- "middle_school_geography",
- "modern_chinese_history",
- "ideological_and_moral_cultivation",
- "logic",
- "law",
- "chinese_language_and_literature",
- "art_studies",
- "professional_tour_guide",
- "legal_professional",
- "high_school_chinese",
- "high_school_history",
- "middle_school_history",
- "civil_servant",
- "sports_science",
- "plant_protection",
- "basic_medicine",
- "clinical_medicine",
- "urban_and_rural_planner",
- "accountant",
- "fire_engineer",
- "environmental_impact_assessment_engineer",
- "tax_accountant",
- "physician",
-]
-
-ceval_tasks = [
- LightevalTaskConfig(
- name=f"ceval_{Language.CHINESE.value}_{formulation.name.lower()}:{subset}",
- prompt_function=get_mcq_prompt_function(
- Language.CHINESE,
- partial(
- ceval_adapter,
- Language.CHINESE,
- formulation,
- ),
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="ceval/ceval-exam",
- hf_subset=subset,
- evaluation_splits=("val",),
- few_shots_split="dev",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for subset in CEVAL_SUBSET
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-
-# OAB Exams: A collection of questions from the Brazilian Bar Association exam
-# The exam is required for anyone who wants to practice law in Brazil
-# Dataset: https://huggingface.co/datasets/eduagarcia/oab_exams
-oab_exams_tasks = [
- LightevalTaskConfig(
- name=f"oab_exams_{Language.PORTUGUESE.value}_{formulation.name.lower()}",
- prompt_function=get_mcq_prompt_function(
- Language.PORTUGUESE,
- lambda line: {
- "question": line["question"],
- "choices": line["choices"]["text"],
- "gold_idx": LETTER_INDICES.index(line["answerKey"]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="eduagarcia/oab_exams",
- hf_subset="default",
- evaluation_splits=("train",),
- hf_avail_splits=["train"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-# ENEM (Exame Nacional do Ensino Médio) is a standardized Brazilian national secondary
-# education examination. The exam is used both as a university admission test and as a
-# high school evaluation test.
-# Dataset: https://huggingface.co/datasets/maritaca-ai/enem
-enem_tasks = [
- LightevalTaskConfig(
- name=f"enem_{Language.PORTUGUESE.value}_{formulation.name.lower()}:{year}",
- prompt_function=get_mcq_prompt_function(
- Language.PORTUGUESE,
- partial(
- enem_adapter,
- Language.PORTUGUESE,
- ),
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="maritaca-ai/enem",
- hf_subset=year,
- evaluation_splits=("train",),
- hf_avail_splits=["train"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for year in ["2022", "2023", "2024"]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-
-# WorldTree is a dataset for multi-hop inference in science question answering.
-# It provides explanations for elementary science questions by combining facts from a semi-structured knowledge base.
-# This Russian version is part of the MERA benchmark suite for Russian-language models.
-# MERA: https://github.com/ai-forever/MERA
-worldtree_rus_tasks = [
- LightevalTaskConfig(
- name=f"mera_worldtree_{Language.RUSSIAN.value}_{formulation.name.lower()}",
- prompt_function=get_mcq_prompt_function(
- Language.RUSSIAN,
- lambda line: {
- "question": line["inputs"]["question"],
- "choices": [line["inputs"][f"option_{i.lower()}"] for i in LETTER_INDICES[:4]],
- "gold_idx": LETTER_INDICES.index(line["outputs"]),
- },
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="ai-forever/MERA",
- hf_subset="ruworldtree",
- evaluation_splits=("train",),
- hf_avail_splits=["train"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-TASKS_TABLE.extend(
- [
- *agieval_tasks_zh,
- *worldtree_rus_tasks,
- *ceval_tasks,
- *oab_exams_tasks,
- *enem_tasks,
- ]
-)
-
-
-# ------------------------------- Continuation Tasks ------------------------------- #
-xcodah_tasks = [
- LightevalTaskConfig(
- name=f"xcodah_{language.value}_{formulation.name.lower()}",
- prompt_function=get_mcq_prompt_function(language, partial(xcodah_adapter, language), formulation=formulation),
- suite=("lighteval",),
- hf_repo="INK-USC/xcsr",
- hf_subset=f"X-CODAH-{standardize_tag(language.value) if language != Language.JAPANESE else 'jap'}",
- evaluation_splits=("validation",),
- hf_avail_splits=["validation"],
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for language in [
- Language.ARABIC,
- Language.GERMAN,
- Language.ENGLISH,
- Language.SPANISH,
- Language.FRENCH,
- Language.HINDI,
- Language.ITALIAN,
- Language.JAPANESE,
- Language.DUTCH,
- Language.POLISH,
- Language.PORTUGUESE,
- Language.RUSSIAN,
- Language.SWAHILI,
- Language.URDU,
- Language.VIETNAMESE,
- Language.CHINESE,
- ]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-xstory_tasks = [
- LightevalTaskConfig(
- name=f"xstory_cloze_{lang.value}_{formulation.name.lower()}",
- prompt_function=get_continuation_prompt_function(
- lang,
- partial(
- lambda lang, line: {
- "context": TRANSLATION_LITERALS[lang].sentence_space.join(
- [
- line["input_sentence_1"],
- line["input_sentence_2"],
- line["input_sentence_3"],
- line["input_sentence_4"],
- ]
- ),
- "continuations": [line["sentence_quiz1"], line["sentence_quiz2"]],
- "gold_idx": int(line["answer_right_ending"]) - 1, # type: ignore
- },
- lang,
- ),
- formulation=formulation,
- ),
- suite=("lighteval",),
- hf_repo="juletxara/xstory_cloze",
- hf_subset=standardize_tag(lang.value),
- evaluation_splits=["eval"],
- few_shots_split="train",
- metrics=get_metrics_for_formulation(
- formulation,
- [
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- ),
- )
- for lang in [
- Language.RUSSIAN,
- Language.CHINESE,
- Language.SPANISH,
- Language.ARABIC,
- Language.HINDI,
- Language.INDONESIAN,
- Language.TELUGU,
- Language.SWAHILI,
- Language.BASQUE,
- Language.BURMESE,
- ]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-TASKS_TABLE.extend(
- [
- *xcodah_tasks,
- *xstory_tasks,
- ]
-)
-
-# ------------------------------- Winogrande Tasks ------------------------------- #
-
-xwinograd_tasks = [
- LightevalTaskConfig(
- name=f"xwinograd_{language.value}_{formulation.name.lower()}",
- suite=("lighteval",),
- prompt_function=get_continuation_prompt_function(
- language, partial(winogrand_adapter, language), formulation=formulation
- ),
- hf_repo="Muennighoff/xwinograd",
- hf_subset=standardize_tag(language.value) if language != Language.JAPANESE else "jp",
- evaluation_splits=("test",),
- hf_avail_splits=["test"],
- metrics=[
- LogLikelihoodAccMetric(normalization=None),
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- )
- for language in [
- Language.ENGLISH,
- Language.FRENCH,
- Language.JAPANESE,
- Language.PORTUGUESE,
- Language.RUSSIAN,
- Language.CHINESE,
- ]
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-winograd_turkish_task = [
- LightevalTaskConfig(
- name=f"community_xwinograd_{Language.TURKISH.value}_{formulation.name.lower()}",
- suite=("lighteval",),
- prompt_function=get_continuation_prompt_function(
- Language.TURKISH, partial(winogrand_adapter, Language.TURKISH), formulation=formulation
- ),
- hf_repo="malhajar/winogrande-tr-v0.2",
- hf_subset="default",
- evaluation_splits=("validation",),
- few_shots_split="train",
- metrics=[
- LogLikelihoodAccMetric(normalization=None),
- LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
- LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
- ],
- )
- for formulation in [
- MCFFormulation(),
- CFFormulation(),
- HybridFormulation(),
- ]
-]
-
-TASKS_TABLE.extend(
- [
- *xwinograd_tasks,
- *winograd_turkish_task,
- ]
-)
-
-# ------------------------------- General QA tasks ------------------------------- #
-
-MKQA_TASK_TO_ID = {
- "entity": 0,
- "long_answer": 1,
- # "unanswerable": 2,
- "date": 3,
- "number": 4,
- "number_with_unit": 5,
- "short_phrase": 6,
- "binary": 7,
-}
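-# The ids above are MKQA's answer-type codes. Free-form answer types (entity, long_answer,
-# short_phrase) are scored with prefix exact match + F1; the categorical types use full exact match.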
-
-mkqa_tasks = [
- LightevalTaskConfig(
- name=f"mkqa_{language.value}:{subset}",
- prompt_function=get_qa_prompt_function(language, partial(get_mkqa_adapter, language)),
- suite=("lighteval",),
- hf_repo="apple/mkqa",
- hf_subset="mkqa",
- hf_revision="325131889721ae0ed885b76ecb8011369d75abad",
- hf_filter=partial(
- lambda language, subset, line: line["answers"][
- "zh_cn" if language == Language.CHINESE else standardize_tag(language.value)
- ][0]["type"]
- == MKQA_TASK_TO_ID[subset],
- language,
- subset,
- ),
- evaluation_splits=("train",),
- hf_avail_splits=["train"],
- stop_sequence=("\n",),
- metrics=[
- MultilingualQuasiExactMatchMetric(language, "prefix"),
- MultilingualQuasiF1ScoreMetric(language),
- ]
- if subset in ["entity", "long_answer", "short_phrase"]
- else [
- MultilingualQuasiExactMatchMetric(language, "full"),
- ],
- )
- for subset in MKQA_TASK_TO_ID.keys()
- for language in [
- Language.ARABIC,
- Language.DANISH,
- Language.GERMAN,
- Language.ENGLISH,
- Language.SPANISH,
- Language.FINNISH,
- Language.FRENCH,
- Language.HEBREW,
- Language.HUNGARIAN,
- Language.ITALIAN,
- Language.JAPANESE,
- Language.KOREAN,
- Language.KHMER,
- Language.MALAY,
- Language.DUTCH,
- Language.NORWEGIAN,
- Language.POLISH,
- Language.PORTUGUESE,
- Language.RUSSIAN,
- Language.SWEDISH,
- Language.THAI,
- Language.TURKISH,
- Language.VIETNAMESE,
- Language.CHINESE, # Simplified
- # Language.CHINESE_HONG_KONG,
- # Language.CHINESE_TRADITIONAL,
- ]
-]
-
-mintaka_tasks = [
- LightevalTaskConfig(
- name=f"mintaka_{lang.value}",
- prompt_function=get_qa_prompt_function(
- lang,
- lambda line: {
- "question": line["question"],
- "choices": [line["answerText"]],
- },
- ),
- suite=("lighteval",),
- hf_repo="AmazonScience/mintaka",
- hf_subset=standardize_tag(lang.value),
- evaluation_splits=("test",),
- few_shots_split="train",
- generation_size=400,
- stop_sequence=("\n",),
- metrics=[
- MultilingualQuasiExactMatchMetric(lang, "prefix"),
- MultilingualQuasiF1ScoreMetric(lang),
- ],
- )
- for lang in [
- Language.ARABIC,
- Language.GERMAN,
- Language.ENGLISH,
- Language.SPANISH,
- Language.FRENCH,
- Language.HINDI,
- Language.ITALIAN,
- Language.JAPANESE,
- Language.PORTUGUESE,
- ]
-]
-
-french_triviqa_tasks = [
- LightevalTaskConfig(
- name=f"community_triviaqa_{Language.FRENCH.value}",
- prompt_function=get_qa_prompt_function(
- Language.FRENCH,
- lambda line: {
- "question": line["Question"],
- "choices": [line["Answer"]],
- },
- ),
- suite=("lighteval",),
- hf_repo="manu/french-trivia",
- hf_subset="default",
- evaluation_splits=("train",),
- hf_avail_splits=["train"],
- generation_size=400,
- stop_sequence=("\n",),
- metrics=[
- MultilingualQuasiExactMatchMetric(Language.FRENCH, "prefix"),
- MultilingualQuasiF1ScoreMetric(Language.FRENCH),
- ],
- )
-]
-
-
-chegeka_tasks = [
- LightevalTaskConfig(
- name=f"chegeka_{Language.RUSSIAN.value}",
- prompt_function=get_qa_prompt_function(
- Language.RUSSIAN,
- lambda line: {
- "question": line["inputs"]["text"],
- "choices": [line["outputs"]],
- },
- ),
- suite=("lighteval",),
- hf_repo="ai-forever/MERA",
- hf_subset="chegeka",
- evaluation_splits=("train",),
- hf_avail_splits=["train"],
- generation_size=400,
- stop_sequence=("\n",),
- metrics=[
- MultilingualQuasiExactMatchMetric(Language.RUSSIAN, "prefix"),
- MultilingualQuasiF1ScoreMetric(Language.RUSSIAN),
- ],
- )
-]
-
-TASKS_TABLE.extend(
- [
- *mkqa_tasks,
- *mlqa_tasks,
- *chegeka_tasks,
- *mintaka_tasks,
- *french_triviqa_tasks,
- ]
-)
-
-
-# ------------------------------- BoolQ Tasks (yes/no) ------------------------------- #
-ACVA_SUBSET = [
- "Algeria",
- "Ancient_Egypt",
- "Arab_Empire",
- "Arabic_Architecture",
- "Arabic_Art",
- "Arabic_Astronomy",
- "Arabic_Calligraphy",
- "Arabic_Ceremony",
- "Arabic_Clothing",
- "Arabic_Culture",
- "Arabic_Food",
- "Arabic_Funeral",
- "Arabic_Geography",
- "Arabic_History",
- "Arabic_Language_Origin",
- "Arabic_Literature",
- "Arabic_Math",
- "Arabic_Medicine",
- "Arabic_Music",
- "Arabic_Ornament",
- "Arabic_Philosophy",
- "Arabic_Physics_and_Chemistry",
- "Arabic_Wedding",
- "Bahrain",
- "Comoros",
- "Egypt_modern",
- "InfluenceFromAncientEgypt",
- "InfluenceFromByzantium",
- "InfluenceFromChina",
- "InfluenceFromGreece",
- "InfluenceFromIslam",
- "InfluenceFromPersia",
- "InfluenceFromRome",
- "Iraq",
- "Islam_Education",
- "Islam_branches_and_schools",
- "Islamic_law_system",
- "Jordan",
- "Kuwait",
- "Lebanon",
- "Libya",
- "Mauritania",
- "Mesopotamia_civilization",
- "Morocco",
- "Oman",
- "Palestine",
- "Qatar",
- "Saudi_Arabia",
- "Somalia",
- "Sudan",
- "Syria",
- "Tunisia",
- "United_Arab_Emirates",
- "Yemen",
- "communication",
- "computer_and_phone",
- "daily_life",
- "entertainment",
-]
-
-acva_tasks = [
- LightevalTaskConfig(
- name=f"acva_{Language.ARABIC.value}:{subset}",
- prompt_function=get_boolq_prompt_function(
- Language.ARABIC,
- lambda line: {
- "question": line["question"],
- "answer": line["answer"] == "صح",
- },
- formulation=CFFormulation(),
- ),
- suite=("lighteval",),
- hf_repo="OALL/ACVA",
- hf_subset=subset,
- evaluation_splits=("test",),
- few_shots_split="validation",
- metrics=[MultilingualQuasiExactMatchMetric(Language.ARABIC, "full"), LogLikelihoodAccMetric()],
- generation_size=5,
- stop_sequence=("\n",),
- )
- for subset in ACVA_SUBSET
-]
-
-
-french_boolq_tasks = [
- LightevalTaskConfig(
- name=f"community_boolq_{Language.FRENCH.value}",
- prompt_function=get_boolq_prompt_function(
- Language.FRENCH,
- lambda line: {
- "question": line["question"],
- "answer": line["label"] == 1,
- "context": line["passage"],
- },
- formulation=CFFormulation(),
- ),
- suite=("lighteval",),
- hf_repo="manu/french_boolq",
- hf_subset="default",
- evaluation_splits=("test",),
- few_shots_split="valid",
- generation_size=5,
- stop_sequence=["\n"],
- metrics=[MultilingualQuasiExactMatchMetric(Language.FRENCH, "full"), LogLikelihoodAccMetric()],
- )
-]
-
-hindi_boolq_tasks = [
- LightevalTaskConfig(
- name=f"community_boolq_{language.value}",
- prompt_function=get_boolq_prompt_function(
- language,
- lambda line: {
- "question": line["question"],
- "answer": line["answer"],
- "context": line["passage"],
- },
- formulation=CFFormulation(),
- ),
- suite=("lighteval",),
- hf_repo="ai4bharat/boolq-hi",
- hf_subset=standardize_tag(language.value),
- evaluation_splits=("validation",),
- few_shots_split="train",
- generation_size=5,
- stop_sequence=["\n"],
- metrics=[MultilingualQuasiExactMatchMetric(language, "full"), LogLikelihoodAccMetric()],
- )
- for language in [
- Language.HINDI,
- Language.GUJARATI,
- Language.MALAYALAM,
- Language.MARATHI,
- Language.TAMIL,
- ]
-]
-
-
-TASKS_TABLE.extend(
- [
- *acva_tasks,
- *french_boolq_tasks,
- *hindi_boolq_tasks,
- ]
-)
-
-# ------------------------------- Translation Tasks ------------------------------- #
-flores_200_languages = [
- # "ace_Arab",
- "ace_Latn",
- "acm_Arab",
- "acq_Arab",
- "aeb_Arab",
- "afr_Latn",
- "ajp_Arab",
- "aka_Latn",
- "amh_Ethi",
- "apc_Arab",
- "arb_Arab",
- # "arb_Latn",
- "ars_Arab",
- "ary_Arab",
- "arz_Arab",
- "asm_Beng",
- "ast_Latn",
- "awa_Deva",
- "ayr_Latn",
- "azb_Arab",
- "azj_Latn",
- "bak_Cyrl",
- "bam_Latn",
- "ban_Latn",
- "bel_Cyrl",
- "bem_Latn",
- "ben_Beng",
- "bho_Deva",
- # "bjn_Arab",
- "bjn_Latn",
- "bod_Tibt",
- "bos_Latn",
- "bug_Latn",
- "bul_Cyrl",
- "cat_Latn",
- "ceb_Latn",
- "ces_Latn",
- "cjk_Latn",
- "ckb_Arab",
- "crh_Latn",
- "cym_Latn",
- "dan_Latn",
- "deu_Latn",
- "dik_Latn",
- "dyu_Latn",
- "dzo_Tibt",
- "ell_Grek",
- "eng_Latn",
- "epo_Latn",
- "est_Latn",
- "eus_Latn",
- "ewe_Latn",
- "fao_Latn",
- "fij_Latn",
- "fin_Latn",
- "fon_Latn",
- "fra_Latn",
- "fur_Latn",
- "fuv_Latn",
- "gla_Latn",
- "gle_Latn",
- "glg_Latn",
- "grn_Latn",
- "guj_Gujr",
- "hat_Latn",
- "hau_Latn",
- "heb_Hebr",
- "hin_Deva",
- "hne_Deva",
- "hrv_Latn",
- "hun_Latn",
- "hye_Armn",
- "ibo_Latn",
- "ilo_Latn",
- "ind_Latn",
- "isl_Latn",
- "ita_Latn",
- "jav_Latn",
- "jpn_Jpan",
- "kab_Latn",
- "kac_Latn",
- "kam_Latn",
- "kan_Knda",
- # "kas_Arab",
- "kas_Deva",
- "kat_Geor",
- # "knc_Arab",
- "knc_Latn",
- "kaz_Cyrl",
- "kbp_Latn",
- "kea_Latn",
- "khm_Khmr",
- "kik_Latn",
- "kin_Latn",
- "kir_Cyrl",
- "kmb_Latn",
- "kmr_Latn",
- "kon_Latn",
- "kor_Hang",
- "lao_Laoo",
- "lij_Latn",
- "lim_Latn",
- "lin_Latn",
- "lit_Latn",
- "lmo_Latn",
- "ltg_Latn",
- "ltz_Latn",
- "lua_Latn",
- "lug_Latn",
- "luo_Latn",
- "lus_Latn",
- "lvs_Latn",
- "mag_Deva",
- "mai_Deva",
- "mal_Mlym",
- "mar_Deva",
- # "min_Arab",
- "min_Latn",
- "mkd_Cyrl",
- "plt_Latn",
- "mlt_Latn",
- "mni_Beng",
- "khk_Cyrl",
- "mos_Latn",
- "mri_Latn",
- "mya_Mymr",
- "nld_Latn",
- "nno_Latn",
- "nob_Latn",
- "npi_Deva",
- "nso_Latn",
- "nus_Latn",
- "nya_Latn",
- "oci_Latn",
- "gaz_Latn",
- "ory_Orya",
- "pag_Latn",
- "pan_Guru",
- "pap_Latn",
- "pes_Arab",
- "pol_Latn",
- "por_Latn",
- "prs_Arab",
- "pbt_Arab",
- "quy_Latn",
- "ron_Latn",
- "run_Latn",
- "rus_Cyrl",
- "sag_Latn",
- "san_Deva",
- "sat_Olck",
- "scn_Latn",
- "shn_Mymr",
- "sin_Sinh",
- "slk_Latn",
- "slv_Latn",
- "smo_Latn",
- "sna_Latn",
- "snd_Arab",
- "som_Latn",
- "sot_Latn",
- "spa_Latn",
- "als_Latn",
- "srd_Latn",
- "srp_Cyrl",
- "ssw_Latn",
- "sun_Latn",
- "swe_Latn",
- "swh_Latn",
- "szl_Latn",
- "tam_Taml",
- "tat_Cyrl",
- "tel_Telu",
- "tgk_Cyrl",
- "tgl_Latn",
- "tha_Thai",
- "tir_Ethi",
- "taq_Latn",
- "taq_Tfng",
- "tpi_Latn",
- "tsn_Latn",
- "tso_Latn",
- "tuk_Latn",
- "tum_Latn",
- "tur_Latn",
- "twi_Latn",
- "tzm_Tfng",
- "uig_Arab",
- "ukr_Cyrl",
- "umb_Latn",
- "urd_Arab",
- "uzn_Latn",
- "vec_Latn",
- "vie_Latn",
- "war_Latn",
- "wol_Latn",
- "xho_Latn",
- "ydd_Hebr",
- "yor_Latn",
- "yue_Hant",
- "zho_Hans",
- # "zho_Hant",
- "zsm_Latn",
- "zul_Latn",
-]
-
-
-def flores_adapter(lang1, lang2):
- return lambda line: {
- "source_text": line[f"sentence_{lang1}"],
- "target_text": line[f"sentence_{lang2}"],
- }
-
-
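-# Note: taking permutations of ~200 language codes yields tens of thousands of directed translation pairs.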
-flores200_tasks = [
- LightevalTaskConfig(
- name=f"flores200:{lang1}-{lang2}",
- prompt_function=get_translation_prompt_function(
- source_language=Language(manage_duplicate_language_codes(lang1.split("_")[0])),
- target_language=Language(manage_duplicate_language_codes(lang2.split("_")[0])),
- adapter=flores_adapter(lang1, lang2),
- formulation=CFFormulation(),
- ),
- suite=("lighteval",),
- hf_repo="facebook/flores",
- hf_subset=f"{lang1}-{lang2}",
- hf_avail_splits=["dev", "devtest"],
- evaluation_splits=["devtest"],
- few_shots_split="dev",
- few_shots_select=None,
- generation_size=300,
- metrics=[Metrics.chrf_plus, Metrics.bleu, Metrics.bleu_1, Metrics.bleu_4],
- stop_sequence=["\n"],
- version=0,
- )
- for (lang1, lang2) in permutations(flores_200_languages, 2)
-]
-
-TASKS_TABLE.extend(
- [
- *flores200_tasks,
- ]
-)
diff --git a/src/lighteval/tasks/multilingual/tasks/acva.py b/src/lighteval/tasks/multilingual/tasks/acva.py
new file mode 100644
index 000000000..14f371d32
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/acva.py
@@ -0,0 +1,115 @@
+"""
+name:
+ACVA
+
+dataset:
+OALL/ACVA
+
+abstract:
+ACVA: an Arabic true/false benchmark covering Arabic culture, history, geography, and daily life.
+
+languages:
+arabic
+
+tags:
+knowledge, multilingual, multiple-choice
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+ MultilingualQuasiExactMatchMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.boolq import get_boolq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+ACVA_SUBSET = [
+ "Algeria",
+ "Ancient_Egypt",
+ "Arab_Empire",
+ "Arabic_Architecture",
+ "Arabic_Art",
+ "Arabic_Astronomy",
+ "Arabic_Calligraphy",
+ "Arabic_Ceremony",
+ "Arabic_Clothing",
+ "Arabic_Culture",
+ "Arabic_Food",
+ "Arabic_Funeral",
+ "Arabic_Geography",
+ "Arabic_History",
+ "Arabic_Language_Origin",
+ "Arabic_Literature",
+ "Arabic_Math",
+ "Arabic_Medicine",
+ "Arabic_Music",
+ "Arabic_Ornament",
+ "Arabic_Philosophy",
+ "Arabic_Physics_and_Chemistry",
+ "Arabic_Wedding",
+ "Bahrain",
+ "Comoros",
+ "Egypt_modern",
+ "InfluenceFromAncientEgypt",
+ "InfluenceFromByzantium",
+ "InfluenceFromChina",
+ "InfluenceFromGreece",
+ "InfluenceFromIslam",
+ "InfluenceFromPersia",
+ "InfluenceFromRome",
+ "Iraq",
+ "Islam_Education",
+ "Islam_branches_and_schools",
+ "Islamic_law_system",
+ "Jordan",
+ "Kuwait",
+ "Lebanon",
+ "Libya",
+ "Mauritania",
+ "Mesopotamia_civilization",
+ "Morocco",
+ "Oman",
+ "Palestine",
+ "Qatar",
+ "Saudi_Arabia",
+ "Somalia",
+ "Sudan",
+ "Syria",
+ "Tunisia",
+ "United_Arab_Emirates",
+ "Yemen",
+ "communication",
+ "computer_and_phone",
+ "daily_life",
+ "entertainment",
+]
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"acva_{Language.ARABIC.value}:{subset}",
+ prompt_function=get_boolq_prompt_function(
+ Language.ARABIC,
+ lambda line: {
+ "question": line["question"],
+ "answer": line["answer"] == "صح",
+ },
+ formulation=CFFormulation(),
+ ),
+ suite=("lighteval",),
+ hf_repo="OALL/ACVA",
+ hf_subset=subset,
+ evaluation_splits=("test",),
+ few_shots_split="validation",
+ metrics=[MultilingualQuasiExactMatchMetric(Language.ARABIC, "full"), LogLikelihoodAccMetric()],
+ generation_size=5,
+ stop_sequence=("\n",),
+ )
+ for subset in ACVA_SUBSET
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/afri_mgsm.py b/src/lighteval/tasks/multilingual/tasks/afri_mgsm.py
new file mode 100644
index 000000000..1be96436e
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/afri_mgsm.py
@@ -0,0 +1,72 @@
+"""
+name:
+Afri Mgsm
+
+dataset:
+masakhane/afrimgsm
+
+abstract:
+AfriMGSM: human-translated MGSM (grade-school math word problems) for African languages.
+
+languages:
+amharic, ewe, french, hausa, igbo, kinyarwanda, lingala, luganda, oromo, shona,
+sotho, swahili, twi, wolof, xhosa, yoruba, zulu
+
+tags:
+math, multilingual, reasoning
+
+paper:
+https://arxiv.org/abs/2406.03368
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"afri_mgsm_{language.value}",
+ prompt_function=get_qa_prompt_function(
+ language,
+ lambda line: {
+ "question": line["question"],
+                # A chain-of-thought is available in line["answer"], but we only score the final number
+ "choices": [str(line["answer_number"])],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="masakhane/afrimgsm",
+ hf_subset=language.value,
+ evaluation_splits=("test",),
+ few_shots_split="train",
+ generation_size=25,
+ metrics=[
+ MultilingualQuasiExactMatchMetric(language, "full"),
+ ],
+ stop_sequence=("\n",),
+ )
+ for language in [
+ Language.AMHARIC,
+ # Language.EWE,
+ Language.FRENCH,
+ # Language.HAUSA,
+ # Language.IGBO,
+ # Language.KINYARWANDA,
+ # Language.LINGALA,
+ # Language.LUGANDA,
+ # Language.OROMO,
+ # Language.SHONA,
+ # Language.SOTHO,
+ Language.SWAHILI,
+ # Language.TWI,
+ # Language.WOLOF,
+ # Language.XHOSA,
+ Language.YORUBA,
+ # Language.ZULU,
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/afri_mmlu.py b/src/lighteval/tasks/multilingual/tasks/afri_mmlu.py
new file mode 100644
index 000000000..e4d21f350
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/afri_mmlu.py
@@ -0,0 +1,104 @@
+"""
+name:
+Afri Mmlu
+
+dataset:
+masakhane/afrimmlu
+
+abstract:
+African MMLU: African Massive Multitask Language Understanding
+
+languages:
+amharic, ewe, french, hausa, igbo, kinyarwanda, lingala, luganda, oromo, shona,
+sotho, swahili, twi, wolof, xhosa, yoruba, zulu
+
+tags:
+knowledge, multilingual, multiple-choice
+
+paper:
+https://arxiv.org/abs/2406.03368
+"""
+
+from functools import partial
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+AFRI_MMLU_SUBSETS = [
+ "elementary_mathematics",
+ "high_school_mathematics",
+ "high_school_geography",
+ "high_school_microeconomics",
+ "international_law",
+ "global_facts",
+]
+
+
+afri_mmlu_tasks = [
+ LightevalTaskConfig(
+ name=f"afri_mmlu_{language.value}_{formulation.name.lower()}:{subset}",
+ prompt_function=get_mcq_prompt_function(
+ language,
+ lambda line: {
+ "question": line["question"],
+ "choices": line["choices"],
+ "gold_idx": LETTER_INDICES.index(line["answer"]),
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="masakhane/afrimmlu",
+        # Temporary revision until the upstream PR is merged.
+ hf_revision="refs/pr/1",
+ hf_subset=language.value,
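+        # Bind subset eagerly via partial so each generated task filters rows for its own subject.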
+ hf_filter=partial(lambda subset, line: line["subject"] == subset, subset),
+ evaluation_splits=("test",),
+ few_shots_split="dev",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
+ ],
+ ),
+ )
+ for subset in AFRI_MMLU_SUBSETS
+ for language in [
+ Language.AMHARIC,
+ # Language.EWE,
+ Language.FRENCH,
+ # Language.HAUSA,
+ # Language.IGBO,
+ # Language.KINYARWANDA,
+ # Language.LINGALA,
+ # Language.LUGANDA,
+ # Language.OROMO,
+ # Language.SHONA,
+ # Language.SOTHO,
+ Language.SWAHILI,
+ # Language.TWI,
+ # Language.WOLOF,
+ # Language.XHOSA,
+ Language.YORUBA,
+ # Language.ZULU,
+ ]
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/afri_xnli.py b/src/lighteval/tasks/multilingual/tasks/afri_xnli.py
new file mode 100644
index 000000000..6bf3e315f
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/afri_xnli.py
@@ -0,0 +1,86 @@
+"""
+name:
+Afri Xnli
+
+dataset:
+masakhane/afrixnli
+
+abstract:
+AfriXNLI: XNLI-style natural language inference translated into African languages.
+
+languages:
+amharic, ewe, french, hausa, igbo, kinyarwanda, lingala, luganda, oromo, shona,
+sotho, swahili, twi, wolof, xhosa, yoruba, zulu
+
+tags:
+classification, multilingual, nli
+
+paper:
+https://arxiv.org/abs/2406.03368
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.nli import get_nli_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"afri_xnli_{language.value}_{formulation.name.lower()}",
+ suite=("lighteval",),
+ prompt_function=get_nli_prompt_function(
+ language=language,
+ adapter=lambda line: {
+ "premise": line["premise"],
+ "hypothesis": line["hypothesis"],
+ # Since we ignore the neutral label
+ "gold_idx": {0: 0, 2: 1}[line["label"]],
+ },
+ relations=["entailment", "contradiction"],
+ formulation=formulation,
+ ),
+ hf_repo="masakhane/afrixnli",
+ hf_subset=language.value,
+ hf_filter=lambda x: int(x["label"]) in [0, 2],
+ evaluation_splits=("test",),
+ few_shots_split="validation",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=None),
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for language in [
+ Language.AMHARIC,
+ # Language.EWE,
+ Language.FRENCH,
+ # Language.HAUSA,
+ # Language.IGBO,
+ # Language.KINYARWANDA,
+ # Language.LINGALA,
+ # Language.LUGANDA,
+ # Language.OROMO,
+ # Language.SHONA,
+ # Language.SOTHO,
+ Language.SWAHILI,
+ # Language.TWI,
+ # Language.WOLOF,
+ # Language.XHOSA,
+ Language.YORUBA,
+ # Language.ZULU,
+ ]
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
diff --git a/community_tasks/arabic_evals.py b/src/lighteval/tasks/multilingual/tasks/arabic.py
similarity index 96%
rename from community_tasks/arabic_evals.py
rename to src/lighteval/tasks/multilingual/tasks/arabic.py
index 0e917d25d..c85d2ecbd 100644
--- a/community_tasks/arabic_evals.py
+++ b/src/lighteval/tasks/multilingual/tasks/arabic.py
@@ -1,30 +1,20 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
+"""
+name:
+Arabic Evals
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
+dataset:
+MBZUAI/ArabicMMLU, MBZUAI/human_translated_arabic_mmlu, OALL/Arabic_MMLU, OALL/ACVA, asas-ai/AraTrust-categorized
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+abstract:
+Collection of benchmarks for the Arabic language.
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
+languages:
+arabic
-# ruff: noqa: F405, F403, F401
-"""
-Custom evaluation tasks for lighteval
+tags:
+knowledge, multilingual, multiple-choice
-This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
+paper:
"""
import random
diff --git a/src/lighteval/tasks/multilingual/tasks/arabic_arc.py b/src/lighteval/tasks/multilingual/tasks/arabic_arc.py
new file mode 100644
index 000000000..29d9ee9d4
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/arabic_arc.py
@@ -0,0 +1,62 @@
+"""
+name:
+Arabic Arc
+
+dataset:
+OALL/AlGhafa-Arabic-LLM-Benchmark-Translated
+
+abstract:
+Arabic ARC: the ARC-Easy reasoning benchmark translated to Arabic as part of
+the AlGhafa translated benchmark suite.
+
+languages:
+arabic
+
+tags:
+multilingual, multiple-choice, reasoning
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.adapters import (
+ alghafa_adapter,
+)
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"alghafa_arc_{Language.ARABIC.value}_{formulation.name.lower()}:easy",
+ prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation),
+ suite=["lighteval"],
+ hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
+ hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff",
+ hf_subset="arc_easy_ar",
+ evaluation_splits=["test"],
+ few_shots_split="validation",
+ few_shots_select="sequential",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/arabic_mmlu.py b/src/lighteval/tasks/multilingual/tasks/arabic_mmlu.py
new file mode 100644
index 000000000..d8031c7f6
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/arabic_mmlu.py
@@ -0,0 +1,113 @@
+"""
+name:
+Arabic Mmlu
+
+dataset:
+MBZUAI/ArabicMMLU
+
+abstract:
+ArabicMMLU: a native Arabic multiple-choice knowledge benchmark in the style of
+MMLU, built from school and university exam questions across many subjects.
+
+languages:
+arabic
+
+tags:
+knowledge, multilingual, multiple-choice
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation, normalize_subset
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+ARABIC_MMLU_SUBSETS = [
+ "Islamic Studies",
+ "Islamic Studies (Middle School)",
+ "Islamic Studies (Primary School)",
+ "Islamic Studies (High School)",
+ "Driving Test",
+ "Natural Science (Middle School)",
+ "Natural Science (Primary School)",
+ "History (Middle School)",
+ "History (Primary School)",
+ "History (High School)",
+ "General Knowledge",
+ "General Knowledge (Middle School)",
+ "General Knowledge (Primary School)",
+ "Law (Professional)",
+ "Physics (High School)",
+ "Social Science (Middle School)",
+ "Social Science (Primary School)",
+ "Management (University)",
+ "Arabic Language (Middle School)",
+ "Arabic Language (Primary School)",
+ "Arabic Language (High School)",
+ "Political Science (University)",
+ "Philosophy (High School)",
+ "Accounting (University)",
+ "Computer Science (Middle School)",
+ "Computer Science (Primary School)",
+ "Computer Science (High School)",
+ "Computer Science (University)",
+ "Geography (Middle School)",
+ "Geography (Primary School)",
+ "Geography (High School)",
+ "Math (Primary School)",
+ "Biology (High School)",
+ "Economics (Middle School)",
+ "Economics (High School)",
+ "Economics (University)",
+ "Arabic Language (General)",
+ "Arabic Language (Grammar)",
+ "Civics (Middle School)",
+ "Civics (High School)",
+]
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"mmlu_{Language.ARABIC.value}_{formulation.name.lower()}:{normalize_subset(subset)}",
+ prompt_function=get_mcq_prompt_function(
+ Language.ARABIC,
+ lambda line: {
+ "context": line["Context"],
+ "question": line["Question"],
+ "choices": [str(o) for o in [line[f"Option {i}"] for i in range(1, 6)] if o],
+ "gold_idx": LETTER_INDICES.index(line["Answer Key"]),
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="MBZUAI/ArabicMMLU",
+ hf_subset=subset,
+ evaluation_splits=("test",),
+ hf_avail_splits=["dev"],
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
+ ],
+ ),
+ )
+ for subset in ARABIC_MMLU_SUBSETS
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/arcd.py b/src/lighteval/tasks/multilingual/tasks/arcd.py
new file mode 100644
index 000000000..d1404821b
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/arcd.py
@@ -0,0 +1,57 @@
+"""
+name:
+Arcd
+
+dataset:
+hsseinmz/arcd
+
+abstract:
+ARCD: Arabic Reading Comprehension Dataset.
+
+languages:
+arabic
+
+tags:
+multilingual, multiple-choice, qa, reasoning
+
+paper:
+https://arxiv.org/pdf/1906.05394
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"arcd_{Language.ARABIC.value}",
+ prompt_function=get_qa_prompt_function(
+ Language.ARABIC,
+ lambda line: {
+ "question": line["question"],
+ "context": line["context"],
+ "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="hsseinmz/arcd",
+ hf_subset="plain_text",
+ evaluation_splits=("validation",),
+ few_shots_split="train",
+ metrics=(
+ MultilingualQuasiExactMatchMetric(Language.ARABIC, "prefix"),
+ MultilingualQuasiF1ScoreMetric(Language.ARABIC),
+ ),
+ generation_size=400,
+ stop_sequence=("\n",),
+ )
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/belebele.py b/src/lighteval/tasks/multilingual/tasks/belebele.py
new file mode 100644
index 000000000..2623e1868
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/belebele.py
@@ -0,0 +1,192 @@
+"""
+name:
+Belebele
+
+dataset:
+facebook/belebele
+
+abstract:
+Belebele: A large-scale reading comprehension dataset covering 122 languages.
+
+languages:
+arabic, armenian, bengali, cyrillic, devanagari, ethiopic, georgian, greek,
+gujarati, gurmukhi, chinese (simplified), chinese (traditional), hangul, hebrew,
+japanese, khmer, kannada, lao, latin, malayalam, myanmar, odia, sinhala, tamil,
+telugu, thai, tibetan
+
+tags:
+multilingual, multiple-choice, reading-comprehension
+
+paper:
+https://arxiv.org/abs/2308.16884
+"""
+
+from langcodes import Language as LangCodeLanguage
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import iso_639_3_ind_to_iso_639_3_macro
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"belebele_{language}_{formulation.name.lower()}",
+ prompt_function=get_mcq_prompt_function(
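+            # Belebele subsets are named "<iso639-3>_<script>"; map the individual
+            # language code to its macrolanguage to select the prompt template.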
+ iso_639_3_ind_to_iso_639_3_macro[LangCodeLanguage.get(language).to_alpha3()],
+ lambda line: {
+ "question": line["question"],
+ "context": line["flores_passage"],
+ "choices": [line[f"mc_answer{i}"] for i in range(1, 5)],
+ "gold_idx": int(line["correct_answer_num"]) - 1,
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="facebook/belebele",
+ hf_subset=language,
+ evaluation_splits=("test",),
+ hf_avail_splits=["test"],
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+ for language in [
+ "acm_Arab",
+ "arz_Arab",
+ "ceb_Latn",
+ "fin_Latn",
+ "hin_Deva",
+ "ita_Latn",
+ "khm_Khmr",
+ "lvs_Latn",
+ "npi_Deva",
+ "pol_Latn",
+ "slv_Latn",
+ "swe_Latn",
+ # "tso_Latn",
+ # "xho_Latn",
+ "afr_Latn",
+ "asm_Beng",
+ "ces_Latn",
+ "fra_Latn",
+ "hin_Latn",
+ "jav_Latn",
+ # "kin_Latn",
+ "mal_Mlym",
+ "npi_Latn",
+ "por_Latn",
+ # "sna_Latn",
+ "swh_Latn",
+ "tur_Latn",
+ "yor_Latn",
+ "als_Latn",
+ "azj_Latn",
+ "ckb_Arab",
+ # "fuv_Latn",
+ "hrv_Latn",
+ "jpn_Jpan",
+ "kir_Cyrl",
+ "mar_Deva",
+ # "nso_Latn",
+ "snd_Arab",
+ "tam_Taml",
+ "ukr_Cyrl",
+ "zho_Hans",
+ "amh_Ethi",
+ # "bam_Latn",
+ "dan_Latn",
+ # "gaz_Latn",
+ "hun_Latn",
+ # "kac_Latn",
+ "kor_Hang",
+ "mkd_Cyrl",
+ # "nya_Latn",
+ "ron_Latn",
+ "som_Latn",
+ "tel_Telu",
+ "urd_Arab",
+ "zho_Hant",
+ "apc_Arab",
+ "ben_Beng",
+ "deu_Latn",
+ # "grn_Latn",
+ "hye_Armn",
+ "kan_Knda",
+ "lao_Laoo",
+ "mlt_Latn",
+ "ory_Orya",
+ "rus_Cyrl",
+ # "sot_Latn",
+ "tgk_Cyrl",
+ "urd_Latn",
+ "zsm_Latn",
+ "arb_Arab",
+ "ben_Latn",
+ "ell_Grek",
+ "guj_Gujr",
+ # "ibo_Latn",
+ "kat_Geor",
+ # "lin_Latn",
+ # "mri_Latn",
+ "pan_Guru",
+ # "shn_Mymr",
+ "spa_Latn",
+ "tgl_Latn",
+ "uzn_Latn",
+ # "zul_Latn",
+ "arb_Latn",
+ # "bod_Tibt",
+ "eng_Latn",
+ # "hat_Latn",
+ # "ilo_Latn",
+ "kaz_Cyrl",
+ "lit_Latn",
+ "mya_Mymr",
+ "pbt_Arab",
+ "sin_Latn",
+ "srp_Cyrl",
+ "tha_Thai",
+ "vie_Latn",
+ "ars_Arab",
+ "bul_Cyrl",
+ "est_Latn",
+ # "hau_Latn",
+ "ind_Latn",
+ # "kea_Latn",
+ # "lug_Latn",
+ "nld_Latn",
+ "pes_Arab",
+ "sin_Sinh",
+ # "ssw_Latn",
+ # "tir_Ethi",
+ "war_Latn",
+ "ary_Arab",
+ "cat_Latn",
+ "eus_Latn",
+ "heb_Hebr",
+ "isl_Latn",
+ # "khk_Cyrl",
+ # "luo_Latn",
+ "nob_Latn",
+ "plt_Latn",
+ "slk_Latn",
+ # "sun_Latn",
+ # "tsn_Latn",
+ # "wol_Latn",
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/c3.py b/src/lighteval/tasks/multilingual/tasks/c3.py
new file mode 100644
index 000000000..4440b5b00
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/c3.py
@@ -0,0 +1,73 @@
+"""
+name:
+C3
+
+dataset:
+clue/clue
+
+abstract:
+C3: a multiple-choice Chinese machine reading comprehension dataset; it is the
+reading comprehension task of the CLUE benchmark.
+
+languages:
+chinese
+
+tags:
+multilingual, multiple-choice, reasoning
+
+paper:
+https://arxiv.org/abs/2004.05986
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"c3_{Language.CHINESE.value}_{formulation.name.lower()}",
+ suite=("lighteval",),
+ prompt_function=get_mcq_prompt_function(
+ Language.CHINESE,
+ lambda line: {
+ "question": line["question"],
+ "choices": line["choice"],
+ "gold_idx": line["choice"].index(line["answer"]),
+ "context": " ".join(line["context"]),
+ },
+ formulation=formulation,
+ ),
+ hf_repo="clue/clue",
+ hf_subset="c3",
+ evaluation_splits=("validation",),
+ few_shots_split="train",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/ceval.py b/src/lighteval/tasks/multilingual/tasks/ceval.py
new file mode 100644
index 000000000..c037a0df3
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/ceval.py
@@ -0,0 +1,127 @@
+"""
+name:
+Ceval
+
+dataset:
+ceval/ceval-exam
+
+abstract:
+C-Eval: a comprehensive Chinese evaluation suite of multiple-choice exam
+questions covering 52 disciplines, from middle school to professional level.
+
+languages:
+chinese
+
+tags:
+knowledge, multilingual, multiple-choice
+
+paper:
+"""
+
+from functools import partial
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.adapters import (
+ ceval_adapter,
+)
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+CEVAL_SUBSET = [
+ "computer_network",
+ "operating_system",
+ "computer_architecture",
+ "college_programming",
+ "college_physics",
+ "college_chemistry",
+ "advanced_mathematics",
+ "probability_and_statistics",
+ "discrete_mathematics",
+ "electrical_engineer",
+ "metrology_engineer",
+ "high_school_mathematics",
+ "high_school_physics",
+ "high_school_chemistry",
+ "high_school_biology",
+ "middle_school_mathematics",
+ "middle_school_biology",
+ "middle_school_physics",
+ "middle_school_chemistry",
+ "veterinary_medicine",
+ "college_economics",
+ "business_administration",
+ "marxism",
+ "mao_zedong_thought",
+ "education_science",
+ "teacher_qualification",
+ "high_school_politics",
+ "high_school_geography",
+ "middle_school_politics",
+ "middle_school_geography",
+ "modern_chinese_history",
+ "ideological_and_moral_cultivation",
+ "logic",
+ "law",
+ "chinese_language_and_literature",
+ "art_studies",
+ "professional_tour_guide",
+ "legal_professional",
+ "high_school_chinese",
+ "high_school_history",
+ "middle_school_history",
+ "civil_servant",
+ "sports_science",
+ "plant_protection",
+ "basic_medicine",
+ "clinical_medicine",
+ "urban_and_rural_planner",
+ "accountant",
+ "fire_engineer",
+ "environmental_impact_assessment_engineer",
+ "tax_accountant",
+ "physician",
+]
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"ceval_{Language.CHINESE.value}_{formulation.name.lower()}:{subset}",
+ prompt_function=get_mcq_prompt_function(
+ Language.CHINESE,
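+            # ceval_adapter also needs the language and formulation; partial binds
+            # them so only the dataset row is passed at prompt time.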
+ partial(
+ ceval_adapter,
+ Language.CHINESE,
+ formulation,
+ ),
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="ceval/ceval-exam",
+ hf_subset=subset,
+ evaluation_splits=("val",),
+ few_shots_split="dev",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for subset in CEVAL_SUBSET
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/chegeka.py b/src/lighteval/tasks/multilingual/tasks/chegeka.py
new file mode 100644
index 000000000..3b2174ab9
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/chegeka.py
@@ -0,0 +1,51 @@
+"""
+name:
+Chegeka
+
+dataset:
+ai-forever/MERA
+
+abstract:
+CheGeKa: an open-ended Russian trivia question-answering task from the MERA
+benchmark, based on the "What? Where? When?" quiz game.
+
+languages:
+russian
+
+tags:
+knowledge, multilingual, qa
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"chegeka_{Language.RUSSIAN.value}",
+ prompt_function=get_qa_prompt_function(
+ Language.RUSSIAN,
+ lambda line: {
+ "question": line["inputs"]["text"],
+ "choices": [line["outputs"]],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="ai-forever/MERA",
+ hf_subset="chegeka",
+ evaluation_splits=("train",),
+ hf_avail_splits=["train"],
+ generation_size=400,
+ stop_sequence=("\n",),
+ metrics=[
+ MultilingualQuasiExactMatchMetric(Language.RUSSIAN, "prefix"),
+ MultilingualQuasiF1ScoreMetric(Language.RUSSIAN),
+ ],
+ )
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/chinese_squad.py b/src/lighteval/tasks/multilingual/tasks/chinese_squad.py
new file mode 100644
index 000000000..521e0bc60
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/chinese_squad.py
@@ -0,0 +1,53 @@
+"""
+name:
+Chinese Squad
+
+dataset:
+lighteval/ChineseSquad
+
+abstract:
+ChineseSquad is a reading comprehension dataset for Chinese.
+
+languages:
+chinese
+
+tags:
+multilingual, qa
+
+paper:
+https://github.com/pluto-junzeng/ChineseSquad
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"chinese_squad_{Language.CHINESE.value}",
+ prompt_function=get_qa_prompt_function(
+ Language.CHINESE,
+ lambda line: {
+ "question": line["question"],
+ "context": line["context"],
+ "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="lighteval/ChineseSquad",
+ hf_subset="default",
+ evaluation_splits=("validation",),
+ few_shots_split="train",
+ metrics=(
+ MultilingualQuasiExactMatchMetric(Language.CHINESE, "prefix"),
+ MultilingualQuasiF1ScoreMetric(Language.CHINESE),
+ ),
+ generation_size=400,
+ stop_sequence=("\n",),
+ )
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/cmath.py b/src/lighteval/tasks/multilingual/tasks/cmath.py
new file mode 100644
index 000000000..f1e7d45ed
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/cmath.py
@@ -0,0 +1,49 @@
+"""
+name:
+Cmath
+
+dataset:
+weitianwen/cmath
+
+abstract:
+CMATH: Chinese elementary-school math word problems for evaluating arithmetic
+and reasoning ability.
+
+languages:
+chinese
+
+tags:
+math, multilingual, reasoning
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"cmath_{Language.CHINESE.value}",
+ prompt_function=get_qa_prompt_function(
+ Language.CHINESE,
+ lambda line: {
+ "question": line["question"],
+ "choices": [line["golden"]],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="weitianwen/cmath",
+ hf_subset="default",
+ evaluation_splits=("test",),
+ few_shots_split="validation",
+ generation_size=25,
+ metrics=[
+ MultilingualQuasiExactMatchMetric(Language.CHINESE, "full"),
+ ],
+ stop_sequence=("\n",),
+ )
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/cmmlu.py b/src/lighteval/tasks/multilingual/tasks/cmmlu.py
new file mode 100644
index 000000000..8153d7bf6
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/cmmlu.py
@@ -0,0 +1,139 @@
+"""
+name:
+Cmmlu
+
+dataset:
+haonan-li/cmmlu
+
+abstract:
+CMMLU: a native Chinese multitask knowledge benchmark in the style of MMLU,
+covering 67 subjects from elementary to professional level.
+
+languages:
+chinese
+
+tags:
+knowledge, multilingual, multiple-choice
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+CMMLU_SUBSETS = [
+ "agronomy",
+ "anatomy",
+ "ancient_chinese",
+ "arts",
+ "astronomy",
+ "business_ethics",
+ "chinese_civil_service_exam",
+ "chinese_driving_rule",
+ "chinese_food_culture",
+ "chinese_foreign_policy",
+ "chinese_history",
+ "chinese_literature",
+ "chinese_teacher_qualification",
+ "clinical_knowledge",
+ "college_actuarial_science",
+ "college_education",
+ "college_engineering_hydrology",
+ "college_law",
+ "college_mathematics",
+ "college_medical_statistics",
+ "college_medicine",
+ "computer_science",
+ "computer_security",
+ "conceptual_physics",
+ "construction_project_management",
+ "economics",
+ "education",
+ "electrical_engineering",
+ "elementary_chinese",
+ "elementary_commonsense",
+ "elementary_information_and_technology",
+ "elementary_mathematics",
+ "ethnology",
+ "food_science",
+ "genetics",
+ "global_facts",
+ "high_school_biology",
+ "high_school_chemistry",
+ "high_school_geography",
+ "high_school_mathematics",
+ "high_school_physics",
+ "high_school_politics",
+ "human_sexuality",
+ "international_law",
+ "journalism",
+ "jurisprudence",
+ "legal_and_moral_basis",
+ "logical",
+ "machine_learning",
+ "management",
+ "marketing",
+ "marxist_theory",
+ "modern_chinese",
+ "nutrition",
+ "philosophy",
+ "professional_accounting",
+ "professional_law",
+ "professional_medicine",
+ "professional_psychology",
+ "public_relations",
+ "security_study",
+ "sociology",
+ "sports_science",
+ "traditional_chinese_medicine",
+ "virology",
+ "world_history",
+ "world_religions",
+]
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"cmmlu_{Language.CHINESE.value}_{formulation.name.lower()}:{subset}",
+ prompt_function=get_mcq_prompt_function(
+ Language.CHINESE,
+ lambda line: {
+ "question": line["Question"],
+ "choices": [line["A"], line["B"], line["C"], line["D"]],
+ "gold_idx": LETTER_INDICES.index(line["Answer"]),
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="haonan-li/cmmlu",
+ hf_subset=subset,
+ evaluation_splits=("test",),
+ few_shots_split="dev",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
+ ],
+ ),
+ )
+ for subset in CMMLU_SUBSETS
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/cmnli.py b/src/lighteval/tasks/multilingual/tasks/cmnli.py
new file mode 100644
index 000000000..c8667978c
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/cmnli.py
@@ -0,0 +1,67 @@
+"""
+name:
+Cmnli
+
+dataset:
+fenffef/cmnli
+
+abstract:
+CMNLI: Chinese Multi-Genre Natural Language Inference, built following the
+MNLI approach (machine translated); part of the CLUE benchmark.
+
+languages:
+chinese
+
+tags:
+classification, multilingual, nli
+
+paper:
+https://arxiv.org/abs/2004.05986
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.nli import get_nli_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"cmnli_{Language.CHINESE.value}_{formulation.name.lower()}",
+ prompt_function=get_nli_prompt_function(
+ language=Language.CHINESE,
+ adapter=lambda line: {
+ "premise": line["sentence1"],
+ "hypothesis": line["sentence2"],
+ # Since we ignore the neutral label
+ "gold_idx": {"entailment": 0, "contradiction": 1}[line["label"]],
+ },
+ relations=["entailment", "contradiction"],
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="fenffef/cmnli",
+ hf_subset="default",
+ hf_filter=lambda x: x["label"] in ["entailment", "contradiction"],
+ # Only keep the positive and negative examples
+ evaluation_splits=("validation",),
+ few_shots_split="train",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=None),
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/cmrc2018.py b/src/lighteval/tasks/multilingual/tasks/cmrc2018.py
new file mode 100644
index 000000000..63174fd98
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/cmrc2018.py
@@ -0,0 +1,53 @@
+"""
+name:
+Cmrc2018
+
+dataset:
+clue/clue
+
+abstract:
+CMRC 2018: A span-extraction machine reading comprehension dataset for Chinese.
+
+languages:
+chinese
+
+tags:
+multilingual, qa
+
+paper:
+https://arxiv.org/abs/1810.07366
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"cmrc2018_{Language.CHINESE.value}",
+ prompt_function=get_qa_prompt_function(
+ Language.CHINESE,
+ lambda line: {
+ "question": line["question"],
+ "context": line["context"],
+ "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="clue/clue",
+ hf_subset="cmrc2018",
+ evaluation_splits=("trial",),
+ few_shots_split="train",
+ generation_size=400,
+ metrics=(
+ MultilingualQuasiExactMatchMetric(Language.CHINESE, "prefix"),
+ MultilingualQuasiF1ScoreMetric(Language.CHINESE),
+ ),
+ stop_sequence=("\n",),
+ )
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/copa_indic.py b/src/lighteval/tasks/multilingual/tasks/copa_indic.py
new file mode 100644
index 000000000..4d664647d
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/copa_indic.py
@@ -0,0 +1,93 @@
+"""
+name:
+Copa Indic
+
+dataset:
+ai4bharat/IndicCOPA
+
+abstract:
+IndicCOPA extends COPA (Choice of Plausible Alternatives) to 15 Indic
+languages, providing a valuable resource for evaluating commonsense reasoning
+in these languages.
+
+languages:
+assamese, bengali, gujarati, hindi, kannada, malayalam, marathi, nepali, oriya,
+punjabi, sanskrit, sindhi, tamil, telugu, urdu
+
+tags:
+multilingual, multiple-choice, reasoning
+
+paper:
+https://arxiv.org/pdf/2212.05409
+"""
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.copa import get_copa_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"indicxcopa_{language.value}_{formulation.name.lower()}",
+ suite=["lighteval"],
+ prompt_function=get_copa_prompt_function(
+ language,
+ adapter=lambda line: {
+ "context": line["premise"],
+ "cause_effect": line["question"],
+ "continuations": [line["choice1"], line["choice2"]],
+ "gold_idx": int(line["label"]),
+ },
+ formulation=formulation,
+ ),
+ hf_repo="ai4bharat/IndicCOPA",
+ hf_subset=f"translation-{standardize_tag(language.value)}",
+ hf_revision="d356ef19a4eb287e88a51d07a56b73ba88c7f188",
+ evaluation_splits=["test"],
+ hf_avail_splits=["test"],
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for language in [
+ Language.ASSAMESE,
+ Language.BENGALI,
+ Language.GUJARATI,
+ Language.HINDI,
+ Language.KANNADA,
+ Language.MALAYALAM,
+ Language.MARATHI,
+ Language.NEPALI,
+ Language.ORIYA,
+ Language.PUNJABI,
+ Language.SANSKRIT,
+ Language.SINDHI,
+ Language.TAMIL,
+ Language.TELUGU,
+ Language.URDU,
+ # Optionally: Maithili, Santali, Sindhi, Konkani
+ ]
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/enem.py b/src/lighteval/tasks/multilingual/tasks/enem.py
new file mode 100644
index 000000000..b852eeb4e
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/enem.py
@@ -0,0 +1,73 @@
+"""
+name:
+Enem
+
+dataset:
+maritaca-ai/enem
+
+abstract:
+ENEM (Exame Nacional do Ensino Médio) is a standardized Brazilian national
+secondary education examination. The exam is used both as a university admission
+test and as a high school evaluation test.
+
+languages:
+portuguese
+
+tags:
+knowledge, multilingual, multiple-choice
+
+paper:
+https://huggingface.co/datasets/maritaca-ai/enem
+"""
+
+from functools import partial
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.adapters import (
+ enem_adapter,
+)
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"enem_{Language.PORTUGUESE.value}_{formulation.name.lower()}:{year}",
+ prompt_function=get_mcq_prompt_function(
+ Language.PORTUGUESE,
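+            # partial binds the language so enem_adapter only receives the dataset row.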
+ partial(
+ enem_adapter,
+ Language.PORTUGUESE,
+ ),
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="maritaca-ai/enem",
+ hf_subset=year,
+ evaluation_splits=("train",),
+ hf_avail_splits=["train"],
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for year in ["2022", "2023", "2024"]
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/exams.py b/src/lighteval/tasks/multilingual/tasks/exams.py
new file mode 100644
index 000000000..69424a0ef
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/exams.py
@@ -0,0 +1,194 @@
+"""
+name:
+Exams
+
+dataset:
+mhardalov/exams
+
+abstract:
+EXAMS: a multilingual benchmark of multiple-choice questions collected from
+high-school examinations, covering 16 languages and a range of school subjects.
+
+languages:
+albanian, arabic, bulgarian, croatian, french, german, hungarian, italian,
+lithuanian, macedonian, polish, portuguese, serbian, spanish, turkish,
+vietnamese
+
+tags:
+knowledge, multilingual, multiple-choice
+
+paper:
+"""
+
+from functools import partial
+
+from langcodes import Language as LangCodeLanguage
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation, normalize_subset
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+exams_subjects_by_lang: dict[Language, set[str]] = {
+ Language.ARABIC: {"Biology", "Islamic Studies", "Physics", "Science", "Social"},
+ Language.BULGARIAN: {"Biology", "Chemistry", "Geography", "History", "Philosophy", "Physics"},
+ Language.CROATIAN: {
+ "Biology",
+ "Chemistry",
+ "Ethics",
+ "Fine Arts",
+ "Geography",
+ "Geology",
+ "History",
+ "Informatics",
+ "Philosophy",
+ "Physics",
+ "Politics",
+ "Psychology",
+ "Religion",
+ "Sociology",
+ },
+ Language.HUNGARIAN: {
+ "Agriculture",
+ "Agriculture (Mechanical knowledge)",
+ "Biology",
+ "Chemistry",
+ "Economics",
+ "Economics & Marketing",
+ "Economics Basics (Business)",
+ "Economics Basics (Theoretical)",
+ "Forestry",
+ "Geography",
+ "Landscaping",
+ "Physics",
+ "Politics",
+ "Tourism",
+ },
+ Language.ITALIAN: {
+ "Biology",
+ "Chemistry",
+ "Ethics",
+ "Geography",
+ "Geology",
+ "History",
+ "Informatics",
+ "Philosophy",
+ "Physics",
+ "Politics",
+ "Psychology",
+ "Sociology",
+ },
+ Language.SERBIAN: {
+ "Biology",
+ "Chemistry",
+ "Ethics",
+ "Geography",
+ "Geology",
+ "History",
+ "Informatics",
+ "Philosophy",
+ "Physics",
+ "Politics",
+ "Psychology",
+ "Religion",
+ "Sociology",
+ },
+ Language.FRENCH: {"Economics", "Economics & Marketing", "Economics Basics (Theoretical)", "Geography", "Physics"},
+ Language.GERMAN: {
+ "Chemistry",
+ "Economics",
+ "Economics & Marketing",
+ "Economics Basics (Theoretical)",
+ "Geography",
+ "Physics",
+ "Tourism",
+ },
+ Language.SPANISH: {"Geography", "Physics"},
+ Language.LITHUANIAN: {"Geology", "History"},
+ Language.ALBANIAN: {
+ "Biology",
+ "Business",
+ "Chemistry",
+ "Fine Arts",
+ "History",
+ "Philosophy",
+ "Physics",
+ "Sociology",
+ },
+ Language.MACEDONIAN: {
+ "Biology",
+ "Business",
+ "Chemistry",
+ "Fine Arts",
+ "History",
+ "Philosophy",
+ "Physics",
+ "Sociology",
+ },
+ Language.TURKISH: {
+ "Biology",
+ "Business",
+ "Chemistry",
+ "Geography",
+ "History",
+ "Philosophy",
+ "Physics",
+ "Sociology",
+ },
+ Language.POLISH: {"Professional"},
+ Language.PORTUGUESE: {"Biology", "Economics", "Geology", "Philosophy"},
+ Language.VIETNAMESE: {"Biology", "Chemistry", "Citizenship", "Geography", "History", "Physics"},
+}
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"exams_{language.value}_{formulation.name.lower()}:{normalize_subset(subject)}",
+ prompt_function=get_mcq_prompt_function(
+ language,
+ lambda line: {
+ "question": line["question"]["stem"],
+ "choices": line["question"]["choices"]["text"],
+ "gold_idx": line["question"]["choices"]["label"].index(line["answerKey"]),
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="mhardalov/exams",
+ hf_subset="multilingual",
+        # Some rows have an invalid answer key ("@"); filter them out, and keep only
+        # rows matching this task's language and subject (bound eagerly via partial).
+ hf_filter=partial(
+ lambda language, subject, line: line["answerKey"] != "@"
+ and line["info"]["language"] == LangCodeLanguage(standardize_tag(language.value)).language_name()
+ and line["info"]["subject"] == subject,
+ language,
+ subject,
+ ),
+ evaluation_splits=("test",),
+ few_shots_split="train",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for language in exams_subjects_by_lang.keys()
+ for subject in exams_subjects_by_lang[language]
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/faquad.py b/src/lighteval/tasks/multilingual/tasks/faquad.py
new file mode 100644
index 000000000..cec220bd0
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/faquad.py
@@ -0,0 +1,55 @@
+"""
+name:
+Faquad
+
+dataset:
+eraldoluis/faquad
+
+abstract:
+FaQuAD: A Portuguese Reading Comprehension Dataset
+
+languages:
+portuguese
+
+tags:
+multilingual, qa
+
+paper:
+https://arxiv.org/abs/2007.15671
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"faquad_{Language.PORTUGUESE.value}",
+ prompt_function=get_qa_prompt_function(
+ Language.PORTUGUESE,
+ lambda line: {
+ "question": line["question"],
+ "context": line["context"],
+ "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="eraldoluis/faquad",
+ hf_subset="plain_text",
+ hf_revision="205ba826a2282a4a5aa9bd3651e55ee4f2da1546",
+ hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
+ evaluation_splits=("validation",),
+ few_shots_split="train",
+ metrics=(
+ MultilingualQuasiExactMatchMetric(Language.PORTUGUESE, "prefix"),
+ MultilingualQuasiF1ScoreMetric(Language.PORTUGUESE),
+ ),
+ generation_size=400,
+ stop_sequence=("\n",),
+ )
+]
diff --git a/community_tasks/filipino_evals.py b/src/lighteval/tasks/multilingual/tasks/filipino.py
similarity index 92%
rename from community_tasks/filipino_evals.py
rename to src/lighteval/tasks/multilingual/tasks/filipino.py
index 45011535e..daf29daa6 100644
--- a/community_tasks/filipino_evals.py
+++ b/src/lighteval/tasks/multilingual/tasks/filipino.py
@@ -1,31 +1,21 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
+"""
+name:
+Filipino Evals
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
+dataset:
+filbench/filbench-eval
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+abstract:
+Collection of benchmarks for Filipino language.
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
+languages:
+filipino
-# ruff: noqa: F405, F403, F401
+tags:
+knowledge, multilingual, multiple-choice
-"""
-This file contains the tasks for the Filipino language, collectively known as FilBench.
-It includes several tasks for the following categories: Cultural Knowledge, Classical NLP, Reading Comprehension, and Generation.
-For more information, please read the paper: https://github.com/filbench/filbench-eval/blob/main/filbench.pdf
+paper:
+https://github.com/filbench/filbench-eval/blob/main/filbench.pdf
Contact:
- Lester James V. Miranda
@@ -51,7 +41,6 @@
)
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.multilingual.tasks import MMLU_SUBSETS
from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
from lighteval.tasks.requests import Doc
from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
@@ -65,6 +54,66 @@
from lighteval.utils.language import Language, iso_639_3_ind_to_iso_639_3_macro
+MMLU_SUBSETS = [
+ "abstract_algebra",
+ "anatomy",
+ "astronomy",
+ "business_ethics",
+ "clinical_knowledge",
+ "college_biology",
+ "college_chemistry",
+ "college_computer_science",
+ "college_mathematics",
+ "college_medicine",
+ "college_physics",
+ "computer_security",
+ "conceptual_physics",
+ "econometrics",
+ "electrical_engineering",
+ "elementary_mathematics",
+ "formal_logic",
+ "global_facts",
+ "high_school_biology",
+ "high_school_chemistry",
+ "high_school_computer_science",
+ "high_school_european_history",
+ "high_school_geography",
+ "high_school_government_and_politics",
+ "high_school_macroeconomics",
+ "high_school_mathematics",
+ "high_school_microeconomics",
+ "high_school_physics",
+ "high_school_psychology",
+ "high_school_statistics",
+ "high_school_us_history",
+ "high_school_world_history",
+ "human_aging",
+ "human_sexuality",
+ "international_law",
+ "jurisprudence",
+ "logical_fallacies",
+ "machine_learning",
+ "management",
+ "marketing",
+ "medical_genetics",
+ "miscellaneous",
+ "moral_disputes",
+ "moral_scenarios",
+ "nutrition",
+ "philosophy",
+ "prehistory",
+ "professional_accounting",
+ "professional_law",
+ "professional_medicine",
+ "professional_psychology",
+ "public_relations",
+ "security_studies",
+ "sociology",
+ "us_foreign_policy",
+ "virology",
+ "world_religions",
+]
+
# Balita NLP
FILIPINO_BALITA_TASKS = [
LightevalTaskConfig(
@@ -150,7 +199,6 @@
few_shots_select="random",
suite=["community"],
generation_size=-1,
- trust_dataset=True,
metrics=get_metrics_for_formulation(
formulation,
[
@@ -201,7 +249,6 @@
few_shots_split="test",
few_shots_select="random",
generation_size=-1,
- trust_dataset=True,
version=0,
)
for formulation in [MCFFormulation(), HybridFormulation()]
@@ -243,14 +290,13 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
hf_subset="default",
prompt_function=filipino_dengue_pfn,
hf_repo="jcblaise/dengue_filipino",
- metrics=[Metrics.loglikelihood_acc_norm],
+ metrics=[LogLikelihoodAccMetric(normalization=LogProbTokenNorm())],
hf_avail_splits=["train", "test", "validation"],
evaluation_splits=["train"],
few_shots_split="train",
few_shots_select="random",
suite=("community",),
generation_size=-1,
- trust_dataset=True,
version=0,
)
for subset in dengue_filipino_subsets
@@ -286,7 +332,6 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
few_shots_select="random",
suite=["community"],
generation_size=-1,
- trust_dataset=True,
version=0,
)
for formulation in [MCFFormulation(), HybridFormulation()]
@@ -370,7 +415,6 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
few_shots_split="test",
few_shots_select="random",
generation_size=-1,
- trust_dataset=True,
version=0,
)
for subset in ["culturology", "history", "language", "driving_license"]
@@ -432,7 +476,6 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
],
),
- trust_dataset=True,
)
for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
]
@@ -465,7 +508,6 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
few_shots_split=None,
few_shots_select=None,
generation_size=64,
- trust_dataset=True,
version=0,
)
for language in ["fil_Latn"]
@@ -519,7 +561,6 @@ def create_sib200_task(language: Language, formulation):
few_shots_split="validation",
few_shots_select="random",
generation_size=-1,
- trust_dataset=True,
version=0,
)
@@ -575,7 +616,6 @@ def prepare_stingray_semantic_appropriateness(line: dict[str, str]) -> dict[str,
few_shots_split="test",
few_shots_select="random",
generation_size=-1,
- trust_dataset=True,
version=0,
)
for formulation in [MCFFormulation(), HybridFormulation()]
@@ -605,7 +645,6 @@ def prepare_stingray_semantic_appropriateness(line: dict[str, str]) -> dict[str,
few_shots_split="test",
few_shots_select="random",
generation_size=-1,
- trust_dataset=True,
version=0,
)
for formulation in [MCFFormulation(), HybridFormulation()]
@@ -652,7 +691,6 @@ def prepare_stingray_semantic_appropriateness(line: dict[str, str]) -> dict[str,
],
hf_avail_splits=["test"],
evaluation_splits=["test"],
- trust_dataset=True,
generation_size=64,
)
for language, meta in lang_dict.items()
@@ -685,7 +723,6 @@ def prepare_stingray_semantic_appropriateness(line: dict[str, str]) -> dict[str,
evaluation_splits=["validation"],
few_shots_split=["validation"],
few_shots_select="random",
- trust_dataset=True,
generation_size=64,
)
]
@@ -714,7 +751,6 @@ def prepare_stingray_semantic_appropriateness(line: dict[str, str]) -> dict[str,
few_shots_select="random",
suite=["community"],
generation_size=-1,
- trust_dataset=True,
metrics=get_metrics_for_formulation(
formulation,
[
@@ -758,7 +794,6 @@ def create_universalner_task(language: Language, formulation):
few_shots_select="random",
suite=["community"],
generation_size=-1,
- trust_dataset=True,
metrics=get_metrics_for_formulation(
formulation,
[
diff --git a/src/lighteval/tasks/multilingual/tasks/flores200.py b/src/lighteval/tasks/multilingual/tasks/flores200.py
new file mode 100644
index 000000000..c9d07122c
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/flores200.py
@@ -0,0 +1,271 @@
+"""
+name:
+Flores200
+
+dataset:
+facebook/flores
+
+abstract:
+FLORES-200: a many-to-many machine translation benchmark covering roughly 200
+languages; evaluated here over all ordered language pairs.
+
+languages:
+arabic, armenian, bengali, cyrillic, devanagari, ethiopic, georgian, greek,
+gujarati, gurmukhi, chinese (simplified), chinese (traditional), hangul, hebrew,
+japanese, khmer, kannada, lao, latin, malayalam, myanmar, odia, sinhala, tamil,
+telugu, thai, tibetan
+
+tags:
+multilingual, translation
+
+paper:
+"""
+
+from itertools import permutations
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.translation import get_translation_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+)
+from lighteval.utils.language import Language, manage_duplicate_language_codes
+
+
+flores_200_languages = [
+ # "ace_Arab",
+ "ace_Latn",
+ "acm_Arab",
+ "acq_Arab",
+ "aeb_Arab",
+ "afr_Latn",
+ "ajp_Arab",
+ "aka_Latn",
+ "amh_Ethi",
+ "apc_Arab",
+ "arb_Arab",
+ # "arb_Latn",
+ "ars_Arab",
+ "ary_Arab",
+ "arz_Arab",
+ "asm_Beng",
+ "ast_Latn",
+ "awa_Deva",
+ "ayr_Latn",
+ "azb_Arab",
+ "azj_Latn",
+ "bak_Cyrl",
+ "bam_Latn",
+ "ban_Latn",
+ "bel_Cyrl",
+ "bem_Latn",
+ "ben_Beng",
+ "bho_Deva",
+ # "bjn_Arab",
+ "bjn_Latn",
+ "bod_Tibt",
+ "bos_Latn",
+ "bug_Latn",
+ "bul_Cyrl",
+ "cat_Latn",
+ "ceb_Latn",
+ "ces_Latn",
+ "cjk_Latn",
+ "ckb_Arab",
+ "crh_Latn",
+ "cym_Latn",
+ "dan_Latn",
+ "deu_Latn",
+ "dik_Latn",
+ "dyu_Latn",
+ "dzo_Tibt",
+ "ell_Grek",
+ "eng_Latn",
+ "epo_Latn",
+ "est_Latn",
+ "eus_Latn",
+ "ewe_Latn",
+ "fao_Latn",
+ "fij_Latn",
+ "fin_Latn",
+ "fon_Latn",
+ "fra_Latn",
+ "fur_Latn",
+ "fuv_Latn",
+ "gla_Latn",
+ "gle_Latn",
+ "glg_Latn",
+ "grn_Latn",
+ "guj_Gujr",
+ "hat_Latn",
+ "hau_Latn",
+ "heb_Hebr",
+ "hin_Deva",
+ "hne_Deva",
+ "hrv_Latn",
+ "hun_Latn",
+ "hye_Armn",
+ "ibo_Latn",
+ "ilo_Latn",
+ "ind_Latn",
+ "isl_Latn",
+ "ita_Latn",
+ "jav_Latn",
+ "jpn_Jpan",
+ "kab_Latn",
+ "kac_Latn",
+ "kam_Latn",
+ "kan_Knda",
+ # "kas_Arab",
+ "kas_Deva",
+ "kat_Geor",
+ # "knc_Arab",
+ "knc_Latn",
+ "kaz_Cyrl",
+ "kbp_Latn",
+ "kea_Latn",
+ "khm_Khmr",
+ "kik_Latn",
+ "kin_Latn",
+ "kir_Cyrl",
+ "kmb_Latn",
+ "kmr_Latn",
+ "kon_Latn",
+ "kor_Hang",
+ "lao_Laoo",
+ "lij_Latn",
+ "lim_Latn",
+ "lin_Latn",
+ "lit_Latn",
+ "lmo_Latn",
+ "ltg_Latn",
+ "ltz_Latn",
+ "lua_Latn",
+ "lug_Latn",
+ "luo_Latn",
+ "lus_Latn",
+ "lvs_Latn",
+ "mag_Deva",
+ "mai_Deva",
+ "mal_Mlym",
+ "mar_Deva",
+ # "min_Arab",
+ "min_Latn",
+ "mkd_Cyrl",
+ "plt_Latn",
+ "mlt_Latn",
+ "mni_Beng",
+ "khk_Cyrl",
+ "mos_Latn",
+ "mri_Latn",
+ "mya_Mymr",
+ "nld_Latn",
+ "nno_Latn",
+ "nob_Latn",
+ "npi_Deva",
+ "nso_Latn",
+ "nus_Latn",
+ "nya_Latn",
+ "oci_Latn",
+ "gaz_Latn",
+ "ory_Orya",
+ "pag_Latn",
+ "pan_Guru",
+ "pap_Latn",
+ "pes_Arab",
+ "pol_Latn",
+ "por_Latn",
+ "prs_Arab",
+ "pbt_Arab",
+ "quy_Latn",
+ "ron_Latn",
+ "run_Latn",
+ "rus_Cyrl",
+ "sag_Latn",
+ "san_Deva",
+ "sat_Olck",
+ "scn_Latn",
+ "shn_Mymr",
+ "sin_Sinh",
+ "slk_Latn",
+ "slv_Latn",
+ "smo_Latn",
+ "sna_Latn",
+ "snd_Arab",
+ "som_Latn",
+ "sot_Latn",
+ "spa_Latn",
+ "als_Latn",
+ "srd_Latn",
+ "srp_Cyrl",
+ "ssw_Latn",
+ "sun_Latn",
+ "swe_Latn",
+ "swh_Latn",
+ "szl_Latn",
+ "tam_Taml",
+ "tat_Cyrl",
+ "tel_Telu",
+ "tgk_Cyrl",
+ "tgl_Latn",
+ "tha_Thai",
+ "tir_Ethi",
+ "taq_Latn",
+ "taq_Tfng",
+ "tpi_Latn",
+ "tsn_Latn",
+ "tso_Latn",
+ "tuk_Latn",
+ "tum_Latn",
+ "tur_Latn",
+ "twi_Latn",
+ "tzm_Tfng",
+ "uig_Arab",
+ "ukr_Cyrl",
+ "umb_Latn",
+ "urd_Arab",
+ "uzn_Latn",
+ "vec_Latn",
+ "vie_Latn",
+ "war_Latn",
+ "wol_Latn",
+ "xho_Latn",
+ "ydd_Hebr",
+ "yor_Latn",
+ "yue_Hant",
+ "zho_Hans",
+ # "zho_Hant",
+ "zsm_Latn",
+ "zul_Latn",
+]
+
+
+def flores_adapter(lang1, lang2):
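+    # Build a per-pair adapter that maps the FLORES sentence columns to source/target text.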
+ return lambda line: {
+ "source_text": line[f"sentence_{lang1}"],
+ "target_text": line[f"sentence_{lang2}"],
+ }
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"flores200:{lang1}-{lang2}",
+ prompt_function=get_translation_prompt_function(
+ source_language=Language(manage_duplicate_language_codes(lang1.split("_")[0])),
+ target_language=Language(manage_duplicate_language_codes(lang2.split("_")[0])),
+ adapter=flores_adapter(lang1, lang2),
+ formulation=CFFormulation(),
+ ),
+ suite=("lighteval",),
+ hf_repo="facebook/flores",
+ hf_subset=f"{lang1}-{lang2}",
+ hf_avail_splits=["dev", "devtest"],
+ evaluation_splits=["devtest"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=300,
+ metrics=[Metrics.chrf_plus, Metrics.bleu, Metrics.bleu_1, Metrics.bleu_4],
+ stop_sequence=["\n"],
+ version=0,
+ )
+ for (lang1, lang2) in permutations(flores_200_languages, 2)
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/fquad_v2.py b/src/lighteval/tasks/multilingual/tasks/fquad_v2.py
new file mode 100644
index 000000000..b7f177a32
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/fquad_v2.py
@@ -0,0 +1,53 @@
+"""
+name:
+Fquad V2
+
+dataset:
+manu/fquad2_test
+
+abstract:
+FQuAD v2: French Question Answering Dataset version 2.
+
+languages:
+french
+
+tags:
+multilingual, qa
+
+paper:
+https://arxiv.org/abs/2002.06071
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"fquadv2_{Language.FRENCH.value}",
+ prompt_function=get_qa_prompt_function(
+ Language.FRENCH,
+ lambda line: {
+ "question": line["question"],
+ "context": line["context"],
+ "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="manu/fquad2_test",
+ hf_subset="default",
+ evaluation_splits=("test_hasAns",),
+ few_shots_split="valid_hasAns",
+ generation_size=400,
+ stop_sequence=("\n",),
+ metrics=(
+ MultilingualQuasiExactMatchMetric(Language.FRENCH, "prefix"),
+ MultilingualQuasiF1ScoreMetric(Language.FRENCH),
+ ),
+ )
+]
diff --git a/community_tasks/french_evals.py b/src/lighteval/tasks/multilingual/tasks/french.py
similarity index 72%
rename from community_tasks/french_evals.py
rename to src/lighteval/tasks/multilingual/tasks/french.py
index 8e0480aac..12cf3d928 100644
--- a/community_tasks/french_evals.py
+++ b/src/lighteval/tasks/multilingual/tasks/french.py
@@ -1,33 +1,21 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
+"""
+name:
+French Evals
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+dataset:
+fr-gouv-coordination-ia/IFEval-fr, fr-gouv-coordination-ia/gpqa-fr, fr-gouv-coordination-ia/bac-fr
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
+abstract:
+Collection of benchmarks for the French language.
-# ruff: noqa: F405, F403, F401
-"""
-Custom evaluation tasks for lighteval.
+languages:
+french
-This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
+tags:
+knowledge, multiple-choice, qa
-This module implements tasks for the french specific datasets
-See : https://huggingface.co/fr-gouv-coordination-ia
+paper:
+https://huggingface.co/fr-gouv-coordination-ia
"""
import random
@@ -35,9 +23,9 @@
from lighteval.metrics.metrics import Metrics
from lighteval.metrics.normalizations import math_normalizer
from lighteval.tasks.default_prompts import LETTER_INDICES
-from lighteval.tasks.extended.ifeval.main import ifeval_metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
+from lighteval.tasks.tasks.ifeval.main import ifeval_metrics
from lighteval.utils.utils import as_list
diff --git a/src/lighteval/tasks/multilingual/tasks/french_boolq.py b/src/lighteval/tasks/multilingual/tasks/french_boolq.py
new file mode 100644
index 000000000..d1bd58931
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/french_boolq.py
@@ -0,0 +1,53 @@
+"""
+name:
+French Boolq
+
+dataset:
+manu/french_boolq
+
+abstract:
+French BoolQ: yes/no question answering over French passages.
+
+languages:
+french
+
+tags:
+classification, multilingual, qa
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+ MultilingualQuasiExactMatchMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.boolq import get_boolq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"community_boolq_{Language.FRENCH.value}",
+ prompt_function=get_boolq_prompt_function(
+ Language.FRENCH,
+ lambda line: {
+ "question": line["question"],
+ "answer": line["label"] == 1,
+ "context": line["passage"],
+ },
+ formulation=CFFormulation(),
+ ),
+ suite=("lighteval",),
+ hf_repo="manu/french_boolq",
+ hf_subset="default",
+ evaluation_splits=("test",),
+ few_shots_split="valid",
+ generation_size=5,
+ stop_sequence=["\n"],
+ metrics=[MultilingualQuasiExactMatchMetric(Language.FRENCH, "full"), LogLikelihoodAccMetric()],
+ )
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/french_triviqa.py b/src/lighteval/tasks/multilingual/tasks/french_triviqa.py
new file mode 100644
index 000000000..7fa335703
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/french_triviqa.py
@@ -0,0 +1,51 @@
+"""
+name:
+French TriviaQA
+
+dataset:
+manu/french-trivia
+
+abstract:
+French TriviaQA: open-ended French trivia questions with short factual answers.
+
+languages:
+french
+
+tags:
+multilingual, qa
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"community_triviaqa_{Language.FRENCH.value}",
+ prompt_function=get_qa_prompt_function(
+ Language.FRENCH,
+ lambda line: {
+ "question": line["Question"],
+ "choices": [line["Answer"]],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="manu/french-trivia",
+ hf_subset="default",
+ evaluation_splits=("train",),
+ hf_avail_splits=["train"],
+ generation_size=400,
+ stop_sequence=("\n",),
+ metrics=[
+ MultilingualQuasiExactMatchMetric(Language.FRENCH, "prefix"),
+ MultilingualQuasiF1ScoreMetric(Language.FRENCH),
+ ],
+ )
+]
diff --git a/community_tasks/german_rag_evals.py b/src/lighteval/tasks/multilingual/tasks/german_rag.py
similarity index 78%
rename from community_tasks/german_rag_evals.py
rename to src/lighteval/tasks/multilingual/tasks/german_rag.py
index 052826287..06eb398d7 100644
--- a/community_tasks/german_rag_evals.py
+++ b/src/lighteval/tasks/multilingual/tasks/german_rag.py
@@ -1,33 +1,21 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
-# Copyright (c) 2024 Philip May, Deutsche Telekom AG
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-# ruff: noqa: F405, F403, F401
"""
-Custom evaluation tasks for lighteval.
+name:
+German RAG Evals
-This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
-This module implements the 4 tasks of deutsche-telekom/Ger-RAG-eval.
-See: https://huggingface.co/datasets/deutsche-telekom/Ger-RAG-eval
+dataset:
+deutsche-telekom/Ger-RAG-eval
+
+abstract:
+Collection of benchmarks for the German language.
+
+languages:
+german
+
+tags:
+knowledge, reasoning, multiple-choice
+
+paper:
+https://huggingface.co/datasets/deutsche-telekom/Ger-RAG-eval
"""
from lighteval.metrics.metrics import Metrics
diff --git a/src/lighteval/tasks/multilingual/tasks/germanquad.py b/src/lighteval/tasks/multilingual/tasks/germanquad.py
new file mode 100644
index 000000000..895c2bedc
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/germanquad.py
@@ -0,0 +1,55 @@
+"""
+name:
+Germanquad
+
+dataset:
+deepset/germanquad
+
+abstract:
+GermanQuAD: High-quality German QA dataset with 13,722 questions.
+
+languages:
+german
+
+tags:
+multilingual, qa
+
+paper:
+https://arxiv.org/abs/2104.12741
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"germanquad_{Language.GERMAN.value}",
+ prompt_function=get_qa_prompt_function(
+ Language.GERMAN,
+ lambda line: {
+ "question": line["question"],
+ "context": line["context"],
+ "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="deepset/germanquad",
+ hf_subset="plain_text",
+ hf_revision="fff05ceaf2ffbe5b65c7e0c57e678f7b7e1a0581",
+ hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
+ evaluation_splits=("test",),
+ few_shots_split="train",
+ generation_size=400,
+ stop_sequence=("\n",),
+ metrics=(
+ MultilingualQuasiExactMatchMetric(Language.GERMAN, "prefix"),
+ MultilingualQuasiF1ScoreMetric(Language.GERMAN),
+ ),
+ )
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/global_mmlu.py b/src/lighteval/tasks/multilingual/tasks/global_mmlu.py
new file mode 100644
index 000000000..217eb25e6
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/global_mmlu.py
@@ -0,0 +1,184 @@
+"""
+name:
+Global Mmlu
+
+dataset:
+CohereForAI/Global-MMLU
+
+abstract:
+Translated MMLU using both professional and non-professional translators.
+Contains tags for cultural sensitivity.
+
+languages:
+amharic, arabic, bengali, chinese, czech, dutch, english, french, german,
+hebrew, hindi, indonesian, italian, japanese, korean, malay, norwegian, polish,
+portuguese, romanian, russian, serbian, spanish, swahili, swedish, tamil,
+telugu, thai, turkish, ukrainian, urdu, vietnamese, yoruba, zulu
+
+tags:
+knowledge, multilingual, multiple-choice
+
+paper:
+https://huggingface.co/papers/2412.03304
+"""
+
+from functools import partial
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+MMLU_SUBSETS = [
+ "abstract_algebra",
+ "anatomy",
+ "astronomy",
+ "business_ethics",
+ "clinical_knowledge",
+ "college_biology",
+ "college_chemistry",
+ "college_computer_science",
+ "college_mathematics",
+ "college_medicine",
+ "college_physics",
+ "computer_security",
+ "conceptual_physics",
+ "econometrics",
+ "electrical_engineering",
+ "elementary_mathematics",
+ "formal_logic",
+ "global_facts",
+ "high_school_biology",
+ "high_school_chemistry",
+ "high_school_computer_science",
+ "high_school_european_history",
+ "high_school_geography",
+ "high_school_government_and_politics",
+ "high_school_macroeconomics",
+ "high_school_mathematics",
+ "high_school_microeconomics",
+ "high_school_physics",
+ "high_school_psychology",
+ "high_school_statistics",
+ "high_school_us_history",
+ "high_school_world_history",
+ "human_aging",
+ "human_sexuality",
+ "international_law",
+ "jurisprudence",
+ "logical_fallacies",
+ "machine_learning",
+ "management",
+ "marketing",
+ "medical_genetics",
+ "miscellaneous",
+ "moral_disputes",
+ "moral_scenarios",
+ "nutrition",
+ "philosophy",
+ "prehistory",
+ "professional_accounting",
+ "professional_law",
+ "professional_medicine",
+ "professional_psychology",
+ "public_relations",
+ "security_studies",
+ "sociology",
+ "us_foreign_policy",
+ "virology",
+ "world_religions",
+]
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"global_mmlu_{sensitivity_label.lower()}_{language.value}_{formulation.name.lower()}:{subset}",
+ prompt_function=get_mcq_prompt_function(
+ language,
+ lambda line: {
+ "question": line["question"],
+ "choices": [line["option_a"], line["option_b"], line["option_c"], line["option_d"]],
+ "gold_idx": LETTER_INDICES.index(line["answer"]),
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="CohereForAI/Global-MMLU",
+ hf_subset=standardize_tag(language.value),
+ evaluation_splits=("test",),
+ few_shots_split="dev",
+ hf_filter=partial(
+ lambda subset, sensitivity_label, x: x["subject"].lower() == subset
+ and (
+ sensitivity_label == "ALL" or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK")
+ )
+ and all(x[f"option_{opt}"] is not None and x[f"option_{opt}"].strip() for opt in "abcd"),
+ subset,
+ sensitivity_label,
+ ),
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
+ ],
+ ),
+ )
+ for subset in MMLU_SUBSETS
+ for language in [
+ Language.AMHARIC,
+ Language.ARABIC,
+ Language.BENGALI,
+ Language.CHINESE,
+ Language.CZECH,
+ Language.GERMAN,
+ Language.ENGLISH,
+ Language.SPANISH,
+ Language.FRENCH,
+ Language.HEBREW,
+ Language.HINDI,
+ Language.INDONESIAN,
+ Language.ITALIAN,
+ Language.JAPANESE,
+ Language.KOREAN,
+ Language.MALAY,
+ Language.DUTCH,
+ Language.NORWEGIAN,
+ Language.POLISH,
+ Language.PORTUGUESE,
+ Language.ROMANIAN,
+ Language.RUSSIAN,
+ Language.SERBIAN,
+ Language.SWEDISH,
+ Language.SWAHILI,
+ Language.TAMIL,
+ Language.TELUGU,
+ Language.THAI,
+ Language.TURKISH,
+ Language.UKRAINIAN,
+ Language.URDU,
+ Language.VIETNAMESE,
+ Language.YORUBA,
+ Language.ZULU,
+ ]
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
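+    # Sensitivity labels: CA = culturally agnostic, CS = culturally sensitive,
+    # UNK = untagged rows ("-" in the dataset); "ALL" keeps every row.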
+ for sensitivity_label in ["ALL", "CA", "CS", "UNK"]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/hellaswag_hin.py b/src/lighteval/tasks/multilingual/tasks/hellaswag_hin.py
new file mode 100644
index 000000000..ad3db12de
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/hellaswag_hin.py
@@ -0,0 +1,62 @@
+"""
+name:
+Hellaswag Hin
+
+dataset:
+ai4bharat/hellaswag-hi
+
+abstract:
+Hindi adaptation of the HellaSwag commonsense sentence-completion benchmark.
+
+languages:
+hindi
+
+tags:
+multilingual, multiple-choice, reasoning
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"community_hellaswag_{Language.HINDI.value}_{formulation.name.lower()}",
+ suite=["lighteval"],
+ prompt_function=get_hellaswag_prompt_function(
+ language=Language.HINDI,
+ adapter=lambda line: {
+ "ctx_a": line["ctx_a"],
+ "continuations": line["endings"],
+ "gold_idx": int(line["label"]),
+ },
+ formulation=formulation,
+ ),
+ hf_repo="ai4bharat/hellaswag-hi",
+ hf_filter=lambda line: all(len(choice.strip()) > 0 for choice in line["endings"]),
+ hf_subset="hi",
+ evaluation_splits=("validation",),
+ few_shots_split="validation",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ ],
+ ),
+ )
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/hellaswag_tel.py b/src/lighteval/tasks/multilingual/tasks/hellaswag_tel.py
new file mode 100644
index 000000000..127329160
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/hellaswag_tel.py
@@ -0,0 +1,61 @@
+"""
+name:
+Hellaswag Tel
+
+dataset:
+LightFury9/hellaswag-telugu
+
+abstract:
+Telugu adaptation of the HellaSwag commonsense sentence-completion benchmark.
+
+languages:
+telugu
+
+tags:
+multilingual, multiple-choice, reasoning
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"community_hellaswag_{Language.TELUGU.value}_{formulation.name.lower()}",
+ suite=["lighteval"],
+ prompt_function=get_hellaswag_prompt_function(
+ language=Language.TELUGU,
+ adapter=lambda line: {
+ "ctx_a": line["ctx_a"],
+ "continuations": line["endings"],
+ "gold_idx": int(line["label"]),
+ },
+ formulation=formulation,
+ ),
+ hf_repo="LightFury9/hellaswag-telugu",
+ hf_subset="default",
+ evaluation_splits=("valid",),
+ few_shots_split="train",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/hellaswag_tha.py b/src/lighteval/tasks/multilingual/tasks/hellaswag_tha.py
new file mode 100644
index 000000000..201f287bd
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/hellaswag_tha.py
@@ -0,0 +1,65 @@
+"""
+name:
+Hellaswag Tha
+
+dataset:
+lighteval/hellaswag_thai
+
+abstract:
+Thai adaptation of the HellaSwag task. As with the Turkish version, there is no
+dedicated paper, but the benchmark has proven useful for evaluating Thai
+language models on commonsense reasoning.
+
+languages:
+thai
+
+tags:
+multilingual, multiple-choice, reasoning
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"community_hellaswag_{Language.THAI.value}_{formulation.name.lower()}",
+ suite=["lighteval"],
+ prompt_function=get_hellaswag_prompt_function(
+ language=Language.THAI,
+ adapter=lambda line: {
+ "ctx_a": line["ctx_a"],
+ "ctx_b": line["ctx_b"],
+ "continuations": line["endings"],
+ "gold_idx": int(line["label"]),
+ },
+ formulation=formulation,
+ wikihow_artifacts=[" [ชื่อ]", " [ส่วนหัว]", " [ขั้นตอน]", " [header]", " [Header]"],
+ ),
+ hf_repo="lighteval/hellaswag_thai",
+ hf_subset="default",
+ evaluation_splits=["validation"],
+ few_shots_split="train",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/hellaswag_tur.py b/src/lighteval/tasks/multilingual/tasks/hellaswag_tur.py
new file mode 100644
index 000000000..84cb9bc52
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/hellaswag_tur.py
@@ -0,0 +1,68 @@
+"""
+name:
+Hellaswag Tur
+
+dataset:
+malhajar/hellaswag_tr-v0.2
+
+abstract:
+Turkish adaptation of the HellaSwag task. There is no dedicated paper for this
+version, but it works well for evaluating Turkish language models on
+commonsense reasoning. The HellaSwag adaptations are kept as separate tasks
+because they differ in several details (dataset/subset, dot replacement, etc.)
+that would make a single shared task hard to read.
+
+languages:
+turkish
+
+tags:
+multilingual, multiple-choice, reasoning
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"community_hellaswag_{Language.TURKISH.value}_{formulation.name.lower()}",
+ suite=["lighteval"],
+ prompt_function=get_hellaswag_prompt_function(
+ language=Language.TURKISH,
+ adapter=lambda line: {
+ "ctx_a": line["ctx_a"],
+ "ctx_b": line["ctx_b"],
+ "continuations": line["endings"],
+ "gold_idx": int(line["label"]),
+ },
+ formulation=formulation,
+ # https://github.com/malhajar17/lm-evaluation-harness_turkish/blob/main/lm_eval/tasks/hellaswag_tr-v0.2/utils.py
+ wikihow_artifacts=[" [title]", " [başlık]", " [adım]", " [header]"],
+ ),
+ hf_repo="malhajar/hellaswag_tr-v0.2",
+ hf_subset="default",
+ evaluation_splits=["validation"],
+ hf_avail_splits=["validation"],
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/hindi_arc.py b/src/lighteval/tasks/multilingual/tasks/hindi_arc.py
new file mode 100644
index 000000000..625a0ebd0
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/hindi_arc.py
@@ -0,0 +1,70 @@
+"""
+name:
+Hindi Arc
+
+dataset:
+ai4bharat/ai2_arc-hi
+
+abstract:
+Hindi translation of the ARC (AI2 Reasoning Challenge) multiple-choice science
+questions, covering the Easy and Challenge subsets.
+
+languages:
+hindi
+
+tags:
+multilingual, multiple-choice, reasoning
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"community_arc_{Language.HINDI.value}_{formulation.name.lower()}:{subset}",
+ prompt_function=get_mcq_prompt_function(
+ Language.HINDI,
+ lambda line: {
+ "question": line["question"],
+ "choices": line["choices"]["text"],
+ "gold_idx": int(line["answerKey"]) - 1
+ if line["answerKey"].isdigit()
+ else LETTER_INDICES.index(line["answerKey"]),
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="ai4bharat/ai2_arc-hi",
+ hf_subset=f"ARC-{subset.capitalize()}",
+ evaluation_splits=("test",),
+ few_shots_split="validation",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ]
+ + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore
+ ),
+ )
+ for subset in ["easy", "challenge"]
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/hindi_boolq.py b/src/lighteval/tasks/multilingual/tasks/hindi_boolq.py
new file mode 100644
index 000000000..2a77d0ac2
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/hindi_boolq.py
@@ -0,0 +1,62 @@
+"""
+name:
+Hindi Boolq
+
+dataset:
+ai4bharat/boolq-hi
+
+abstract:
+BoolQ-style yes/no question answering over passages, translated into several
+Indic languages.
+
+languages:
+gujarati, hindi, malayalam, marathi, tamil
+
+tags:
+classification, multilingual, qa
+
+paper:
+"""
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+ MultilingualQuasiExactMatchMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.boolq import get_boolq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"community_boolq_{language.value}",
+ prompt_function=get_boolq_prompt_function(
+ language,
+ lambda line: {
+ "question": line["question"],
+ "answer": line["answer"],
+ "context": line["passage"],
+ },
+ formulation=CFFormulation(),
+ ),
+ suite=("lighteval",),
+ hf_repo="ai4bharat/boolq-hi",
+ hf_subset=standardize_tag(language.value),
+ evaluation_splits=("validation",),
+ few_shots_split="train",
+ generation_size=5,
+ stop_sequence=["\n"],
+ metrics=[MultilingualQuasiExactMatchMetric(language, "full"), LogLikelihoodAccMetric()],
+ )
+ for language in [
+ Language.HINDI,
+ Language.GUJARATI,
+ Language.MALAYALAM,
+ Language.MARATHI,
+ Language.TAMIL,
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/indicqa.py b/src/lighteval/tasks/multilingual/tasks/indicqa.py
new file mode 100644
index 000000000..09eb297d5
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/indicqa.py
@@ -0,0 +1,71 @@
+"""
+name:
+Indicqa
+
+dataset:
+ai4bharat/IndicQA
+
+abstract:
+IndicQA: A reading comprehension dataset for 11 Indian languages.
+
+languages:
+assamese, bengali, gujarati, hindi, kannada, malayalam, marathi, oriya, punjabi,
+tamil, telugu
+
+tags:
+multilingual, qa
+
+paper:
+https://arxiv.org/abs/2407.13522
+"""
+
+from langcodes import Language as LangCodeLanguage
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"indicqa_{language.value}",
+ prompt_function=get_qa_prompt_function(
+ language,
+ lambda line: {
+ "question": line["question"],
+ "context": line["context"],
+ "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="ai4bharat/IndicQA",
+ hf_subset=f"indicqa.{LangCodeLanguage.get(language.value).language}",
+ hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
+ hf_revision="92d96092ae229950973dac3b9998f8b3a8949b0a",
+ evaluation_splits=("test",),
+ hf_avail_splits=("test",),
+ generation_size=400,
+ metrics=(
+ MultilingualQuasiExactMatchMetric(language, "prefix"),
+ MultilingualQuasiF1ScoreMetric(language),
+ ),
+ stop_sequence=("\n",),
+ )
+ for language in [
+ Language.ASSAMESE,
+ Language.BENGALI,
+ Language.GUJARATI,
+ Language.HINDI,
+ Language.KANNADA,
+ Language.MALAYALAM,
+ Language.MARATHI,
+ Language.ORIYA,
+ Language.PUNJABI,
+ Language.TAMIL,
+ Language.TELUGU,
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/kenswquad.py b/src/lighteval/tasks/multilingual/tasks/kenswquad.py
new file mode 100644
index 000000000..c90ca1c36
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/kenswquad.py
@@ -0,0 +1,53 @@
+"""
+name:
+Kenswquad
+
+dataset:
+lighteval/KenSwQuAD
+
+abstract:
+KenSwQuAD: A question answering dataset for Kenyan Swahili.
+
+languages:
+swahili
+
+tags:
+multilingual, qa
+
+paper:
+https://arxiv.org/abs/2205.02364
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"kenswquad_{Language.SWAHILI.value}",
+ prompt_function=get_qa_prompt_function(
+ Language.SWAHILI,
+ lambda line: {
+ "question": line["question"],
+ "context": line["context"],
+ "choices": [line["answer"]],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="lighteval/KenSwQuAD",
+ hf_subset="default",
+ evaluation_splits=("test",),
+ few_shots_split="validation",
+ metrics=(
+ MultilingualQuasiExactMatchMetric(Language.SWAHILI, "prefix"),
+ MultilingualQuasiF1ScoreMetric(Language.SWAHILI),
+ ),
+ generation_size=400,
+ stop_sequence=("\n",),
+ )
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/m3exams.py b/src/lighteval/tasks/multilingual/tasks/m3exams.py
new file mode 100644
index 000000000..65a03f94a
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/m3exams.py
@@ -0,0 +1,85 @@
+"""
+name:
+M3Exams
+
+dataset:
+chiayewken/m3exam
+
+abstract:
+M3Exam: a multilingual, multimodal, multilevel evaluation benchmark built from
+real official exam questions. The dataset also includes a multimodal portion,
+which this task does not support.
+
+languages:
+afrikaans, chinese, english, italian, javanese, portuguese, swahili, thai,
+vietnamese
+
+tags:
+knowledge, multilingual, multiple-choice
+
+paper:
+https://arxiv.org/abs/2306.05179
+"""
+
+from functools import partial
+
+from langcodes import Language as LangCodeLanguage
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.adapters import (
+ get_m3exam_adapter,
+)
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"m3exams_{language.value}_{formulation.name.lower()}",
+ suite=("lighteval",),
+ prompt_function=get_mcq_prompt_function(
+ language,
+ partial(get_m3exam_adapter, language),
+ formulation=formulation,
+ ),
+ hf_repo="chiayewken/m3exam",
+ hf_subset=LangCodeLanguage(standardize_tag(language.value)).language_name().lower(),
+ evaluation_splits=("test",),
+ few_shots_split="dev",
+ generation_size=-1,
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for language in [
+ Language.AFRIKAANS,
+ Language.CHINESE,
+ Language.ENGLISH,
+ Language.ITALIAN,
+ Language.JAVANESE,
+ Language.PORTUGUESE,
+ Language.SWAHILI,
+ Language.THAI,
+ Language.VIETNAMESE,
+ ]
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/mathlogicqa_rus.py b/src/lighteval/tasks/multilingual/tasks/mathlogicqa_rus.py
new file mode 100644
index 000000000..ac7652a46
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/mathlogicqa_rus.py
@@ -0,0 +1,70 @@
+"""
+name:
+Mathlogicqa Rus
+
+dataset:
+ai-forever/MERA
+
+abstract:
+MathLogicQA is a dataset for evaluating mathematical reasoning in language
+models. It consists of multiple-choice questions that require logical reasoning
+and mathematical problem-solving. This Russian version is part of the MERA
+benchmark, a Russian-language LLM evaluation suite.
+
+languages:
+russian
+
+tags:
+math, multilingual, qa, reasoning
+
+paper:
+https://github.com/ai-forever/MERA
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"mathlogic_qa_{Language.RUSSIAN.value}_{formulation.name.lower()}",
+ prompt_function=get_mcq_prompt_function(
+ Language.RUSSIAN,
+ lambda line: {
+ "question": line["inputs"]["text"],
+ "choices": [line["inputs"][f"option_{i.lower()}"] for i in LETTER_INDICES[:4]],
+ "gold_idx": LETTER_INDICES.index(line["outputs"]),
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="ai-forever/MERA",
+ hf_subset="mathlogicqa",
+ evaluation_splits=("train",),
+ hf_avail_splits=["train"],
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [
+ CFFormulation(),
+ MCFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/meta_mmlu.py b/src/lighteval/tasks/multilingual/tasks/meta_mmlu.py
new file mode 100644
index 000000000..f7a88e3f6
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/meta_mmlu.py
@@ -0,0 +1,149 @@
+"""
+name:
+Meta Mmlu
+
+dataset:
+meta-llama/Meta-Llama-3.1-8B-Instruct-evals
+
+abstract:
+Meta MMLU: a multilingual version of MMLU (machine-translated with Google
+Translate), released as part of Meta's Llama 3.1 evaluation datasets.
+
+languages:
+french, german, hindi, italian, portuguese, spanish, thai
+
+tags:
+knowledge, multilingual, multiple-choice
+
+paper:
+https://arxiv.org/abs/2407.21783
+"""
+
+from functools import partial
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+MMLU_SUBSETS = [
+ "abstract_algebra",
+ "anatomy",
+ "astronomy",
+ "business_ethics",
+ "clinical_knowledge",
+ "college_biology",
+ "college_chemistry",
+ "college_computer_science",
+ "college_mathematics",
+ "college_medicine",
+ "college_physics",
+ "computer_security",
+ "conceptual_physics",
+ "econometrics",
+ "electrical_engineering",
+ "elementary_mathematics",
+ "formal_logic",
+ "global_facts",
+ "high_school_biology",
+ "high_school_chemistry",
+ "high_school_computer_science",
+ "high_school_european_history",
+ "high_school_geography",
+ "high_school_government_and_politics",
+ "high_school_macroeconomics",
+ "high_school_mathematics",
+ "high_school_microeconomics",
+ "high_school_physics",
+ "high_school_psychology",
+ "high_school_statistics",
+ "high_school_us_history",
+ "high_school_world_history",
+ "human_aging",
+ "human_sexuality",
+ "international_law",
+ "jurisprudence",
+ "logical_fallacies",
+ "machine_learning",
+ "management",
+ "marketing",
+ "medical_genetics",
+ "miscellaneous",
+ "moral_disputes",
+ "moral_scenarios",
+ "nutrition",
+ "philosophy",
+ "prehistory",
+ "professional_accounting",
+ "professional_law",
+ "professional_medicine",
+ "professional_psychology",
+ "public_relations",
+ "security_studies",
+ "sociology",
+ "us_foreign_policy",
+ "virology",
+ "world_religions",
+]
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"meta_mmlu_{language.value}_{formulation.name.lower()}:{subset}",
+ prompt_function=get_mcq_prompt_function(
+ language,
+ lambda line: {
+ "question": line["input_question"],
+ "choices": [v for _, v in sorted(line["input_choice_list"].items(), key=lambda x: x[0])],
+ "gold_idx": LETTER_INDICES.index(line["input_correct_responses"][0]),
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="meta-llama/Meta-Llama-3.1-8B-Instruct-evals",
+ hf_subset=f"Llama-3.1-8B-Instruct-evals__multilingual_mmlu_{standardize_tag(language.value)}__details",
+ hf_filter=partial(
+ lambda language, subset, line: line["subtask_name"]
+ == f"mmlu_{standardize_tag(language.value)}_chat.{subset}",
+ language,
+ subset,
+ ),
+ evaluation_splits=("latest",),
+ hf_avail_splits=["latest"],
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
+ ],
+ ),
+ )
+ for subset in MMLU_SUBSETS
+ for language in [
+ Language.GERMAN,
+ Language.SPANISH,
+ Language.FRENCH,
+ Language.HINDI,
+ Language.ITALIAN,
+ Language.PORTUGUESE,
+ Language.THAI,
+ ]
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/mgsm.py b/src/lighteval/tasks/multilingual/tasks/mgsm.py
new file mode 100644
index 000000000..c72cf1ca7
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/mgsm.py
@@ -0,0 +1,67 @@
+"""
+name:
+Mgsm
+
+dataset:
+juletxara/mgsm
+
+abstract:
+MGSM (Multilingual Grade School Math): grade-school math word problems
+translated from GSM8K into multiple languages.
+
+languages:
+bengali, chinese, english, french, german, japanese, russian, spanish, swahili,
+telugu, thai
+
+tags:
+math, multilingual, reasoning
+
+paper:
+"""
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"mgsm_{language.value}",
+ prompt_function=get_qa_prompt_function(
+ language,
+ lambda line: {
+ "question": line["question"],
+ # The cot is available but we have no use:
+ # line["answer"]
+ "choices": [str(line["answer_number"])],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="juletxara/mgsm",
+ hf_subset=standardize_tag(language.value),
+ evaluation_splits=("test",),
+ few_shots_split="train",
+ generation_size=25,
+ metrics=[
+ MultilingualQuasiExactMatchMetric(language, "full"),
+ ],
+ stop_sequence=("\n",),
+ )
+ for language in [
+ Language.ENGLISH,
+ Language.SPANISH,
+ Language.FRENCH,
+ Language.GERMAN,
+ Language.RUSSIAN,
+ Language.CHINESE,
+ Language.JAPANESE,
+ Language.THAI,
+ Language.SWAHILI,
+ Language.BENGALI,
+ Language.TELUGU,
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/mintaka.py b/src/lighteval/tasks/multilingual/tasks/mintaka.py
new file mode 100644
index 000000000..e888a103e
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/mintaka.py
@@ -0,0 +1,64 @@
+"""
+name:
+Mintaka
+
+dataset:
+AmazonScience/mintaka
+
+abstract:
+Mintaka: a complex, natural, and multilingual open-domain question answering
+dataset from Amazon Science.
+
+languages:
+arabic, english, french, german, hindi, italian, japanese, portuguese, spanish
+
+tags:
+knowledge, multilingual, qa
+
+paper:
+"""
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"mintaka_{lang.value}",
+ prompt_function=get_qa_prompt_function(
+ lang,
+ lambda line: {
+ "question": line["question"],
+ "choices": [line["answerText"]],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="AmazonScience/mintaka",
+ hf_subset=standardize_tag(lang.value),
+ evaluation_splits=("test",),
+ few_shots_split="train",
+ generation_size=400,
+ stop_sequence=("\n",),
+ metrics=[
+ MultilingualQuasiExactMatchMetric(lang, "prefix"),
+ MultilingualQuasiF1ScoreMetric(lang),
+ ],
+ )
+ for lang in [
+ Language.ARABIC,
+ Language.GERMAN,
+ Language.ENGLISH,
+ Language.SPANISH,
+ Language.FRENCH,
+ Language.HINDI,
+ Language.ITALIAN,
+ Language.JAPANESE,
+ Language.PORTUGUESE,
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/mkqa.py b/src/lighteval/tasks/multilingual/tasks/mkqa.py
new file mode 100644
index 000000000..a4d803633
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/mkqa.py
@@ -0,0 +1,108 @@
+"""
+name:
+Mkqa
+
+dataset:
+apple/mkqa
+
+abstract:
+MKQA (Multilingual Knowledge Questions and Answers): open-domain questions
+aligned across 26 languages, each paired with typed gold answers.
+
+languages:
+arabic, chinese, chinese_hong_kong, chinese_traditional, danish, dutch, english,
+finnish, french, german, hebrew, hungarian, italian, japanese, khmer, korean,
+malay, norwegian, polish, portuguese, russian, spanish, swedish, thai, turkish,
+vietnamese
+
+tags:
+multilingual, qa
+
+paper:
+"""
+
+from functools import partial
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.adapters import (
+ get_mkqa_adapter,
+)
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
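+# Answer-type ids as encoded in the dataset's answers[...]["type"] field; used
+# by hf_filter below to select only rows of the matching answer type.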
+MKQA_TASK_TO_ID = {
+ "entity": 0,
+ "long_answer": 1,
+ # "unanswerable": 2,
+ "date": 3,
+ "number": 4,
+ "number_with_unit": 5,
+ "short_phrase": 6,
+ "binary": 7,
+}
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"mkqa_{language.value}:{subset}",
+ prompt_function=get_qa_prompt_function(language, partial(get_mkqa_adapter, language)),
+ suite=("lighteval",),
+ hf_repo="apple/mkqa",
+ hf_subset="mkqa",
+ hf_revision="325131889721ae0ed885b76ecb8011369d75abad",
+ hf_filter=partial(
+ lambda language, subset, line: line["answers"][
+ "zh_cn" if language == Language.CHINESE else standardize_tag(language.value)
+ ][0]["type"]
+ == MKQA_TASK_TO_ID[subset],
+ language,
+ subset,
+ ),
+ evaluation_splits=("train",),
+ hf_avail_splits=["train"],
+ stop_sequence=("\n",),
+ metrics=[
+ MultilingualQuasiExactMatchMetric(language, "prefix"),
+ MultilingualQuasiF1ScoreMetric(language),
+ ]
+ if subset in ["entity", "long_answer", "short_phrase"]
+ else [
+ MultilingualQuasiExactMatchMetric(language, "full"),
+ ],
+ )
+ for subset in MKQA_TASK_TO_ID.keys()
+ for language in [
+ Language.ARABIC,
+ Language.DANISH,
+ Language.GERMAN,
+ Language.ENGLISH,
+ Language.SPANISH,
+ Language.FINNISH,
+ Language.FRENCH,
+ Language.HEBREW,
+ Language.HUNGARIAN,
+ Language.ITALIAN,
+ Language.JAPANESE,
+ Language.KOREAN,
+ Language.KHMER,
+ Language.MALAY,
+ Language.DUTCH,
+ Language.NORWEGIAN,
+ Language.POLISH,
+ Language.PORTUGUESE,
+ Language.RUSSIAN,
+ Language.SWEDISH,
+ Language.THAI,
+ Language.TURKISH,
+ Language.VIETNAMESE,
+ Language.CHINESE, # Simplified
+ # Language.CHINESE_HONG_KONG,
+ # Language.CHINESE_TRADITIONAL,
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/mlmm_arc_challenge.py b/src/lighteval/tasks/multilingual/tasks/mlmm_arc_challenge.py
new file mode 100644
index 000000000..2a48c369b
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/mlmm_arc_challenge.py
@@ -0,0 +1,110 @@
+"""
+name:
+Mlmm Arc Challenge
+
+dataset:
+jon-tow/okapi_arc_challenge
+
+abstract:
+ARC (AI2 Reasoning Challenge) is a dataset for question answering that requires
+reasoning. It consists of multiple-choice science questions from 3rd to 9th
+grade exams. The dataset is split into two parts: ARC-Easy and ARC-Challenge.
+ARC-Challenge contains questions answered incorrectly by both a retrieval-based
+and a word co-occurrence baseline, while ARC-Easy contains the remainder.
+Similar to MMLU, the ARC tasks use PMI normalization by default, but only for
+the challenge set.
+
+languages:
+arabic, bengali, catalan, chinese, croatian, danish, dutch, french, german,
+hindi, hungarian, indonesian, italian, kannada, malayalam, marathi, nepali,
+romanian, russian, serbian, slovak, spanish, tamil, telugu, ukrainian,
+vietnamese
+
+tags:
+multilingual, multiple-choice, reasoning
+
+paper:
+https://github.com/nlp-uoregon/mlmm-evaluation
+"""
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"mlmm_arc_{language.value}_{formulation.name.lower()}:challenge",
+ prompt_function=get_mcq_prompt_function(
+ language,
+ lambda line: {
+ "question": line["question"],
+ "choices": line["choices"]["text"],
+ "gold_idx": int(line["answerKey"]) - 1
+ if line["answerKey"].isdigit()
+ else LETTER_INDICES.index(line["answerKey"]),
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="jon-tow/okapi_arc_challenge",
+ hf_subset=standardize_tag(language.value),
+ hf_revision="823d5d7bfaf8974a3ab52a825b6cf4903b35dbc4",
+ evaluation_splits=("test",),
+ few_shots_split="train",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
+ ],
+ ),
+ )
+ for language in [
+ Language.RUSSIAN,
+ Language.GERMAN,
+ Language.CHINESE,
+ Language.FRENCH,
+ Language.SPANISH,
+ Language.ITALIAN,
+ Language.DUTCH,
+ Language.VIETNAMESE,
+ Language.INDONESIAN,
+ Language.ARABIC,
+ Language.HUNGARIAN,
+ Language.ROMANIAN,
+ Language.DANISH,
+ Language.SLOVAK,
+ Language.UKRAINIAN,
+ Language.CATALAN,
+ Language.SERBIAN,
+ Language.CROATIAN,
+ Language.HINDI,
+ Language.BENGALI,
+ Language.TAMIL,
+ Language.NEPALI,
+ Language.MALAYALAM,
+ Language.MARATHI,
+ Language.TELUGU,
+ Language.KANNADA,
+ ]
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/mlmm_hellaswag.py b/src/lighteval/tasks/multilingual/tasks/mlmm_hellaswag.py
new file mode 100644
index 000000000..a8933a101
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/mlmm_hellaswag.py
@@ -0,0 +1,108 @@
+"""
+name:
+Mlmm Hellaswag
+
+dataset:
+jon-tow/okapi_hellaswag
+
+abstract:
+Hellaswag is a commonsense reasoning task that requires models to complete a
+given scenario with the most plausible ending. It tests the model's ability to
+understand and reason about everyday situations and human behavior.
+MLMM-HellaSwag is the multilingual adaptation of HellaSwag.
+
+languages:
+arabic, armenian, basque, bengali, catalan, chinese, croatian, danish, dutch,
+french, german, gujarati, hindi, hungarian, icelandic, indonesian, italian,
+kannada, malayalam, marathi, nepali, norwegian, portuguese, romanian, russian,
+serbian, slovak, spanish, swedish, tamil, telugu, ukrainian, vietnamese
+
+tags:
+multilingual, multiple-choice, reasoning
+
+paper:
+https://arxiv.org/abs/2306.07610
+"""
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"mlmm_hellaswag_{lang.value}_{formulation.name.lower()}",
+ suite=["lighteval"],
+ prompt_function=get_hellaswag_prompt_function(
+ language=lang,
+ adapter=lambda line: {
+ # We don't use activity_label as they are not available
+ "ctx_a": line["ctx_a"],
+ "ctx_b": line["ctx_b"],
+ "continuations": line["endings"],
+ "gold_idx": int(line["label"]),
+ },
+ formulation=formulation,
+ ),
+ hf_repo="jon-tow/okapi_hellaswag",
+ hf_subset=standardize_tag(lang.value),
+ hf_revision="96ed8e0dfc6172dad1d3df338d7b8ba6c1ff9d83",
+ evaluation_splits=["validation"],
+ hf_avail_splits=["validation"],
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for lang in [
+ Language.ARABIC,
+ Language.BENGALI,
+ Language.CATALAN,
+ Language.DANISH,
+ Language.GERMAN,
+ Language.SPANISH,
+ Language.BASQUE,
+ Language.FRENCH,
+ Language.GUJARATI,
+ Language.HINDI,
+ Language.CROATIAN,
+ Language.HUNGARIAN,
+ Language.ARMENIAN,
+ Language.INDONESIAN,
+ Language.ICELANDIC,
+ Language.ITALIAN,
+ Language.KANNADA,
+ Language.MALAYALAM,
+ Language.MARATHI,
+ Language.NORWEGIAN,
+ Language.NEPALI,
+ Language.DUTCH,
+ Language.PORTUGUESE,
+ Language.ROMANIAN,
+ Language.RUSSIAN,
+ Language.SLOVAK,
+ Language.SERBIAN,
+ Language.SWEDISH,
+ Language.TAMIL,
+ Language.TELUGU,
+ Language.UKRAINIAN,
+ Language.VIETNAMESE,
+ Language.CHINESE,
+ ]
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/mlmm_mmlu.py b/src/lighteval/tasks/multilingual/tasks/mlmm_mmlu.py
new file mode 100644
index 000000000..031cdc767
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/mlmm_mmlu.py
@@ -0,0 +1,167 @@
+"""
+name:
+Mlmm Mmlu
+
+dataset:
+jon-tow/okapi_mmlu
+
+abstract:
+MLMM MMLU: the multilingual version of MMLU from the MLMM (Okapi) evaluation
+suite.
+
+languages:
+arabic, bengali, catalan, chinese, croatian, danish, dutch, french, german,
+hindi, hungarian, indonesian, italian, kannada, malayalam, marathi, nepali,
+romanian, russian, serbian, slovak, spanish, tamil, telugu, ukrainian,
+vietnamese
+
+tags:
+knowledge, multilingual, multiple-choice
+
+paper:
+https://github.com/nlp-uoregon/mlmm-evaluation
+"""
+
+from functools import partial
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+MMLU_SUBSETS = [
+ "abstract_algebra",
+ "anatomy",
+ "astronomy",
+ "business_ethics",
+ "clinical_knowledge",
+ "college_biology",
+ "college_chemistry",
+ "college_computer_science",
+ "college_mathematics",
+ "college_medicine",
+ "college_physics",
+ "computer_security",
+ "conceptual_physics",
+ "econometrics",
+ "electrical_engineering",
+ "elementary_mathematics",
+ "formal_logic",
+ "global_facts",
+ "high_school_biology",
+ "high_school_chemistry",
+ "high_school_computer_science",
+ "high_school_european_history",
+ "high_school_geography",
+ "high_school_government_and_politics",
+ "high_school_macroeconomics",
+ "high_school_mathematics",
+ "high_school_microeconomics",
+ "high_school_physics",
+ "high_school_psychology",
+ "high_school_statistics",
+ "high_school_us_history",
+ "high_school_world_history",
+ "human_aging",
+ "human_sexuality",
+ "international_law",
+ "jurisprudence",
+ "logical_fallacies",
+ "machine_learning",
+ "management",
+ "marketing",
+ "medical_genetics",
+ "miscellaneous",
+ "moral_disputes",
+ "moral_scenarios",
+ "nutrition",
+ "philosophy",
+ "prehistory",
+ "professional_accounting",
+ "professional_law",
+ "professional_medicine",
+ "professional_psychology",
+ "public_relations",
+ "security_studies",
+ "sociology",
+ "us_foreign_policy",
+ "virology",
+ "world_religions",
+]
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"mlmm_mmlu_{language.value}_{formulation.name.lower()}:{subset}",
+ prompt_function=get_mcq_prompt_function(
+ language,
+ lambda line: {
+ "question": line["question"],
+ "choices": line["choices"],
+ "gold_idx": LETTER_INDICES.index(line["answer"]),
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="jon-tow/okapi_mmlu",
+ hf_subset=standardize_tag(language.value),
+ hf_revision="refs/pr/1",
+ hf_filter=partial(lambda subset, line: line["id"].split("/")[0] == subset, subset),
+ evaluation_splits=("test",),
+ few_shots_split="dev",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
+ ],
+ ),
+ )
+ for subset in MMLU_SUBSETS
+ for language in [
+ Language.RUSSIAN,
+ Language.GERMAN,
+ Language.CHINESE,
+ Language.FRENCH,
+ Language.SPANISH,
+ Language.ITALIAN,
+ Language.DUTCH,
+ Language.VIETNAMESE,
+ Language.INDONESIAN,
+ Language.ARABIC,
+ Language.HUNGARIAN,
+ Language.ROMANIAN,
+ Language.DANISH,
+ Language.SLOVAK,
+ Language.UKRAINIAN,
+ Language.CATALAN,
+ Language.SERBIAN,
+ Language.CROATIAN,
+ Language.HINDI,
+ Language.BENGALI,
+ Language.TAMIL,
+ Language.NEPALI,
+ Language.MALAYALAM,
+ Language.MARATHI,
+ Language.TELUGU,
+ Language.KANNADA,
+ ]
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/mlmm_truthfulqa.py b/src/lighteval/tasks/multilingual/tasks/mlmm_truthfulqa.py
new file mode 100644
index 000000000..1851693fa
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/mlmm_truthfulqa.py
@@ -0,0 +1,113 @@
+"""
+name:
+Mlmm Truthfulqa
+
+dataset:
+jon-tow/okapi_truthfulqa
+
+abstract:
+TruthfulQA: Measuring How Models Mimic Human Falsehoods
+
+languages:
+arabic, armenian, basque, bengali, catalan, chinese, croatian, danish, dutch,
+french, german, gujarati, hindi, hungarian, icelandic, indonesian, italian,
+kannada, malayalam, marathi, nepali, norwegian, portuguese, romanian, russian,
+serbian, slovak, spanish, swedish, tamil, telugu, ukrainian, vietnamese
+
+tags:
+factuality, multilingual, qa
+
+paper:
+https://arxiv.org/abs/2109.07958
+"""
+
+from functools import partial
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"mlmm_truthfulqa_{language.value}_{formulation.name.lower()}:{subset}",
+ prompt_function=get_mcq_prompt_function(
+ language,
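+ # partial() binds `subset` eagerly; each row stores its choices and labels under the f"{subset}_targets" field.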
+ partial(
+ lambda subset, line: {
+ "question": line["question"],
+ "choices": line[f"{subset}_targets"]["choices"],
+ "gold_idx": [ix for ix, label in enumerate(line[f"{subset}_targets"]["labels"]) if label == 1], # type: ignore
+ },
+ subset,
+ ),
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="jon-tow/okapi_truthfulqa",
+ hf_subset=standardize_tag(language.value),
+ hf_revision="cdd5db1a66fd04105622109d1c2a5cbc8cde7586",
+ evaluation_splits=("validation",),
+ hf_avail_splits=["validation"],
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for subset in ["mc1", "mc2"]
+ for language in [
+ Language.ARABIC,
+ Language.BENGALI,
+ Language.CATALAN,
+ Language.DANISH,
+ Language.GERMAN,
+ Language.SPANISH,
+ Language.BASQUE,
+ Language.FRENCH,
+ Language.GUJARATI,
+ Language.HINDI,
+ Language.CROATIAN,
+ Language.HUNGARIAN,
+ Language.ARMENIAN,
+ Language.INDONESIAN,
+ Language.ICELANDIC,
+ Language.ITALIAN,
+ Language.KANNADA,
+ Language.MALAYALAM,
+ Language.MARATHI,
+ Language.NORWEGIAN,
+ Language.NEPALI,
+ Language.DUTCH,
+ Language.PORTUGUESE,
+ Language.ROMANIAN,
+ Language.RUSSIAN,
+ Language.SLOVAK,
+ Language.SERBIAN,
+ Language.SWEDISH,
+ Language.TAMIL,
+ Language.TELUGU,
+ Language.UKRAINIAN,
+ Language.VIETNAMESE,
+ Language.CHINESE,
+ ]
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/mlqa.py b/src/lighteval/tasks/multilingual/tasks/mlqa.py
new file mode 100644
index 000000000..70515b678
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/mlqa.py
@@ -0,0 +1,68 @@
+"""
+name:
+Mlqa
+
+dataset:
+facebook/mlqa
+
+abstract:
+MLQA (MultiLingual Question Answering) is a benchmark dataset for evaluating
+cross-lingual question answering performance. It consists of QA instances in 7
+languages: English, Arabic, German, Spanish, Hindi, Vietnamese, and Chinese. The
+dataset is derived from the SQuAD v1.1 dataset, with questions and contexts
+translated by professional translators.
+
+languages:
+arabic, chinese, german, hindi, spanish, vietnamese
+
+tags:
+multilingual, qa
+
+paper:
+https://arxiv.org/abs/1910.07475
+"""
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"mlqa_{lang.value}",
+ prompt_function=get_qa_prompt_function(
+ lang,
+ lambda line: {
+ "context": line["context"],
+ "question": line["question"],
+ "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="facebook/mlqa",
+ hf_subset=f"mlqa.{standardize_tag(lang.value)}.{standardize_tag(lang.value)}",
+ hf_revision="397ed406c1a7902140303e7faf60fff35b58d285",
+ evaluation_splits=("test",),
+ hf_avail_splits=["test"],
+ generation_size=400,
+ stop_sequence=("\n",),
+ metrics=[
+ MultilingualQuasiExactMatchMetric(lang, "prefix"),
+ MultilingualQuasiF1ScoreMetric(lang),
+ ],
+ )
+ for lang in [
+ Language.ARABIC,
+ Language.GERMAN,
+ Language.SPANISH,
+ Language.CHINESE,
+ Language.HINDI,
+ Language.VIETNAMESE,
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/oab_exams.py b/src/lighteval/tasks/multilingual/tasks/oab_exams.py
new file mode 100644
index 000000000..88302cf53
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/oab_exams.py
@@ -0,0 +1,68 @@
+"""
+name:
+Oab Exams
+
+dataset:
+eduagarcia/oab_exams
+
+abstract:
+OAB Exams: a collection of questions from the Brazilian Bar Association (OAB)
+exam. Passing the exam is required to practice law in Brazil.
+
+languages:
+portuguese
+
+tags:
+knowledge, multilingual, multiple-choice
+
+paper:
+https://huggingface.co/datasets/eduagarcia/oab_exams
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"oab_exams_{Language.PORTUGUESE.value}_{formulation.name.lower()}",
+ prompt_function=get_mcq_prompt_function(
+ Language.PORTUGUESE,
+ lambda line: {
+ "question": line["question"],
+ "choices": line["choices"]["text"],
+ "gold_idx": LETTER_INDICES.index(line["answerKey"]),
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="eduagarcia/oab_exams",
+ hf_subset="default",
+ evaluation_splits=("train",),
+ hf_avail_splits=["train"],
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/ocnli.py b/src/lighteval/tasks/multilingual/tasks/ocnli.py
new file mode 100644
index 000000000..48a7278b1
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/ocnli.py
@@ -0,0 +1,67 @@
+"""
+name:
+Ocnli
+
+dataset:
+clue/clue
+
+abstract:
+OCNLI (Original Chinese Natural Language Inference): an NLI dataset built from
+original Chinese sources rather than translated from English corpora.
+
+languages:
+chinese
+
+tags:
+classification, multilingual, nli
+
+paper:
+https://arxiv.org/pdf/2010.05444
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.nli import get_nli_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"ocnli_{Language.CHINESE.value}_{formulation.name.lower()}",
+ prompt_function=get_nli_prompt_function(
+ language=Language.CHINESE,
+ adapter=lambda line: {
+ "premise": line["sentence1"],
+ "hypothesis": line["sentence2"],
+ # Neutral examples are filtered out below; map the remaining labels onto [entailment, contradiction]
+ "gold_idx": {1: 0, 2: 1}[line["label"]],
+ },
+ relations=["entailment", "contradiction"],
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="clue/clue",
+ hf_subset="ocnli",
+ # Only keep the positive and negative examples
+ hf_filter=lambda x: int(x["label"]) in [1, 2],
+ evaluation_splits=("validation",),
+ few_shots_split="train",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=None),
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/openai_mmlu.py b/src/lighteval/tasks/multilingual/tasks/openai_mmlu.py
new file mode 100644
index 000000000..4a4df728a
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/openai_mmlu.py
@@ -0,0 +1,150 @@
+"""
+name:
+Openai Mmlu
+
+dataset:
+openai/MMMLU
+
+abstract:
+MMMLU: OpenAI's multilingual MMLU, with the MMLU test set professionally
+translated into 14 languages.
+
+languages:
+arabic, bengali, chinese, french, german, hindi, indonesian, italian, japanese,
+korean, portuguese, spanish, swahili, yoruba
+
+tags:
+knowledge, multilingual, multiple-choice
+
+paper:
+"""
+
+from functools import partial
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+MMLU_SUBSETS = [
+ "abstract_algebra",
+ "anatomy",
+ "astronomy",
+ "business_ethics",
+ "clinical_knowledge",
+ "college_biology",
+ "college_chemistry",
+ "college_computer_science",
+ "college_mathematics",
+ "college_medicine",
+ "college_physics",
+ "computer_security",
+ "conceptual_physics",
+ "econometrics",
+ "electrical_engineering",
+ "elementary_mathematics",
+ "formal_logic",
+ "global_facts",
+ "high_school_biology",
+ "high_school_chemistry",
+ "high_school_computer_science",
+ "high_school_european_history",
+ "high_school_geography",
+ "high_school_government_and_politics",
+ "high_school_macroeconomics",
+ "high_school_mathematics",
+ "high_school_microeconomics",
+ "high_school_physics",
+ "high_school_psychology",
+ "high_school_statistics",
+ "high_school_us_history",
+ "high_school_world_history",
+ "human_aging",
+ "human_sexuality",
+ "international_law",
+ "jurisprudence",
+ "logical_fallacies",
+ "machine_learning",
+ "management",
+ "marketing",
+ "medical_genetics",
+ "miscellaneous",
+ "moral_disputes",
+ "moral_scenarios",
+ "nutrition",
+ "philosophy",
+ "prehistory",
+ "professional_accounting",
+ "professional_law",
+ "professional_medicine",
+ "professional_psychology",
+ "public_relations",
+ "security_studies",
+ "sociology",
+ "us_foreign_policy",
+ "virology",
+ "world_religions",
+]
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"openai_mmlu_{language[0].value}_{formulation.name.lower()}:{subset}",
+ prompt_function=get_mcq_prompt_function(
+ language[0],
+ lambda line: {
+ "question": line["Question"],
+ "choices": [line["A"], line["B"], line["C"], line["D"]],
+ "gold_idx": LETTER_INDICES.index(line["Answer"]),
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="openai/MMMLU",
+ hf_subset=language[1],
+ evaluation_splits=("test",),
+ hf_avail_splits=["test"],
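+ # MMMLU ships all subjects in a single split, so rows are filtered by Subject per task (partial binds `subset` eagerly).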
+ hf_filter=partial(lambda subset, x: x["Subject"].lower() == subset, subset),
+ hf_revision="038c7808122969ead7456361af05cb8f47d247f8",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
+ ],
+ ),
+ )
+ for subset in MMLU_SUBSETS
+ for language in [
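+ # (Language, MMMLU config name) pairs; the second element is used as hf_subset.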
+ (Language.ARABIC, "AR_XY"),
+ (Language.BENGALI, "BN_BD"),
+ (Language.GERMAN, "DE_DE"),
+ (Language.SPANISH, "ES_LA"),
+ (Language.FRENCH, "FR_FR"),
+ (Language.HINDI, "HI_IN"),
+ (Language.INDONESIAN, "ID_ID"),
+ (Language.ITALIAN, "IT_IT"),
+ (Language.JAPANESE, "JA_JP"),
+ (Language.KOREAN, "KO_KR"),
+ (Language.PORTUGUESE, "PT_BR"),
+ (Language.SWAHILI, "SW_KE"),
+ (Language.YORUBA, "YO_NG"),
+ (Language.CHINESE, "ZH_CN"),
+ ]
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/openbook_ara.py b/src/lighteval/tasks/multilingual/tasks/openbook_ara.py
new file mode 100644
index 000000000..db5b3a426
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/openbook_ara.py
@@ -0,0 +1,67 @@
+"""
+name:
+Openbook Ara
+
+dataset:
+OALL/AlGhafa-Arabic-LLM-Benchmark-Translated
+
+abstract:
+OpenBookQA: A Question-Answering Dataset for Open-Book Exams OpenBookQA is a
+question-answering dataset modeled after open-book exams for assessing human
+understanding of a subject. It consists of multiple-choice questions that
+require combining facts from a given open book with broad common knowledge. The
+task tests language models' ability to leverage provided information and apply
+common sense reasoning.
+
+languages:
+arabic
+
+tags:
+multilingual, multiple-choice, reasoning
+
+paper:
+https://arxiv.org/abs/1809.02789
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.adapters import (
+ alghafa_adapter,
+)
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"alghafa_openbookqa_{Language.ARABIC.value}_{formulation.name.lower()}",
+ prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation),
+ suite=["lighteval"],
+ hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
+ hf_subset="openbook_qa_ext_ar",
+ hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff",
+ evaluation_splits=["test"],
+ few_shots_split="validation",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/openbook_es.py b/src/lighteval/tasks/multilingual/tasks/openbook_es.py
new file mode 100644
index 000000000..c428275fe
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/openbook_es.py
@@ -0,0 +1,67 @@
+"""
+name:
+Openbook Es
+
+dataset:
+BSC-LT/openbookqa-es
+
+abstract:
+Spanish version of OpenBookQA from the BSC Language Technology group.
+
+languages:
+spanish
+
+tags:
+multilingual, multiple-choice, reasoning
+
+paper:
+https://huggingface.co/datasets/BSC-LT/openbookqa-es
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"openbookqa_{Language.SPANISH.value}_{formulation.name.lower()}",
+ prompt_function=get_mcq_prompt_function(
+ Language.SPANISH,
+ lambda line: {
+ "question": line["question_stem"],
+ "choices": line["choices"]["text"],
+ "gold_idx": LETTER_INDICES.index(line["answerKey"]),
+ },
+ formulation=formulation,
+ ),
+ suite=["lighteval"],
+ hf_repo="BSC-LT/openbookqa-es",
+ hf_subset="default",
+ evaluation_splits=("test",),
+ few_shots_split="validation",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/openbook_rus.py b/src/lighteval/tasks/multilingual/tasks/openbook_rus.py
new file mode 100644
index 000000000..498d32eed
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/openbook_rus.py
@@ -0,0 +1,68 @@
+"""
+name:
+Openbook Rus
+
+dataset:
+ai-forever/MERA
+
+abstract:
+Russian version of OpenBookQA (ruOpenBookQA), distributed as part of the MERA
+benchmark for evaluating Russian-language models.
+
+languages:
+russian
+
+tags:
+multilingual, multiple-choice, reasoning
+
+paper:
+https://arxiv.org/abs/2401.04531
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"mera_openbookqa_{Language.RUSSIAN.value}_{formulation.name.lower()}",
+ prompt_function=get_mcq_prompt_function(
+ Language.RUSSIAN,
+ lambda line: {
+ "question": line["inputs"]["question"],
+ "choices": [line["inputs"][f"option_{i.lower()}"] for i in LETTER_INDICES[:4]],
+ "gold_idx": LETTER_INDICES.index(line["outputs"]),
+ },
+ formulation=formulation,
+ ),
+ suite=["lighteval"],
+ hf_repo="ai-forever/MERA",
+ hf_subset="ruopenbookqa",
+ evaluation_splits=("train",),
+ hf_avail_splits=["train"],
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/oz.py b/src/lighteval/tasks/multilingual/tasks/oz.py
new file mode 100644
index 000000000..dde7552a1
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/oz.py
@@ -0,0 +1,77 @@
+"""
+name:
+OZ Serbian Evals
+
+dataset:
+DjMel/oz-eval
+
+abstract:
+The OZ Eval (sr. Opšte Znanje Evaluacija) dataset was created to evaluate the
+general knowledge of LLMs in Serbian. It consists of 1k+ high-quality questions
+and answers used in entry exams at the Faculty of Philosophy and the Faculty of
+Organizational Sciences, University of Belgrade, during the enrollment periods
+from 2003 to 2024.
+
+languages:
+serbian
+
+tags:
+knowledge, multiple-choice
+
+paper:
+"""
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+
+def prompt_fn_oz_eval_task(line, task_name: str = None):
+ query_template = """Pitanje: {question}\n
+ Ponuđeni odgovori:
+ A. {choice_a}
+ B. {choice_b}
+ C. {choice_c}
+ D. {choice_d}
+ E. {choice_e}
+
+ Krajnji odgovor:"""
+
+ options = line["options"]
+
+ query = query_template.format(
+ question=line["questions"],
+ choice_a=options[0],
+ choice_b=options[1],
+ choice_c=options[2],
+ choice_d=options[3],
+ choice_e=options[4],
+ )
+
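+ # The model is scored on the answer letter; gold_index is the position of the correct letter.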
+ choices = ["A", "B", "C", "D", "E"]
+ return Doc(
+ task_name=task_name,
+ query=query,
+ choices=choices,
+ gold_index=choices.index(line["answer"]),
+ )
+
+
+oz_eval_task = LightevalTaskConfig(
+ name="serbian_evals:oz_task",
+ prompt_function=prompt_fn_oz_eval_task,
+ suite=["community"],
+ hf_repo="DjMel/oz-eval",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ metrics=[Metrics.loglikelihood_acc],
+ version=0,
+)
+
+
+# STORE YOUR EVALS
+TASKS_TABLE = [oz_eval_task]
diff --git a/src/lighteval/tasks/multilingual/tasks/parus.py b/src/lighteval/tasks/multilingual/tasks/parus.py
new file mode 100644
index 000000000..6ff91448b
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/parus.py
@@ -0,0 +1,65 @@
+"""
+name:
+Parus
+
+dataset:
+ai-forever/MERA
+
+abstract:
+PARus (Plausible Alternatives for Russian) is the Russian adaptation of the
+COPA task and part of the Russian SuperGLUE benchmark. It evaluates common
+sense reasoning and causal inference in Russian language models.
+
+languages:
+russian
+
+tags:
+multilingual
+
+paper:
+https://russiansuperglue.com/tasks/task_info/PARus
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.copa import get_copa_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"parus_{Language.RUSSIAN.value}_{formulation.name.lower()}",
+ suite=["lighteval"],
+ prompt_function=get_copa_prompt_function(
+ language=Language.RUSSIAN,
+ adapter=lambda line: {
+ "context": line["inputs"]["premise"],
+ "cause_effect": line["meta"]["task"],
+ "continuations": [line["inputs"]["choice1"], line["inputs"]["choice2"]],
+ "gold_idx": int(line["outputs"]) - 1,
+ },
+ formulation=formulation,
+ ),
+ hf_repo="ai-forever/MERA",
+ hf_subset="parus",
+ evaluation_splits=["train"],
+ few_shots_split="validation",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/paws_x.py b/src/lighteval/tasks/multilingual/tasks/paws_x.py
new file mode 100644
index 000000000..e294cc15c
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/paws_x.py
@@ -0,0 +1,79 @@
+"""
+name:
+Paws X
+
+dataset:
+google-research-datasets/paws-x
+
+abstract:
+PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification.
+The dataset contains paraphrase identification pairs in seven languages,
+derived from PAWS (Paraphrase Adversaries from Word Scrambling). Here,
+paraphrase is treated as entailment and non-paraphrase as contradiction.
+
+languages:
+chinese, english, french, german, japanese, korean, spanish
+
+tags:
+classification, multilingual, nli
+
+paper:
+https://arxiv.org/abs/1908.11828
+"""
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.nli import get_nli_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"pawsx_{language.value}_{formulation.name.lower()}",
+ suite=("lighteval",),
+ prompt_function=get_nli_prompt_function(
+ language=language,
+ adapter=lambda line: {
+ "premise": line["sentence1"],
+ "hypothesis": line["sentence2"],
+ # PAWS-X labels are binary (paraphrase vs. not paraphrase), mapped onto the two NLI relations
+ "gold_idx": int(line["label"]),
+ },
+ relations=["entailment", "contradiction"],
+ formulation=formulation,
+ ),
+ hf_repo="google-research-datasets/paws-x",
+ hf_subset=standardize_tag(language.value),
+ evaluation_splits=("test",),
+ few_shots_split="train",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=None),
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for language in [
+ Language.GERMAN,
+ Language.ENGLISH,
+ Language.SPANISH,
+ Language.FRENCH,
+ Language.JAPANESE,
+ Language.KOREAN,
+ Language.CHINESE,
+ ]
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/piqa_ar.py b/src/lighteval/tasks/multilingual/tasks/piqa_ar.py
new file mode 100644
index 000000000..e3f7b2f40
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/piqa_ar.py
@@ -0,0 +1,66 @@
+"""
+name:
+Piqa Ar
+
+dataset:
+OALL/AlGhafa-Arabic-LLM-Benchmark-Translated
+
+abstract:
+PIQA: Physical Interaction Question Answering PIQA is a benchmark for testing
+physical commonsense reasoning. This Arabic version is a translation of the
+original PIQA dataset, adapted for Arabic language evaluation. It tests the
+ability to reason about physical interactions in everyday situations.
+
+languages:
+arabic
+
+tags:
+multilingual, multiple-choice, qa, reasoning
+
+paper:
+https://arxiv.org/abs/1911.11641
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.adapters import (
+ alghafa_adapter,
+)
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"alghafa_piqa_{Language.ARABIC.value}_{formulation.name.lower()}",
+ prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation),
+ suite=["lighteval"],
+ hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
+ hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff",
+ hf_subset="piqa_ar",
+ hf_avail_splits=["test", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split="validation",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/rcb.py b/src/lighteval/tasks/multilingual/tasks/rcb.py
new file mode 100644
index 000000000..7091126a5
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/rcb.py
@@ -0,0 +1,68 @@
+"""
+name:
+Rcb
+
+dataset:
+ai-forever/MERA
+
+abstract:
+Russian Commitment Bank (RCB) is a large-scale NLI dataset with Russian
+sentences, collected from the web and crowdsourcing.
+
+languages:
+russian
+
+tags:
+classification, multilingual, nli
+
+paper:
+https://arxiv.org/abs/2401.04531
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.nli import get_nli_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"rcb_{Language.RUSSIAN.value}_{formulation.name.lower()}",
+ prompt_function=get_nli_prompt_function(
+ language=Language.RUSSIAN,
+ adapter=lambda line: {
+ "premise": line["inputs"]["premise"],
+ "hypothesis": line["inputs"]["hypothesis"],
+ # Since we ignore the neutral label
+ "gold_idx": int(line["outputs"]) - 1,
+ },
+ relations=["entailment", "contradiction"],
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="ai-forever/MERA",
+ hf_subset="rcb",
+ # Ignore neutral label
+ hf_filter=lambda x: int(x["outputs"] or "0") in [1, 2],
+ evaluation_splits=("train",),
+ few_shots_split="validation",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=None),
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/sber_squad.py b/src/lighteval/tasks/multilingual/tasks/sber_squad.py
new file mode 100644
index 000000000..51abc0609
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/sber_squad.py
@@ -0,0 +1,53 @@
+"""
+name:
+Sber Squad
+
+dataset:
+kuznetsoffandrey/sberquad
+
+abstract:
+SberQuAD: A large-scale Russian reading comprehension dataset.
+
+languages:
+russian
+
+tags:
+multilingual, qa
+
+paper:
+https://arxiv.org/abs/1912.09723
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"sber_squad_{Language.RUSSIAN.value}",
+ prompt_function=get_qa_prompt_function(
+ Language.RUSSIAN,
+ lambda line: {
+ "question": line["question"],
+ "context": line["context"],
+ "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="kuznetsoffandrey/sberquad",
+ hf_subset="sberquad",
+ evaluation_splits=("validation",),
+ few_shots_split="train",
+ metrics=(
+ MultilingualQuasiExactMatchMetric(Language.RUSSIAN, "prefix"),
+ MultilingualQuasiF1ScoreMetric(Language.RUSSIAN),
+ ),
+ generation_size=400,
+ stop_sequence=("\n",),
+ )
+]
diff --git a/community_tasks/serbian_eval.py b/src/lighteval/tasks/multilingual/tasks/serbian_eval.py
similarity index 95%
rename from community_tasks/serbian_eval.py
rename to src/lighteval/tasks/multilingual/tasks/serbian_eval.py
index c235c7e47..e2df1f57a 100644
--- a/community_tasks/serbian_eval.py
+++ b/src/lighteval/tasks/multilingual/tasks/serbian_eval.py
@@ -1,34 +1,22 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
+"""
+name:
+Serbian Evals
+dataset:
+datatab/serbian-llm-benchmark
-"""
-This module contains task configurations and prompt functions for evaluating
-LLM models on Serbian datasets.
-Each task is defined using the `LightevalTaskConfig` class with its respective
-prompt function.
+abstract:
 The tasks cover a variety of benchmarks, including standard tasks like ARC[E][C],
 BoolQ, HellaSwag, OpenBookQA, PIQA, Winogrande, and a custom OZ Eval.
 MMLU is provided both per subject and as a single combined task.
+
+languages:
+serbian
+
+tags:
+knowledge, multiple-choice
+
+paper:
"""
from enum import Enum
diff --git a/src/lighteval/tasks/multilingual/tasks/soqal.py b/src/lighteval/tasks/multilingual/tasks/soqal.py
new file mode 100644
index 000000000..ad41456c9
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/soqal.py
@@ -0,0 +1,61 @@
+"""
+name:
+Soqal
+
+dataset:
+OALL/AlGhafa-Arabic-LLM-Benchmark-Native
+
+abstract:
+SOQAL: A large-scale Arabic reading comprehension dataset.
+
+languages:
+arabic
+
+tags:
+multilingual, qa
+
+paper:
+https://arxiv.org/abs/1906.05394
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.adapters import (
+ alghafa_adapter,
+)
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"soqal_{Language.ARABIC.value}_{formulation.name.lower()}",
+ hf_subset="multiple_choice_grounded_statement_soqal_task",
+ prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation),
+ evaluation_splits=["test"],
+ few_shots_split="validation",
+ suite=["lighteval"],
+ hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/squad_es.py b/src/lighteval/tasks/multilingual/tasks/squad_es.py
new file mode 100644
index 000000000..4022a8420
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/squad_es.py
@@ -0,0 +1,54 @@
+"""
+name:
+Squad Es
+
+dataset:
+ccasimiro/squad_es
+
+abstract:
+SQuAD-es: Spanish translation of the Stanford Question Answering Dataset (SQuAD).
+
+languages:
+spanish
+
+tags:
+multilingual, qa
+
+paper:
+https://huggingface.co/datasets/ccasimiro/squad_es
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"squad_{Language.SPANISH.value}",
+ prompt_function=get_qa_prompt_function(
+ Language.SPANISH,
+ lambda line: {
+ "question": line["question"],
+ "context": line["context"],
+ "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="ccasimiro/squad_es",
+ hf_subset="v2.0.0",
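+ # The v2 split includes unanswerable questions; keep only rows with at least one non-empty gold answer.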
+ hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
+ evaluation_splits=("validation",),
+ few_shots_split="train",
+ metrics=(
+ MultilingualQuasiExactMatchMetric(Language.SPANISH, "prefix"),
+ MultilingualQuasiF1ScoreMetric(Language.SPANISH),
+ ),
+ generation_size=400,
+ stop_sequence=("\n",),
+ )
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/squad_it.py b/src/lighteval/tasks/multilingual/tasks/squad_it.py
new file mode 100644
index 000000000..d894e19be
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/squad_it.py
@@ -0,0 +1,54 @@
+"""
+name:
+Squad It
+
+dataset:
+crux82/squad_it
+
+abstract:
+SQuAD-it: Italian translation of the SQuAD dataset.
+
+languages:
+italian
+
+tags:
+multilingual, qa
+
+paper:
+https://github.com/crux82/squad-it
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"squad_{Language.ITALIAN.value}",
+ prompt_function=get_qa_prompt_function(
+ Language.ITALIAN,
+ lambda line: {
+ "question": line["question"],
+ "context": line["context"],
+ "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="crux82/squad_it",
+ hf_subset="default",
+ hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
+ evaluation_splits=("test",),
+ few_shots_split="train",
+ generation_size=400,
+ stop_sequence=("\n",),
+ metrics=(
+ MultilingualQuasiExactMatchMetric(Language.ITALIAN, "prefix"),
+ MultilingualQuasiF1ScoreMetric(Language.ITALIAN),
+ ),
+ )
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/swahili_arc.py b/src/lighteval/tasks/multilingual/tasks/swahili_arc.py
new file mode 100644
index 000000000..c40efa573
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/swahili_arc.py
@@ -0,0 +1,72 @@
+"""
+name:
+Swahili Arc
+
+dataset:
+Mollel/ARC_Easy_SWH, Mollel/ARC_Challenge_SWH
+
+abstract:
+Swahili translations of the ARC (AI2 Reasoning Challenge) benchmark, covering
+both the Easy and Challenge subsets.
+
+languages:
+swahili
+
+tags:
+multilingual, multiple-choice, reasoning
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"community_arc_{Language.SWAHILI.value}_{formulation.name.lower()}:{subset}",
+ prompt_function=get_mcq_prompt_function(
+ Language.SWAHILI,
+ lambda line: {
+ "question": line["question"],
+ "choices": line["choices"]["text"],
+ "gold_idx": int(line["answerKey"]) - 1
+ if line["answerKey"].isdigit()
+ else LETTER_INDICES.index(line["answerKey"]),
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo=f"Mollel/ARC_{subset.capitalize()}_SWH",
+ hf_subset="default",
+ hf_revision="5347439d3193c8a0dabaab3819914bf076dc94d4"
+ if subset == "easy"
+ else "dc1df9df632d14c251594d9129fb833d2ca4429c",
+ evaluation_splits=("test",),
+ few_shots_split="train",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ]
+ + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore
+ ),
+ )
+ for subset in ["easy", "challenge"]
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/thai_exams.py b/src/lighteval/tasks/multilingual/tasks/thai_exams.py
new file mode 100644
index 000000000..73f8140f7
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/thai_exams.py
@@ -0,0 +1,64 @@
+"""
+name:
+Thai Exams
+
+dataset:
+scb10x/thai_exam
+
+abstract:
+ThaiExam: multiple-choice questions drawn from Thai standardized exams
+(A-Level, IC, ONET, TGAT, and TPAT-1), testing Thai-language knowledge and
+reasoning.
+
+languages:
+thai
+
+tags:
+knowledge, multilingual, multiple-choice
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.adapters import (
+ thai_exams_adapter,
+)
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+THAI_EXAMS_SUBSETS = ["a_level", "ic", "onet", "tgat", "tpat1"]
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"thai_exams_{Language.THAI.value}_{formulation.name.lower()}:{subset}",
+ prompt_function=get_mcq_prompt_function(Language.THAI, thai_exams_adapter, formulation=formulation),
+ suite=("lighteval",),
+ hf_repo="scb10x/thai_exam",
+ hf_subset=subset,
+ evaluation_splits=("test",),
+ few_shots_split="train",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for subset in THAI_EXAMS_SUBSETS
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/thaiqa.py b/src/lighteval/tasks/multilingual/tasks/thaiqa.py
new file mode 100644
index 000000000..bf2b5c279
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/thaiqa.py
@@ -0,0 +1,52 @@
+"""
+name:
+Thaiqa
+
+dataset:
+lighteval/thaiqa_squad_fixed
+
+abstract:
+ThaiQA: A question answering dataset for the Thai language.
+
+languages:
+thai
+
+tags:
+multilingual, qa
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"thaiqa_{Language.THAI.value}",
+ prompt_function=get_qa_prompt_function(
+ Language.THAI,
+ lambda line: {
+ "question": line["question"],
+ "context": line["context"],
+ "choices": [ans for ans in line["answers"]["answer"] if len(ans) > 0],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="lighteval/thaiqa_squad_fixed",
+ hf_subset="default",
+ evaluation_splits=("train",),
+ few_shots_split="validation",
+ generation_size=400,
+ stop_sequence=("\n",),
+ metrics=(
+ MultilingualQuasiExactMatchMetric(Language.THAI, "prefix"),
+ MultilingualQuasiF1ScoreMetric(Language.THAI),
+ ),
+ )
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/tquad_v2.py b/src/lighteval/tasks/multilingual/tasks/tquad_v2.py
new file mode 100644
index 000000000..e337ff538
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/tquad_v2.py
@@ -0,0 +1,52 @@
+"""
+name:
+Tquad V2
+
+dataset:
+erdometo/tquad2
+
+abstract:
+TQuAD v2: Turkish Question Answering Dataset version 2.
+
+languages:
+turkish
+
+tags:
+multilingual, qa
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"tquadv2_{Language.TURKISH.value}",
+ prompt_function=get_qa_prompt_function(
+ Language.TURKISH,
+ lambda line: {
+ "question": line["question"],
+ "context": line["context"],
+ "choices": [a["text"] for a in line["answers"]],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="erdometo/tquad2",
+ hf_subset="default",
+ evaluation_splits=("validation",),
+ few_shots_split="train",
+ generation_size=400,
+ stop_sequence=("\n",),
+ metrics=(
+ MultilingualQuasiExactMatchMetric(Language.TURKISH, "prefix"),
+ MultilingualQuasiF1ScoreMetric(Language.TURKISH),
+ ),
+ )
+]
diff --git a/community_tasks/turkic_evals.py b/src/lighteval/tasks/multilingual/tasks/turkic.py
similarity index 64%
rename from community_tasks/turkic_evals.py
rename to src/lighteval/tasks/multilingual/tasks/turkic.py
index 242b25f81..074fc9b4a 100644
--- a/community_tasks/turkic_evals.py
+++ b/src/lighteval/tasks/multilingual/tasks/turkic.py
@@ -1,40 +1,22 @@
-# MIT License
+"""
+name:
+Turkic Evals
-# Copyright (c) 2024 The HuggingFace Team
+dataset:
+jafarisbarov/TUMLU-mini
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
+abstract:
+TUMLU-mini is a benchmark for Turkic language understanding, comprising 1,000
+prompts organized into 10 subsets.
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+languages:
+turkic
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
+tags:
+knowledge, multiple-choice
-# ruff: noqa: F405, F403, F401
-"""
-Task to evaluate LLMs on TUMLU-mini benchmark: https://huggingface.co/datasets/jafarisbarov/TUMLU-mini
-
-For more details, see the associated paper:
-
-@misc{isbarov2025tumluunifiednativelanguage,
- title={{TUMLU: A Unified and Native Language Understanding Benchmark for Turkic Languages}},
- author={Jafar Isbarov and Arofat Akhundjanova and Mammad Hajili and Kavsar Huseynova and Dmitry Gaynullin and Anar Rzayev and Osman Tursun and Ilshat Saetov and Rinat Kharisov and Saule Belginova and Ariana Kenbayeva and Amina Alisheva and Aizirek Turdubaeva and Abdullatif Köksal and Samir Rustamov and Duygu Ataman},
- year={2025},
- eprint={2502.11020},
- archivePrefix={arXiv},
- primaryClass={cs.CL},
- url={https://arxiv.org/abs/2502.11020},
-}
+paper:
+https://arxiv.org/abs/2502.11020
"""
from functools import partial
diff --git a/src/lighteval/tasks/multilingual/tasks/turkish_arc.py b/src/lighteval/tasks/multilingual/tasks/turkish_arc.py
new file mode 100644
index 000000000..9174851e6
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/turkish_arc.py
@@ -0,0 +1,70 @@
+"""
+name:
+Turkish Arc
+
+dataset:
+malhajar/arc-tr
+
+abstract:
+Turkish translation of ARC (AI2 Reasoning Challenge), taken from the Turkish
+LLM leaderboard.
+
+languages:
+turkish
+
+tags:
+multilingual, multiple-choice, reasoning
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"community_arc_{Language.TURKISH.value}_{formulation.name.lower()}:{subset}",
+ prompt_function=get_mcq_prompt_function(
+ Language.TURKISH,
+ lambda line: {
+ "question": line["question"],
+ "choices": line["choices"]["text"],
+ "gold_idx": int(line["answerKey"]) - 1
+ if line["answerKey"].isdigit()
+ else LETTER_INDICES.index(line["answerKey"]),
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="malhajar/arc-tr",
+ hf_subset=f"ARC-{subset.capitalize()}",
+ evaluation_splits=("test",),
+ hf_avail_splits=["train"],
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ]
+ + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore
+ ),
+ )
+ for subset in ["easy", "challenge"]
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/turkish_mmlu.py b/src/lighteval/tasks/multilingual/tasks/turkish_mmlu.py
new file mode 100644
index 000000000..cc0605456
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/turkish_mmlu.py
@@ -0,0 +1,81 @@
+"""
+name:
+Turkish Mmlu
+
+dataset:
+AYueksel/TurkishMMLU
+
+abstract:
+TurkishMMLU: a multiple-choice question-answering benchmark for Turkish, with
+questions drawn from the Turkish high-school curriculum across nine subjects.
+
+languages:
+turkish
+
+tags:
+knowledge, multilingual, multiple-choice
+
+paper:
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation, normalize_subset
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TURKISH_MMLU_SUBSET = [
+ "Biology",
+ "Chemistry",
+ "Geography",
+ "History",
+ "Mathematics",
+ "Philosophy",
+ "Physics",
+ "Religion_and_Ethics",
+ "Turkish_Language_and_Literature",
+]
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"mmlu_{Language.TURKISH.value}_{formulation.name.lower()}:{normalize_subset(subset)}",
+ prompt_function=get_mcq_prompt_function(
+ Language.TURKISH,
+ lambda line: {
+ "question": line["question"],
+ "choices": line["choices"],
+ "gold_idx": LETTER_INDICES.index(line["answer"]),
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="AYueksel/TurkishMMLU",
+ hf_subset=subset,
+ evaluation_splits=("test",),
+ few_shots_split="dev",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
+ ],
+ ),
+ )
+ for subset in TURKISH_MMLU_SUBSET
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/tydiqa.py b/src/lighteval/tasks/multilingual/tasks/tydiqa.py
new file mode 100644
index 000000000..b7a62e2dd
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/tydiqa.py
@@ -0,0 +1,66 @@
+"""
+name:
+Tydiqa
+
+dataset:
+google-research-datasets/tydiqa
+
+abstract:
+TyDi QA: a benchmark for information-seeking question answering in
+typologically diverse languages.
+
+languages:
+arabic, bengali, english, finnish, indonesian, japanese, korean, russian, swahili, telugu, thai
+
+tags:
+multilingual, qa
+
+paper:
+https://arxiv.org/abs/2003.05002
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"tydiqa_{language.value}",
+ prompt_function=get_qa_prompt_function(
+ language,
+ lambda line: {
+ "question": line["question"],
+ "context": line["context"],
+ "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="google-research-datasets/tydiqa",
+ hf_subset="secondary_task",
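+ # "secondary_task" is TyDi QA's Gold Passage (GoldP) setting: answer extraction given the gold passage.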
+ evaluation_splits=("validation",),
+ few_shots_split="train",
+ generation_size=400,
+ stop_sequence=("\n",),
+ metrics=(
+ MultilingualQuasiExactMatchMetric(language, "prefix"),
+ MultilingualQuasiF1ScoreMetric(language),
+ ),
+ )
+ for language in [
+ Language.ENGLISH,
+ Language.ARABIC,
+ Language.BENGALI,
+ Language.FINNISH,
+ Language.INDONESIAN,
+ Language.JAPANESE,
+ Language.KOREAN,
+ Language.SWAHILI,
+ Language.RUSSIAN,
+ Language.TELUGU,
+ Language.THAI,
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/worldtree_rus.py b/src/lighteval/tasks/multilingual/tasks/worldtree_rus.py
new file mode 100644
index 000000000..814c80b49
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/worldtree_rus.py
@@ -0,0 +1,70 @@
+"""
+name:
+Worldtree Rus
+
+dataset:
+ai-forever/MERA
+
+abstract:
+WorldTree is a dataset for multi-hop inference in science question answering. It
+provides explanations for elementary science questions by combining facts from a
+semi-structured knowledge base. This Russian version (ruWorldTree) is part of
+the MERA benchmark for evaluating Russian-language models.
+
+languages:
+russian
+
+tags:
+multilingual
+
+paper:
+https://github.com/ai-forever/MERA
+"""
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"mera_worldtree_{Language.RUSSIAN.value}_{formulation.name.lower()}",
+ prompt_function=get_mcq_prompt_function(
+ Language.RUSSIAN,
+ lambda line: {
+ "question": line["inputs"]["question"],
+ "choices": [line["inputs"][f"option_{i.lower()}"] for i in LETTER_INDICES[:4]],
+ "gold_idx": LETTER_INDICES.index(line["outputs"]),
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="ai-forever/MERA",
+ hf_subset="ruworldtree",
+ evaluation_splits=("train",),
+ hf_avail_splits=["train"],
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/xcodah.py b/src/lighteval/tasks/multilingual/tasks/xcodah.py
new file mode 100644
index 000000000..5b6783eaf
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/xcodah.py
@@ -0,0 +1,83 @@
+"""
+name:
+Xcodah
+
+dataset:
+INK-USC/xcsr
+
+abstract:
+X-CODAH: the multilingual version of the CODAH commonsense reasoning
+benchmark, part of the X-CSR (Cross-lingual Commonsense Reasoning) suite.
+
+languages:
+arabic, chinese, dutch, english, french, german, hindi, italian, japanese,
+polish, portuguese, russian, spanish, swahili, urdu, vietnamese
+
+tags:
+multilingual, multiple-choice, reasoning
+
+paper:
+"""
+
+from functools import partial
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.adapters import (
+ xcodah_adapter,
+)
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"xcodah_{language.value}_{formulation.name.lower()}",
+ prompt_function=get_mcq_prompt_function(language, partial(xcodah_adapter, language), formulation=formulation),
+ suite=("lighteval",),
+ hf_repo="INK-USC/xcsr",
+ hf_subset=f"X-CODAH-{standardize_tag(language.value) if language != Language.JAPANESE else 'jap'}",
+ evaluation_splits=("validation",),
+ hf_avail_splits=["validation"],
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for language in [
+ Language.ARABIC,
+ Language.GERMAN,
+ Language.ENGLISH,
+ Language.SPANISH,
+ Language.FRENCH,
+ Language.HINDI,
+ Language.ITALIAN,
+ Language.JAPANESE,
+ Language.DUTCH,
+ Language.POLISH,
+ Language.PORTUGUESE,
+ Language.RUSSIAN,
+ Language.SWAHILI,
+ Language.URDU,
+ Language.VIETNAMESE,
+ Language.CHINESE,
+ ]
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/xcopa.py b/src/lighteval/tasks/multilingual/tasks/xcopa.py
new file mode 100644
index 000000000..aafb34c77
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/xcopa.py
@@ -0,0 +1,82 @@
+"""
+name:
+Xcopa
+
+dataset:
+xcopa, OALL/AlGhafa-Arabic-LLM-Benchmark-Translated (Arabic)
+
+abstract:
+COPA (Choice of Plausible Alternatives) tasks involve determining the most
+plausible cause or effect for a given premise. These tasks test common sense
+reasoning and causal inference abilities. XCOPA: Cross-lingual Choice of
+Plausible Alternatives.
+
+languages:
+arabic, chinese, estonian, haitian, indonesian, italian, quechua, swahili,
+tamil, thai, turkish, vietnamese
+
+tags:
+multilingual, multiple-choice, narrative, reasoning
+
+paper:
+https://aclanthology.org/2020.emnlp-main.185/
+"""
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.copa import get_copa_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"xcopa_{language.value}_{formulation.name.lower()}",
+ suite=["lighteval"],
+ prompt_function=get_copa_prompt_function(
+ language,
+ adapter=lambda line: {
+ "context": line["premise"],
+ "cause_effect": line["question"],
+ "continuations": [line["choice1"], line["choice2"]],
+ "gold_idx": int(line["label"]),
+ },
+ formulation=formulation,
+ ),
+ hf_repo=("OALL/AlGhafa-Arabic-LLM-Benchmark-Translated" if language == Language.ARABIC else "xcopa"),
+ hf_subset=("copa_ext_ar" if language == Language.ARABIC else standardize_tag(language.value)),
+ evaluation_splits=["test"],
+ few_shots_split="validation",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for language in [
+ Language.ARABIC,
+ Language.ESTONIAN,
+ Language.INDONESIAN,
+ Language.ITALIAN,
+ Language.SWAHILI,
+ Language.TAMIL,
+ Language.THAI,
+ Language.TURKISH,
+ Language.VIETNAMESE,
+ Language.CHINESE,
+ Language.HAITIAN,
+ Language.QUECHUA,
+ ]
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/xcsqa.py b/src/lighteval/tasks/multilingual/tasks/xcsqa.py
new file mode 100644
index 000000000..ef12349f6
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/xcsqa.py
@@ -0,0 +1,95 @@
+"""
+name:
+Xcsqa
+
+dataset:
+INK-USC/xcsr
+
+abstract:
+XCSQA (Cross-lingual Commonsense QA) is part of the XCSR (Cross-lingual
+Commonsense Reasoning) benchmark. It is a multilingual extension of the
+CommonsenseQA dataset, covering 16 languages. The task involves answering
+multiple-choice questions that require commonsense reasoning. This
+implementation uses PMI normalization.
+
+languages:
+arabic, chinese, dutch, english, french, german, hindi, italian, japanese,
+polish, portuguese, russian, spanish, swahili, urdu, vietnamese
+
+tags:
+multilingual, multiple-choice, qa, reasoning
+
+paper:
+https://arxiv.org/abs/2110.08462
+"""
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"xcsqa_{language.value}_{formulation.name.lower()}",
+ prompt_function=get_mcq_prompt_function(
+ language,
+ lambda line: {
+ "question": line["question"]["stem"],
+ "choices": line["question"]["choices"]["text"],
+ "gold_idx": line["question"]["choices"]["label"].index(line["answerKey"]),
+ },
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="INK-USC/xcsr",
+ hf_subset=f"X-CSQA-{standardize_tag(language.value) if language != Language.JAPANESE else 'jap'}",
+ hf_filter=lambda x: all(
+ len(x["question"]["choices"]["text"][i].strip()) > 0 for i in range(len(x["question"]["choices"]["text"]))
+ ),
+ evaluation_splits=("validation",),
+ hf_avail_splits=["validation"],
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
+ ],
+ ),
+ )
+ for language in [
+ Language.ARABIC,
+ Language.GERMAN,
+ Language.ENGLISH,
+ Language.SPANISH,
+ Language.FRENCH,
+ Language.HINDI,
+ Language.ITALIAN,
+ Language.JAPANESE,
+ Language.DUTCH,
+ Language.POLISH,
+ Language.PORTUGUESE,
+ Language.RUSSIAN,
+ Language.SWAHILI,
+ Language.URDU,
+ Language.VIETNAMESE,
+ Language.CHINESE,
+ ]
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/xnli.py b/src/lighteval/tasks/multilingual/tasks/xnli.py
new file mode 100644
index 000000000..9c55458ec
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/xnli.py
@@ -0,0 +1,93 @@
+"""
+name:
+Xnli
+
+dataset:
+facebook/xnli
+
+abstract:
+NLI (Natural Language Inference) tasks involve determining the logical
+relationship between two given sentences: a premise and a hypothesis. The goal
+is to classify whether the hypothesis is entailed by, contradicts, or is neutral
+with respect to the premise. The XNLI dataset is a multilingual variant of
+MultiNLI. After inspection we found the neutral label to be quite ambiguous and
+decided to exclude it here; it can easily be added back by modifying the adapter
+(a sketch is included at the end of this file).
+
+languages:
+arabic, bulgarian, chinese, english, french, german, greek, hindi, russian,
+spanish, swahili, thai, turkish, urdu, vietnamese
+
+tags:
+classification, multilingual, nli
+
+paper:
+https://aclanthology.org/D18-1269/
+"""
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.nli import get_nli_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"xnli_{language.value}_{formulation.name.lower()}",
+ suite=["lighteval"],
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=None),
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ prompt_function=get_nli_prompt_function(
+ language=language,
+ adapter=lambda line: {
+ "premise": line["premise"],
+ "hypothesis": line["hypothesis"],
+ # Since we ignore the neutral label
+ "gold_idx": {0: 0, 2: 1}[line["label"]],
+ },
+ relations=["entailment", "contradiction"],
+ formulation=formulation,
+ ),
+ hf_filter=lambda line: line["label"] in [0, 2],
+ hf_repo="facebook/xnli",
+ hf_subset=standardize_tag(language.value),
+ evaluation_splits=["validation"],
+ few_shots_split="train",
+ )
+ for language in [
+ Language.ARABIC,
+ Language.ENGLISH,
+ Language.FRENCH,
+ Language.SPANISH,
+ Language.BULGARIAN,
+ Language.GERMAN,
+ Language.GREEK,
+ Language.HINDI,
+ Language.RUSSIAN,
+ Language.SWAHILI,
+ Language.THAI,
+ Language.TURKISH,
+ Language.URDU,
+ Language.VIETNAMESE,
+ Language.CHINESE,
+ ]
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
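+
+# Sketch (not registered above, names and semantics assumed from the two-class
+# setup): to evaluate with the neutral label included, drop the hf_filter and map
+# all three labels in the adapter, e.g.
+#
+#   prompt_function=get_nli_prompt_function(
+#       language=language,
+#       adapter=lambda line: {
+#           "premise": line["premise"],
+#           "hypothesis": line["hypothesis"],
+#           "gold_idx": line["label"],  # 0=entailment, 1=neutral, 2=contradiction
+#       },
+#       relations=["entailment", "neutral", "contradiction"],
+#       formulation=formulation,
+#   )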
diff --git a/src/lighteval/tasks/multilingual/tasks/xnli2.py b/src/lighteval/tasks/multilingual/tasks/xnli2.py
new file mode 100644
index 000000000..cf3ec6a66
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/xnli2.py
@@ -0,0 +1,100 @@
+"""
+name:
+Xnli2
+
+dataset:
+Harsit/xnli2.0_train_{language} (one repository per language)
+
+abstract:
+An improvement on XNLI with better translations; in our experience, models tend
+to perform better on XNLI 2.0 than on XNLI.
+
+languages:
+arabic, assamese, bengali, bulgarian, chinese, english, french, german, greek,
+gujarati, hindi, kannada, marathi, punjabi, russian, sanskrit, spanish, swahili,
+tamil, thai, turkish, urdu, vietnamese
+
+tags:
+classification, multilingual, nli
+
+paper:
+https://arxiv.org/abs/2301.06527
+"""
+
+from langcodes import Language as LangCodeLanguage
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.nli import get_nli_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"xnli2.0_{language.value}_{formulation.name.lower()}",
+ suite=["lighteval"],
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=None),
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ prompt_function=get_nli_prompt_function(
+ language=language,
+ adapter=lambda line: {
+ "premise": line["premise"],
+ "hypothesis": line["hypothesis"],
+ # Since we ignore the neutral label
+ "gold_idx": {0: 0, 2: 1}[line["label"]],
+ },
+ relations=["entailment", "contradiction"],
+ formulation=formulation,
+ ),
+ hf_filter=lambda line: line["label"] in [0, 2]
+ and line["premise"] is not None
+ and line["hypothesis"] is not None,
+ hf_repo=f"Harsit/xnli2.0_train_{LangCodeLanguage(standardize_tag(language.value)).language_name().lower()}",
+ hf_subset="default",
+ evaluation_splits=["train"],
+ hf_avail_splits=["train"],
+ )
+ for language in [
+ Language.ENGLISH,
+ Language.FRENCH,
+ Language.PUNJABI,
+ Language.GUJARATI,
+ Language.KANNADA,
+ Language.ASSAMESE,
+ Language.BENGALI,
+ Language.MARATHI,
+ Language.SANSKRIT,
+ Language.TAMIL,
+ Language.GERMAN,
+ Language.URDU,
+ Language.VIETNAMESE,
+ Language.TURKISH,
+ Language.THAI,
+ Language.SWAHILI,
+ Language.SPANISH,
+ Language.RUSSIAN,
+ Language.HINDI,
+ Language.GREEK,
+ Language.CHINESE,
+ Language.BULGARIAN,
+ Language.ARABIC,
+ # Theoretically also: Bhojpuri, Gujarati, Odiya
+ ]
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/xnli_indic.py b/src/lighteval/tasks/multilingual/tasks/xnli_indic.py
new file mode 100644
index 000000000..4d3cf481c
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/xnli_indic.py
@@ -0,0 +1,83 @@
+"""
+name:
+Xnli Indic
+
+dataset:
+Divyanshu/indicxnli
+
+abstract:
+Another variant of XNLI, with emphasis on Indic languages.
+
+languages:
+assamese, bengali, gujarati, hindi, kannada, malayalam, marathi, oriya, punjabi,
+tamil, telugu
+
+tags:
+classification, multilingual, nli
+
+paper:
+https://arxiv.org/abs/2204.08776
+"""
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.nli import get_nli_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"indicnxnli_{language.value}_{formulation.name.lower()}",
+ suite=["lighteval"],
+ prompt_function=get_nli_prompt_function(
+ language=language,
+ adapter=lambda line: {
+ "premise": line["premise"],
+ "hypothesis": line["hypothesis"],
+ # Since we ignore the neutral label
+ "gold_idx": {0: 0, 2: 1}[line["label"]],
+ },
+ relations=["entailment", "contradiction"],
+ formulation=formulation,
+ ),
+ hf_repo="Divyanshu/indicxnli",
+ hf_subset=standardize_tag(language.value),
+ # Ignore neutral
+ hf_filter=lambda x: int(x["label"]) in [0, 2],
+ evaluation_splits=["validation"],
+ few_shots_split="train",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=None),
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for language in [
+ Language.ASSAMESE,
+ Language.BENGALI,
+ Language.GUJARATI,
+ Language.HINDI,
+ Language.KANNADA,
+ Language.MALAYALAM,
+ Language.MARATHI,
+ Language.ORIYA,
+ Language.PUNJABI,
+ Language.TAMIL,
+ Language.TELUGU,
+ ]
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
diff --git a/src/lighteval/tasks/multilingual/tasks/xquad.py b/src/lighteval/tasks/multilingual/tasks/xquad.py
new file mode 100644
index 000000000..858b3a6ee
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/xquad.py
@@ -0,0 +1,74 @@
+"""
+name:
+Xquad
+
+dataset:
+google/xquad
+
+abstract:
+Reading comprehension (RC) tasks evaluate a model's ability to understand and
+extract information from text passages, typically by answering questions about a
+given context. XQuAD (Cross-lingual Question Answering Dataset) is a SQuAD-like
+benchmark that extends SQuAD to 11 languages.
+
+languages:
+arabic, chinese, english, german, greek, hindi, romanian, russian, spanish,
+thai, turkish, vietnamese
+
+tags:
+multilingual, qa
+
+paper:
+https://arxiv.org/abs/1910.11856
+"""
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ MultilingualQuasiExactMatchMetric,
+ MultilingualQuasiF1ScoreMetric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"xquad_{language.value}",
+ prompt_function=get_qa_prompt_function(
+ language,
+ lambda line: {
+ "question": line["question"],
+ "context": line["context"],
+ "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="google/xquad",
+ hf_subset=f"xquad.{standardize_tag(language.value)}",
+ evaluation_splits=("validation",),
+ few_shots_split="validation",
+ generation_size=400,
+ stop_sequence=("\n",),
+ metrics=(
+ MultilingualQuasiExactMatchMetric(language, "prefix"),
+ MultilingualQuasiF1ScoreMetric(language),
+ ),
+ )
+ for language in [
+ Language.ARABIC,
+ Language.GERMAN,
+ Language.GREEK,
+ Language.ENGLISH,
+ Language.SPANISH,
+ Language.HINDI,
+ Language.ROMANIAN,
+ Language.RUSSIAN,
+ Language.THAI,
+ Language.TURKISH,
+ Language.VIETNAMESE,
+ Language.CHINESE,
+ ]
+]
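+
+# Answers are generated (up to 400 tokens, stopping at a newline) and scored with
+# language-aware quasi exact match (prefix) and quasi F1 against all non-empty
+# gold answers.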
diff --git a/src/lighteval/tasks/multilingual/tasks/xstory.py b/src/lighteval/tasks/multilingual/tasks/xstory.py
new file mode 100644
index 000000000..aaf9842c5
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/xstory.py
@@ -0,0 +1,93 @@
+"""
+name:
+Xstory
+
+dataset:
+juletxara/xstory_cloze
+
+abstract:
+XStoryCloze is a multilingual version of the StoryCloze benchmark: given a
+four-sentence story context, the model must choose the correct ending out of two
+candidates. It covers ten non-English languages.
+
+languages:
+arabic, basque, burmese, chinese, hindi, indonesian, russian, spanish, swahili,
+telugu
+
+tags:
+multilingual, narrative
+
+paper:
+"""
+
+from functools import partial
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.continuation import get_continuation_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.tasks.templates.utils.translation_literals import TRANSLATION_LITERALS
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"xstory_cloze_{lang.value}_{formulation.name.lower()}",
+ prompt_function=get_continuation_prompt_function(
+ lang,
+ partial(
+ lambda lang, line: {
+ "context": TRANSLATION_LITERALS[lang].sentence_space.join(
+ [
+ line["input_sentence_1"],
+ line["input_sentence_2"],
+ line["input_sentence_3"],
+ line["input_sentence_4"],
+ ]
+ ),
+ "continuations": [line["sentence_quiz1"], line["sentence_quiz2"]],
+ "gold_idx": int(line["answer_right_ending"]) - 1, # type: ignore
+ },
+ lang,
+ ),
+ formulation=formulation,
+ ),
+ suite=("lighteval",),
+ hf_repo="juletxara/xstory_cloze",
+ hf_subset=standardize_tag(lang.value),
+ evaluation_splits=["eval"],
+ few_shots_split="train",
+ metrics=get_metrics_for_formulation(
+ formulation,
+ [
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ )
+ for lang in [
+ Language.RUSSIAN,
+ Language.CHINESE,
+ Language.SPANISH,
+ Language.ARABIC,
+ Language.HINDI,
+ Language.INDONESIAN,
+ Language.TELUGU,
+ Language.SWAHILI,
+ Language.BASQUE,
+ Language.BURMESE,
+ ]
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
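+
+# Note: the adapter is wrapped in functools.partial so that `lang` is bound
+# eagerly for each task; a bare lambda closing over the loop variable would
+# late-bind and use the last language in the list for every config.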
diff --git a/src/lighteval/tasks/multilingual/tasks/xwinograd.py b/src/lighteval/tasks/multilingual/tasks/xwinograd.py
new file mode 100644
index 000000000..827399e42
--- /dev/null
+++ b/src/lighteval/tasks/multilingual/tasks/xwinograd.py
@@ -0,0 +1,71 @@
+"""
+name:
+Xwinograd
+
+dataset:
+Muennighoff/xwinograd
+
+abstract:
+XWinograd is a multilingual collection of Winograd-schema-style coreference
+problems, testing commonsense pronoun resolution across six languages.
+
+languages:
+chinese, english, french, japanese, portuguese, russian
+
+tags:
+multilingual, multiple-choice, reasoning
+
+paper:
+"""
+
+from functools import partial
+
+from langcodes import standardize_tag
+
+from lighteval.metrics.dynamic_metrics import (
+ LogLikelihoodAccMetric,
+)
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.adapters import (
+ winogrand_adapter,
+)
+from lighteval.tasks.templates.continuation import get_continuation_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+ CFFormulation,
+ HybridFormulation,
+ MCFFormulation,
+)
+from lighteval.utils.language import Language
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"xwinograd_{language.value}_{formulation.name.lower()}",
+ suite=("lighteval",),
+ prompt_function=get_continuation_prompt_function(
+ language, partial(winogrand_adapter, language), formulation=formulation
+ ),
+ hf_repo="Muennighoff/xwinograd",
+ hf_subset=standardize_tag(language.value) if language != Language.JAPANESE else "jp",
+ evaluation_splits=("test",),
+ hf_avail_splits=["test"],
+ metrics=[
+ LogLikelihoodAccMetric(normalization=None),
+ LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+ LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+ ],
+ )
+ for language in [
+ Language.ENGLISH,
+ Language.FRENCH,
+ Language.JAPANESE,
+ Language.PORTUGUESE,
+ Language.RUSSIAN,
+ Language.CHINESE,
+ ]
+ for formulation in [
+ MCFFormulation(),
+ CFFormulation(),
+ HybridFormulation(),
+ ]
+]
diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py
index 95914991c..cabde57be 100644
--- a/src/lighteval/tasks/registry.py
+++ b/src/lighteval/tasks/registry.py
@@ -28,13 +28,12 @@
import logging
import os
import sys
+import time
from functools import lru_cache
from itertools import groupby
from pathlib import Path
from types import ModuleType
-import lighteval.tasks.default_tasks as default_tasks
-from lighteval.tasks.extended import AVAILABLE_EXTENDED_TASKS_MODULES
from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig
@@ -114,10 +113,8 @@ class Registry:
def __init__(
self,
tasks: str | Path | None = None,
- custom_tasks: str | Path | ModuleType | None = None,
- load_community: bool = False,
- load_extended: bool = False,
load_multilingual: bool = False,
+ custom_tasks: str | Path | ModuleType | None = None,
):
"""
Initialize the Registry class.
@@ -130,8 +127,6 @@ def __init__(
- A Path object pointing to a custom tasks file
- A module object containing custom task configurations
- None for default behavior (no custom tasks)
- load_community: Whether to load community-contributed tasks.
- load_extended: Whether to load extended tasks with custom logic.
load_multilingual: Whether to load multilingual tasks.
Each custom task module should contain a TASKS_TABLE exposing
@@ -146,8 +141,6 @@ def __init__(
)
]
"""
- self._custom_tasks = custom_tasks
-
if tasks is None:
logger.warning(
"You passed no task name. This should only occur if you are using the CLI to inspect tasks."
@@ -155,16 +148,10 @@ def __init__(
self.tasks_list = []
else:
self.tasks_list = self._get_full_task_list_from_input_string(tasks)
- # These parameters are dynamically set by the task names provided, thanks to `activate_suites_to_load`,
- # except in the `tasks` CLI command to display the full list
- self._load_community = load_community
- self._load_extended = load_extended
- self._load_multilingual = load_multilingual
- self._activate_loading_of_optional_suite() # we dynamically set the loading parameters
-
- # We load all task to
- self._task_registry = self._load_full_registry()
+ self._task_registry = Registry.load_all_task_configs(
+ custom_tasks=custom_tasks, load_multilingual=load_multilingual
+ )
self.task_to_configs = self._update_task_configs()
def _get_full_task_list_from_input_string(self, tasks: str | Path) -> list[str]:
@@ -175,21 +162,7 @@ def _get_full_task_list_from_input_string(self, tasks: str | Path) -> list[str]:
else:
tasks_list = tasks.split(",")
- # We might have tasks provided as task groups in the custom tasks
- # We load the whole task_groups mapping
- if self._custom_tasks is None:
- task_groups = {}
- else:
- custom_tasks_module = Registry.create_custom_tasks_module(custom_tasks=self._custom_tasks)
- tasks_group_dict = {}
- if hasattr(custom_tasks_module, "TASKS_GROUPS"):
- tasks_group_dict = custom_tasks_module.TASKS_GROUPS
-
- # We should allow defining task groups as comma-separated strings or lists of tasks
- task_groups = {k: v if isinstance(v, list) else v.split(",") for k, v in tasks_group_dict.items()}
-
- # Then link actual task_group to task list if needed
- # (At this point the strings are either task name/superset name or group names)
+ task_groups = {}
expanded_tasks_list: list[str] = []
for maybe_task_group in tasks_list:
# We either expand the group (in case it's a group name), or we keep it as is (in case it's a task name or superset name)
@@ -203,76 +176,6 @@ def _get_full_task_list_from_input_string(self, tasks: str | Path) -> list[str]:
return expanded_tasks_list
- def _activate_loading_of_optional_suite(self) -> None:
- """Dynamically selects which of the optional suite we want to load."""
- suites = {task.split("|")[0] for task in self.tasks_list}
-
- for suite_name in suites:
- if suite_name not in DEFAULT_SUITES:
- logger.warning(
- f"Suite {suite_name} unknown. This is not normal, unless you are testing adding new evaluations."
- )
-
- if "extended" in suites:
- self._load_extended = True
- if "multilingual" in suites:
- self._load_multilingual = True
- if "community" in suites:
- self._load_community = True
-
- def _load_full_registry(self) -> dict[str, LightevalTaskConfig]:
- """
- Returns:
- dict[str, LightevalTaskConfig]: A dictionary mapping task names (suite|task) to their corresponding LightevalTask classes.
-
- Example:
- {
- "lighteval|arc_easy": LightevalTaskConfig(name="arc_easy", suite="lighteval", ...),
- }
- """
- custom_tasks_registry = {}
- custom_tasks_module = []
- custom_task_configs = []
-
- if self._custom_tasks is not None:
- custom_tasks_module.append(Registry.create_custom_tasks_module(custom_tasks=self._custom_tasks))
-
- # Need to load extended tasks
- if self._load_extended:
- for extended_task_module in AVAILABLE_EXTENDED_TASKS_MODULES:
- custom_tasks_module.append(extended_task_module)
-
- # Need to load community tasks
- if self._load_community:
- community_modules = load_community_tasks()
- for community_task_module in community_modules:
- custom_tasks_module.append(community_task_module)
-
- # Need to load multilingual tasks
- if self._load_multilingual:
- import lighteval.tasks.multilingual.tasks as multilingual_tasks
-
- custom_tasks_module.append(multilingual_tasks)
-
- # We load all
- for module in custom_tasks_module:
- custom_task_configs.extend(module.TASKS_TABLE)
- logger.info(f"Found {len(module.TASKS_TABLE)} custom tasks in {module.__file__}")
-
- if len(custom_task_configs) > 0:
- custom_tasks_registry = Registry.create_task_config_dict(meta_table=custom_task_configs)
-
- default_tasks_registry = Registry.create_task_config_dict()
-
- # Check the overlap between default_tasks_registry and custom_tasks_registry
- intersection = set(default_tasks_registry.keys()).intersection(set(custom_tasks_registry.keys()))
- if len(intersection) > 0:
- logger.warning(
- f"Following tasks ({intersection}) exists both in the default and custom tasks. Will use the custom ones on conflict."
- )
-
- return {**default_tasks_registry, **custom_tasks_registry}
-
def _update_task_configs(self) -> dict[str, LightevalTaskConfig]: # noqa: C901
"""
Updates each config depending on the input tasks (we replace all provided params, like few shot number, sampling params, etc)
@@ -401,26 +304,68 @@ def create_custom_tasks_module(custom_tasks: str | Path | ModuleType) -> ModuleT
return importlib.import_module(str(custom_tasks))
@staticmethod
- def create_task_config_dict(meta_table: list[LightevalTaskConfig] | None = None) -> dict[str, LightevalTaskConfig]:
- """Create configuration tasks based on the provided meta_table.
+ def _extract_configs(module: ModuleType) -> dict[str, LightevalTaskConfig]:
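+        # Tasks are keyed as "<suite>|<name>" using the first declared suite only;
+        # the previous registry registered a config under every known suite it listed.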
+ configs = {}
+ if hasattr(module, "TASKS_TABLE"):
+ for config in getattr(module, "TASKS_TABLE"):
+ configs[f"{config.suite[0]}|{config.name}"] = config
+ return configs
- Args:
- meta_table: meta_table containing tasks
- configurations. If not provided, it will be loaded from TABLE_PATH.
+ @staticmethod
+ def _load_from_files(files: list[Path], module_prefix: str) -> dict[str, LightevalTaskConfig]:
+ configs = {}
+ for task_file in files:
+ module_name = task_file.stem
+ module = importlib.import_module(f"{module_prefix}.{module_name}")
+ configs.update(Registry._extract_configs(module))
+ return configs
- Returns:
- Dict[str, LightevalTaskConfig]: A dictionary of task names mapped to their corresponding LightevalTaskConfig.
- """
- if meta_table is None:
- meta_table = [config for config in vars(default_tasks).values() if isinstance(config, LightevalTaskConfig)]
+ @staticmethod
+ def _load_from_subdirs(subdirs: list[Path]) -> dict[str, LightevalTaskConfig]:
+ configs = {}
+ for task_dir in subdirs:
+ module_name = task_dir.name
+ module = importlib.import_module(f"lighteval.tasks.tasks.{module_name}.main")
+ configs.update(Registry._extract_configs(module))
+ return configs
- tasks_with_config: dict[str, LightevalTaskConfig] = {}
- for config in meta_table:
- for suite in config.suite:
- if suite in DEFAULT_SUITES:
- tasks_with_config[f"{suite}|{config.name}"] = config
+ @staticmethod
+ def load_all_task_configs(
+ custom_tasks: str | Path | None = None, load_multilingual: bool = False
+ ) -> dict[str, LightevalTaskConfig]:
+ """Load all LightevalTaskConfig objects from all Python files in the tasks/ directory."""
+ time_start = time.perf_counter()
+ # Get the tasks directory
+ TASKS_DIR = Path(__file__).parent / "tasks"
+ TASKS_DIR_MULTILINGUAL = Path(__file__).parent / "multilingual" / "tasks"
+ loaded_configs = {}
+
+ # Get all Python files in the tasks directory (excluding __init__.py)
+ task_files = [f for f in TASKS_DIR.glob("*.py") if f.name != "__init__.py"]
+ task_files_multilingual = [f for f in TASKS_DIR_MULTILINGUAL.glob("*.py") if f.name != "__init__.py"]
+
+ # Also get all subdirectories with main.py files
+ task_subdirs = [d for d in TASKS_DIR.iterdir() if d.is_dir() and (d / "main.py").exists()]
+
+ loaded_configs.update(Registry._load_from_files(task_files, "lighteval.tasks.tasks"))
+ if load_multilingual:
+ loaded_configs.update(
+ Registry._load_from_files(task_files_multilingual, "lighteval.tasks.multilingual.tasks")
+ )
+ loaded_configs.update(Registry._load_from_subdirs(task_subdirs))
+
+ if custom_tasks is not None:
+ custom_tasks_module = Registry.create_custom_tasks_module(custom_tasks)
+ custom_tasks_configs = Registry._extract_configs(custom_tasks_module)
+ if set(custom_tasks_configs.keys()) & set(loaded_configs.keys()):
+ raise ValueError(
+ f"Custom tasks {custom_tasks} conflict with built-in tasks, please use a different name. Conflicting tasks: {set(custom_tasks_configs.keys()) & set(loaded_configs.keys())}"
+ )
+ loaded_configs.update(custom_tasks_configs)
- return tasks_with_config
+ time_end = time.perf_counter()
+ logger.info(f"Loaded {len(loaded_configs)} task configs in {time_end - time_start:.1f} seconds")
+ return loaded_configs
def print_all_tasks(self, suites: str | None = None):
"""Print all the tasks in the task registry.
diff --git a/src/lighteval/tasks/tasks/agieval.py b/src/lighteval/tasks/tasks/agieval.py
new file mode 100644
index 000000000..1f6f6f3d2
--- /dev/null
+++ b/src/lighteval/tasks/tasks/agieval.py
@@ -0,0 +1,356 @@
+"""
+name:
+Agieval
+
+dataset:
+dmayhem93/agieval-aqua-rat, dmayhem93/agieval-gaokao-biology, dmayhem93/agieval-gaokao-chemistry, dmayhem93/agieval-gaokao-chinese, dmayhem93/agieval-gaokao-english, dmayhem93/agieval-gaokao-geography, dmayhem93/agieval-gaokao-history, dmayhem93/agieval-gaokao-mathqa, dmayhem93/agieval-gaokao-physics, dmayhem93/agieval-logiqa-en, dmayhem93/agieval-logiqa-zh, dmayhem93/agieval-lsat-ar, dmayhem93/agieval-lsat-lr, dmayhem93/agieval-lsat-rc, dmayhem93/agieval-sat-en, dmayhem93/agieval-sat-en-without-passage, dmayhem93/agieval-sat-math
+
+abstract:
+AGIEval is a human-centric benchmark specifically designed to evaluate the
+general abilities of foundation models in tasks pertinent to human cognition and
+problem-solving. This benchmark is derived from 20 official, public, and
+high-standard admission and qualification exams intended for general human
+test-takers, such as general college admission tests (e.g., Chinese College
+Entrance Exam (Gaokao) and American SAT), law school admission tests, math
+competitions, lawyer qualification tests, and national civil service exams.
+
+languages:
+english, chinese
+
+tags:
+biology, chemistry, geography, history, knowledge, language, multiple-choice, physics, reasoning
+
+paper:
+https://arxiv.org/abs/2304.06364
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+agieval_aqua_rat = LightevalTaskConfig(
+ name="agieval:aqua-rat",
+ suite=["lighteval"],
+ prompt_function=prompt.agieval,
+ hf_repo="dmayhem93/agieval-aqua-rat",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=None,
+ version=0,
+)
+
+agieval_gaokao_biology = LightevalTaskConfig(
+ name="agieval:gaokao-biology",
+ suite=["lighteval"],
+ prompt_function=prompt.agieval,
+ hf_repo="dmayhem93/agieval-gaokao-biology",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=None,
+ version=0,
+)
+
+agieval_gaokao_chemistry = LightevalTaskConfig(
+ name="agieval:gaokao-chemistry",
+ suite=["lighteval"],
+ prompt_function=prompt.agieval,
+ hf_repo="dmayhem93/agieval-gaokao-chemistry",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=None,
+ version=0,
+)
+
+agieval_gaokao_chinese = LightevalTaskConfig(
+ name="agieval:gaokao-chinese",
+ suite=["lighteval"],
+ prompt_function=prompt.agieval,
+ hf_repo="dmayhem93/agieval-gaokao-chinese",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=None,
+ version=0,
+)
+
+agieval_gaokao_english = LightevalTaskConfig(
+ name="agieval:gaokao-english",
+ suite=["lighteval"],
+ prompt_function=prompt.agieval,
+ hf_repo="dmayhem93/agieval-gaokao-english",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=None,
+ version=0,
+)
+
+agieval_gaokao_geography = LightevalTaskConfig(
+ name="agieval:gaokao-geography",
+ suite=["lighteval"],
+ prompt_function=prompt.agieval,
+ hf_repo="dmayhem93/agieval-gaokao-geography",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=None,
+ version=0,
+)
+
+agieval_gaokao_history = LightevalTaskConfig(
+ name="agieval:gaokao-history",
+ suite=["lighteval"],
+ prompt_function=prompt.agieval,
+ hf_repo="dmayhem93/agieval-gaokao-history",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=None,
+ version=0,
+)
+
+agieval_gaokao_mathqa = LightevalTaskConfig(
+ name="agieval:gaokao-mathqa",
+ suite=["lighteval"],
+ prompt_function=prompt.agieval,
+ hf_repo="dmayhem93/agieval-gaokao-mathqa",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=None,
+ version=0,
+)
+
+agieval_gaokao_physics = LightevalTaskConfig(
+ name="agieval:gaokao-physics",
+ suite=["lighteval"],
+ prompt_function=prompt.agieval,
+ hf_repo="dmayhem93/agieval-gaokao-physics",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=None,
+ version=0,
+)
+
+agieval_logiqa_en = LightevalTaskConfig(
+ name="agieval:logiqa-en",
+ suite=["lighteval"],
+ prompt_function=prompt.agieval,
+ hf_repo="dmayhem93/agieval-logiqa-en",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=None,
+ version=0,
+)
+
+agieval_logiqa_zh = LightevalTaskConfig(
+ name="agieval:logiqa-zh",
+ suite=["lighteval"],
+ prompt_function=prompt.agieval,
+ hf_repo="dmayhem93/agieval-logiqa-zh",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=None,
+ version=0,
+)
+
+agieval_lsat_ar = LightevalTaskConfig(
+ name="agieval:lsat-ar",
+ suite=["lighteval"],
+ prompt_function=prompt.agieval,
+ hf_repo="dmayhem93/agieval-lsat-ar",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=None,
+ version=0,
+)
+
+agieval_lsat_lr = LightevalTaskConfig(
+ name="agieval:lsat-lr",
+ suite=["lighteval"],
+ prompt_function=prompt.agieval,
+ hf_repo="dmayhem93/agieval-lsat-lr",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=None,
+ version=0,
+)
+
+agieval_lsat_rc = LightevalTaskConfig(
+ name="agieval:lsat-rc",
+ suite=["lighteval"],
+ prompt_function=prompt.agieval,
+ hf_repo="dmayhem93/agieval-lsat-rc",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=None,
+ version=0,
+)
+
+agieval_sat_en = LightevalTaskConfig(
+ name="agieval:sat-en",
+ suite=["lighteval"],
+ prompt_function=prompt.agieval,
+ hf_repo="dmayhem93/agieval-sat-en",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=None,
+ version=0,
+)
+
+agieval_sat_en_without_passage = LightevalTaskConfig(
+ name="agieval:sat-en-without-passage",
+ suite=["lighteval"],
+ prompt_function=prompt.agieval,
+ hf_repo="dmayhem93/agieval-sat-en-without-passage",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=None,
+ version=0,
+)
+
+agieval_sat_math = LightevalTaskConfig(
+ name="agieval:sat-math",
+ suite=["lighteval"],
+ prompt_function=prompt.agieval,
+ hf_repo="dmayhem93/agieval-sat-math",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=None,
+ version=0,
+)
+
+TASKS_TABLE = [
+ agieval_aqua_rat,
+ agieval_gaokao_biology,
+ agieval_gaokao_chemistry,
+ agieval_gaokao_chinese,
+ agieval_gaokao_english,
+ agieval_gaokao_geography,
+ agieval_gaokao_history,
+ agieval_gaokao_mathqa,
+ agieval_gaokao_physics,
+ agieval_logiqa_en,
+ agieval_logiqa_zh,
+ agieval_lsat_ar,
+ agieval_lsat_lr,
+ agieval_lsat_rc,
+ agieval_sat_en,
+ agieval_sat_en_without_passage,
+ agieval_sat_math,
+]
diff --git a/src/lighteval/tasks/tasks/aime.py b/src/lighteval/tasks/tasks/aime.py
new file mode 100644
index 000000000..ac82a00eb
--- /dev/null
+++ b/src/lighteval/tasks/tasks/aime.py
@@ -0,0 +1,127 @@
+"""
+name:
+Aime
+
+dataset:
+HuggingFaceH4/aime_2024, yentinglin/aime_2025
+
+abstract:
+The American Invitational Mathematics Examination (AIME) is a prestigious,
+invite-only mathematics competition for high-school students who perform in the
+top 5% of the AMC 12 mathematics exam. It involves 15 questions of increasing
+difficulty, with the answer to every question being a single integer from 0 to
+999. The median score is historically between 4 and 6 questions correct (out of
+the 15 possible). Two versions of the test are given every year (thirty
+questions total).
+
+languages:
+english
+
+tags:
+math, reasoning
+
+paper:
+https://maa.org/aime-thresholds-are-available/
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+aime24 = LightevalTaskConfig(
+ name="aime24",
+ suite=["lighteval"],
+ prompt_function=prompt.aime_prompt_fn,
+ hf_repo="HuggingFaceH4/aime_2024",
+ hf_subset="default",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.pass_at_k_math(sample_params={"k": 1}), Metrics.avg_at_n_math(sample_params={"n": 1})],
+ version=2,
+)
+
+aime24_avg = LightevalTaskConfig(
+ name="aime24_avg",
+ suite=["lighteval"],
+ prompt_function=prompt.aime_prompt_fn,
+ hf_repo="HuggingFaceH4/aime_2024",
+ hf_subset="default",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.avg_at_n_math(sample_params={"n": 64})],
+ version=2,
+)
+
+aime24_gpassk = LightevalTaskConfig(
+ name="aime24_gpassk",
+ suite=["lighteval"],
+ prompt_function=prompt.aime_prompt_fn,
+ hf_repo="HuggingFaceH4/aime_2024",
+ hf_subset="default",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.g_pass_at_k_math(sample_params={"k": 16, "n": 48})],
+ version=1,
+)
+
+aime25 = LightevalTaskConfig(
+ name="aime25",
+ suite=["lighteval"],
+ prompt_function=prompt.aime_prompt_fn,
+ hf_repo="yentinglin/aime_2025",
+ hf_subset="default",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1}), Metrics.avg_at_n_math(sample_params={"n": 1})],
+ version=2,
+)
+
+aime25_avg = LightevalTaskConfig(
+ name="aime25_avg",
+ suite=["lighteval"],
+ prompt_function=prompt.aime_prompt_fn,
+ hf_repo="yentinglin/aime_2025",
+ hf_subset="default",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.avg_at_n_math(sample_params={"n": 64})],
+ version=2,
+)
+
+aime25_gpassk = LightevalTaskConfig(
+ name="aime25_gpassk",
+ suite=["lighteval"],
+ prompt_function=prompt.aime_prompt_fn,
+ hf_repo="yentinglin/aime_2025",
+ hf_subset="default",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.g_pass_at_k_math(sample_params={"k": 16, "n": 48})],
+ version=1,
+)
+
+TASKS_TABLE = [
+    aime24,
+    aime24_avg,
+    aime24_gpassk,
+    aime25,
+    aime25_avg,
+    aime25_gpassk,
+]
diff --git a/src/lighteval/tasks/tasks/aimo.py b/src/lighteval/tasks/tasks/aimo.py
new file mode 100644
index 000000000..615e26ffa
--- /dev/null
+++ b/src/lighteval/tasks/tasks/aimo.py
@@ -0,0 +1,53 @@
+"""
+name:
+AIMO Progress Prize 1
+
+dataset:
+https://www.kaggle.com/competitions/ai-mathematical-olympiad-prize
+
+abstract:
+Task to evaluate LLMs on the training set of the Kaggle AIMO (AI Mathematical
+Olympiad) Progress Prize 1 competition.
+
+languages:
+english
+
+tags:
+math, reasoning
+
+paper:
+"""
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import math_normalizer
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+
+def aimo_prompt(line, task_name: str = None):
+ return Doc(
+ task_name=task_name,
+ choices=[str(line["answer"])],
+ gold_index=0,
+ query=line["problem"],
+ )
+
+
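+# Each problem is scored by exact match against its single gold answer
+# (choices=[str(answer)], gold_index=0), with math_normalizer applied to both the
+# gold answer and the model output.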
+task = LightevalTaskConfig(
+ name="aimo_progress_prize_1",
+ prompt_function=aimo_prompt,
+ suite=["community"],
+ hf_subset="",
+ hf_repo="lighteval/aimo_progress_prize_1",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split="train",
+ few_shots_select="sequential",
+ metrics=[
+ Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer})
+ ],
+ generation_size=2048,
+ stop_sequence=None,
+)
+
+TASKS_TABLE = [task]
diff --git a/src/lighteval/tasks/tasks/anli.py b/src/lighteval/tasks/tasks/anli.py
new file mode 100644
index 000000000..86ea842b1
--- /dev/null
+++ b/src/lighteval/tasks/tasks/anli.py
@@ -0,0 +1,84 @@
+"""
+name:
+Anli
+
+dataset:
+facebook/anli
+
+abstract:
+Adversarial Natural Language Inference (ANLI) is a large-scale NLI benchmark
+dataset collected via an iterative, adversarial human-and-model-in-the-loop
+procedure. ANLI is much more difficult than its predecessors, including SNLI and
+MNLI. It contains three rounds, each with train/dev/test splits.
+
+languages:
+english
+
+tags:
+nli, reasoning
+
+paper:
+https://arxiv.org/abs/1910.14599
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+anli_r1 = LightevalTaskConfig(
+ name="anli:r1",
+ suite=["lighteval"],
+ prompt_function=prompt.anli,
+ hf_repo="facebook/anli",
+ hf_subset="plain_text",
+ hf_avail_splits=["train_r1", "dev_r1", "test_r1"],
+ evaluation_splits=["test_r1"],
+ few_shots_split="train_r1",
+ few_shots_select="random_sampling_from_train",
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+anli_r2 = LightevalTaskConfig(
+ name="anli:r2",
+ suite=["lighteval"],
+ prompt_function=prompt.anli,
+ hf_repo="facebook/anli",
+ hf_subset="plain_text",
+ hf_avail_splits=["train_r2", "dev_r2", "test_r2"],
+ evaluation_splits=["test_r2"],
+ few_shots_split="train_r2",
+ few_shots_select="random_sampling_from_train",
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+anli_r3 = LightevalTaskConfig(
+ name="anli:r3",
+ suite=["lighteval"],
+ prompt_function=prompt.anli,
+ hf_repo="facebook/anli",
+ hf_subset="plain_text",
+ hf_avail_splits=["train_r3", "dev_r3", "test_r3"],
+ evaluation_splits=["test_r3"],
+ few_shots_split="train_r3",
+ few_shots_select="random_sampling_from_train",
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ anli_r1,
+ anli_r2,
+ anli_r3,
+]
diff --git a/src/lighteval/tasks/tasks/arc.py b/src/lighteval/tasks/tasks/arc.py
new file mode 100644
index 000000000..25c7d3464
--- /dev/null
+++ b/src/lighteval/tasks/tasks/arc.py
@@ -0,0 +1,66 @@
+"""
+name:
+Arc
+
+dataset:
+allenai/ai2_arc
+
+abstract:
+7,787 genuine grade-school level, multiple-choice science questions, assembled
+to encourage research in advanced question-answering. The dataset is partitioned
+into a Challenge Set and an Easy Set, where the former contains only questions
+answered incorrectly by both a retrieval-based algorithm and a word
+co-occurrence algorithm.
+
+languages:
+english
+
+tags:
+multiple-choice
+
+paper:
+https://arxiv.org/abs/1803.05457
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+arc_challenge = LightevalTaskConfig(
+ name="arc:challenge",
+ suite=["lighteval"],
+ prompt_function=prompt.arc,
+ hf_repo="allenai/ai2_arc",
+ hf_subset="ARC-Challenge",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling_from_train",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+arc_easy = LightevalTaskConfig(
+ name="arc:easy",
+ suite=["lighteval"],
+ prompt_function=prompt.arc,
+ hf_repo="allenai/ai2_arc",
+ hf_subset="ARC-Easy",
+ hf_avail_splits=["train", "validation", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling_from_train",
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [arc_challenge, arc_easy]
diff --git a/src/lighteval/tasks/tasks/arc_agi_2.py b/src/lighteval/tasks/tasks/arc_agi_2.py
new file mode 100644
index 000000000..6e6302a44
--- /dev/null
+++ b/src/lighteval/tasks/tasks/arc_agi_2.py
@@ -0,0 +1,52 @@
+"""
+name:
+ArcAgi 2
+
+dataset:
+arc-agi-community/arc-agi-2
+
+abstract:
+Each ARC-AGI task presents three to five input/output example grids followed by
+a final input for which the output must be produced. Each task tests the use of
+a specific learned skill based on a minimal number of cognitive priors.
+In their native form, tasks are JSON lists of integers. These can also be
+represented visually as grids of colors using an ARC-AGI task viewer.
+A successful submission is a pixel-perfect description (color and position) of
+the final task's output.
+Every task in the ARC-AGI-2 dataset was solved by at least two people in at most
+two attempts (many were solved by more), yet the set remains considerably more
+difficult for AI.
+
+languages:
+english
+
+tags:
+multiple-choice
+
+paper:
+https://arcprize.org/guide
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+arc_agi_2 = LightevalTaskConfig(
+ name="arc_agi_2",
+ suite=["lighteval"],
+ prompt_function=prompt.arc_agi_2,
+ hf_repo="arc-agi-community/arc-agi-2",
+ hf_subset="default",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=2048,
+ metrics=[Metrics.exact_match],
+ stop_sequence=None,
+ version=0,
+)
+
+TASKS_TABLE = [arc_agi_2]
diff --git a/src/lighteval/tasks/tasks/arithmetic.py b/src/lighteval/tasks/tasks/arithmetic.py
new file mode 100644
index 000000000..d1e6b6107
--- /dev/null
+++ b/src/lighteval/tasks/tasks/arithmetic.py
@@ -0,0 +1,198 @@
+"""
+name:
+Arithmetic
+
+dataset:
+EleutherAI/arithmetic
+
+abstract:
+A small battery of 10 tests that involve asking language models a simple
+arithmetic problem in natural language.
+
+languages:
+english
+
+tags:
+math, reasoning
+
+paper:
+https://arxiv.org/abs/2005.14165
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+arithmetic_1dc = LightevalTaskConfig(
+ name="arithmetic:1dc",
+ suite=["lighteval"],
+ prompt_function=prompt.arithmetic,
+ hf_repo="EleutherAI/arithmetic",
+ hf_subset="arithmetic_1dc",
+ hf_avail_splits=["validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=256,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+arithmetic_2da = LightevalTaskConfig(
+ name="arithmetic:2da",
+ suite=["lighteval"],
+ prompt_function=prompt.arithmetic,
+ hf_repo="EleutherAI/arithmetic",
+ hf_subset="arithmetic_2da",
+ hf_avail_splits=["validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=256,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+arithmetic_2dm = LightevalTaskConfig(
+ name="arithmetic:2dm",
+ suite=["lighteval"],
+ prompt_function=prompt.arithmetic,
+ hf_repo="EleutherAI/arithmetic",
+ hf_subset="arithmetic_2dm",
+ hf_avail_splits=["validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=256,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+arithmetic_2ds = LightevalTaskConfig(
+ name="arithmetic:2ds",
+ suite=["lighteval"],
+ prompt_function=prompt.arithmetic,
+ hf_repo="EleutherAI/arithmetic",
+ hf_subset="arithmetic_2ds",
+ hf_avail_splits=["validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=256,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+arithmetic_3da = LightevalTaskConfig(
+ name="arithmetic:3da",
+ suite=["lighteval"],
+ prompt_function=prompt.arithmetic,
+ hf_repo="EleutherAI/arithmetic",
+ hf_subset="arithmetic_3da",
+ hf_avail_splits=["validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=256,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+arithmetic_3ds = LightevalTaskConfig(
+ name="arithmetic:3ds",
+ suite=["lighteval"],
+ prompt_function=prompt.arithmetic,
+ hf_repo="EleutherAI/arithmetic",
+ hf_subset="arithmetic_3ds",
+ hf_avail_splits=["validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=256,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+arithmetic_4da = LightevalTaskConfig(
+ name="arithmetic:4da",
+ suite=["lighteval"],
+ prompt_function=prompt.arithmetic,
+ hf_repo="EleutherAI/arithmetic",
+ hf_subset="arithmetic_4da",
+ hf_avail_splits=["validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=256,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+arithmetic_4ds = LightevalTaskConfig(
+ name="arithmetic:4ds",
+ suite=["lighteval"],
+ prompt_function=prompt.arithmetic,
+ hf_repo="EleutherAI/arithmetic",
+ hf_subset="arithmetic_4ds",
+ hf_avail_splits=["validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=256,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+arithmetic_5da = LightevalTaskConfig(
+ name="arithmetic:5da",
+ suite=["lighteval"],
+ prompt_function=prompt.arithmetic,
+ hf_repo="EleutherAI/arithmetic",
+ hf_subset="arithmetic_5da",
+ hf_avail_splits=["validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=256,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+arithmetic_5ds = LightevalTaskConfig(
+ name="arithmetic:5ds",
+ suite=["lighteval"],
+ prompt_function=prompt.arithmetic,
+ hf_repo="EleutherAI/arithmetic",
+ hf_subset="arithmetic_5ds",
+ hf_avail_splits=["validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=256,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ arithmetic_1dc,
+ arithmetic_2da,
+ arithmetic_2dm,
+ arithmetic_2ds,
+ arithmetic_3da,
+ arithmetic_3ds,
+ arithmetic_4da,
+ arithmetic_4ds,
+ arithmetic_5da,
+ arithmetic_5ds,
+]
diff --git a/src/lighteval/tasks/tasks/asdiv.py b/src/lighteval/tasks/tasks/asdiv.py
new file mode 100644
index 000000000..e7141449d
--- /dev/null
+++ b/src/lighteval/tasks/tasks/asdiv.py
@@ -0,0 +1,43 @@
+"""
+name:
+Asdiv
+
+dataset:
+EleutherAI/asdiv
+
+abstract:
+ASDiv is a dataset for arithmetic reasoning that contains 2,000+ questions
+covering addition, subtraction, multiplication, and division.
+
+languages:
+english
+
+tags:
+math, reasoning
+
+paper:
+https://arxiv.org/abs/2410.12853
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+asdiv = LightevalTaskConfig(
+ name="asdiv",
+ suite=["lighteval"],
+ prompt_function=prompt.asdiv,
+ hf_repo="EleutherAI/asdiv",
+ hf_subset="asdiv",
+ hf_avail_splits=["validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [asdiv]
diff --git a/src/lighteval/tasks/tasks/babi_qa.py b/src/lighteval/tasks/tasks/babi_qa.py
new file mode 100644
index 000000000..5ade7cb23
--- /dev/null
+++ b/src/lighteval/tasks/tasks/babi_qa.py
@@ -0,0 +1,43 @@
+"""
+name:
+bAbI QA
+
+dataset:
+facebook/babi_qa
+
+abstract:
+The bAbI benchmark measures text understanding and reasoning, evaluating
+reading comprehension via question answering.
+
+languages:
+english
+
+tags:
+qa, reasoning
+
+paper:
+https://arxiv.org/abs/1502.05698
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
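+# bAbI is configured here on its en-valid-qa1 subset; generations are scored
+# with exact match on the validation and test splits.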
+babi_qa = LightevalTaskConfig(
+ name="babi_qa",
+ suite=["lighteval"],
+ prompt_function=prompt.babi_qa,
+ hf_repo="facebook/babi_qa",
+ hf_subset="en-valid-qa1",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [babi_qa]
diff --git a/src/lighteval/tasks/tasks/bbq.py b/src/lighteval/tasks/tasks/bbq.py
new file mode 100644
index 000000000..3b58f2a91
--- /dev/null
+++ b/src/lighteval/tasks/tasks/bbq.py
@@ -0,0 +1,232 @@
+"""
+name:
+BBQ
+
+dataset:
+lighteval/bbq_helm
+
+abstract:
+The Bias Benchmark for Question Answering (BBQ) measures social bias in
+question answering in both ambiguous and unambiguous contexts.
+
+languages:
+english
+
+tags:
+bias, multiple-choice, qa
+
+paper:
+https://arxiv.org/abs/2110.08193
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
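+# The "bbq" config evaluates all bias categories together (hf_subset="all");
+# the configs that follow evaluate each BBQ category subset on its own, using
+# the same prompt function, exact-match metric, and test split.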
+bbq = LightevalTaskConfig(
+ name="bbq",
+ suite=["lighteval"],
+ prompt_function=prompt.bbq,
+ hf_repo="lighteval/bbq_helm",
+ hf_subset="all",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+bbq_Age = LightevalTaskConfig(
+ name="bbq:Age",
+ suite=["lighteval"],
+ prompt_function=prompt.bbq,
+ hf_repo="lighteval/bbq_helm",
+ hf_subset="Age",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+bbq_Disability_status = LightevalTaskConfig(
+ name="bbq:Disability_status",
+ suite=["lighteval"],
+ prompt_function=prompt.bbq,
+ hf_repo="lighteval/bbq_helm",
+ hf_subset="Disability_status",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+bbq_Gender_identity = LightevalTaskConfig(
+ name="bbq:Gender_identity",
+ suite=["lighteval"],
+ prompt_function=prompt.bbq,
+ hf_repo="lighteval/bbq_helm",
+ hf_subset="Gender_identity",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+bbq_Nationality = LightevalTaskConfig(
+ name="bbq:Nationality",
+ suite=["lighteval"],
+ prompt_function=prompt.bbq,
+ hf_repo="lighteval/bbq_helm",
+ hf_subset="Nationality",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+bbq_Physical_appearance = LightevalTaskConfig(
+ name="bbq:Physical_appearance",
+ suite=["lighteval"],
+ prompt_function=prompt.bbq,
+ hf_repo="lighteval/bbq_helm",
+ hf_subset="Physical_appearance",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+bbq_Race_ethnicity = LightevalTaskConfig(
+ name="bbq:Race_ethnicity",
+ suite=["lighteval"],
+ prompt_function=prompt.bbq,
+ hf_repo="lighteval/bbq_helm",
+ hf_subset="Race_ethnicity",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+bbq_Race_x_SES = LightevalTaskConfig(
+ name="bbq:Race_x_SES",
+ suite=["lighteval"],
+ prompt_function=prompt.bbq,
+ hf_repo="lighteval/bbq_helm",
+ hf_subset="Race_x_SES",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+bbq_Race_x_gender = LightevalTaskConfig(
+ name="bbq:Race_x_gender",
+ suite=["lighteval"],
+ prompt_function=prompt.bbq,
+ hf_repo="lighteval/bbq_helm",
+ hf_subset="Race_x_gender",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+bbq_Religion = LightevalTaskConfig(
+ name="bbq:Religion",
+ suite=["lighteval"],
+ prompt_function=prompt.bbq,
+ hf_repo="lighteval/bbq_helm",
+ hf_subset="Religion",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+bbq_SES = LightevalTaskConfig(
+ name="bbq:SES",
+ suite=["lighteval"],
+ prompt_function=prompt.bbq,
+ hf_repo="lighteval/bbq_helm",
+ hf_subset="SES",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+bbq_Sexual_orientation = LightevalTaskConfig(
+ name="bbq:Sexual_orientation",
+ suite=["lighteval"],
+ prompt_function=prompt.bbq,
+ hf_repo="lighteval/bbq_helm",
+ hf_subset="Sexual_orientation",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ bbq,
+ bbq_Age,
+ bbq_Disability_status,
+ bbq_Gender_identity,
+ bbq_Nationality,
+ bbq_Physical_appearance,
+ bbq_Race_ethnicity,
+ bbq_Race_x_SES,
+ bbq_Race_x_gender,
+ bbq_Religion,
+ bbq_SES,
+ bbq_Sexual_orientation,
+]
diff --git a/src/lighteval/tasks/tasks/bigbench.py b/src/lighteval/tasks/tasks/bigbench.py
new file mode 100644
index 000000000..8d3c62d26
--- /dev/null
+++ b/src/lighteval/tasks/tasks/bigbench.py
@@ -0,0 +1,2746 @@
+"""
+name:
+BIG-bench
+
+dataset:
+tasksource/bigbench
+
+abstract:
+Beyond the Imitation Game: Quantifying and extrapolating the capabilities of
+language models. This module covers 166 tasks from the BIG-bench benchmark.
+
+languages:
+english
+
+tags:
+reasoning
+
+paper:
+https://arxiv.org/abs/2206.04615
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
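+# Each config below points at one subset of tasksource/bigbench and is
+# evaluated on the "default" split. Multiple-choice subtasks use loglikelihood
+# accuracy, generative subtasks use exact match (sometimes alongside BLEU or
+# ROUGE), and generation_size / stop_sequence are tuned per subtask.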
+abstract_narrative_understanding = LightevalTaskConfig(
+ name="bigbench:abstract_narrative_understanding",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="abstract_narrative_understanding",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+anachronisms = LightevalTaskConfig(
+ name="bigbench:anachronisms",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="anachronisms",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+analogical_similarity = LightevalTaskConfig(
+ name="bigbench:analogical_similarity",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="analogical_similarity",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+analytic_entailment = LightevalTaskConfig(
+ name="bigbench:analytic_entailment",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="analytic_entailment",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+arithmetic_bb = LightevalTaskConfig(
+ name="bigbench:arithmetic_bb",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="arithmetic",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+ascii_word_recognition = LightevalTaskConfig(
+ name="bigbench:ascii_word_recognition",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="ascii_word_recognition",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+authorship_verification = LightevalTaskConfig(
+ name="bigbench:authorship_verification",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="authorship_verification",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+auto_categorization = LightevalTaskConfig(
+ name="bigbench:auto_categorization",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="auto_categorization",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.bleu],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+auto_debugging = LightevalTaskConfig(
+ name="bigbench:auto_debugging",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_linefeed_before_and_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="auto_debugging",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=None,
+ version=0,
+)
+
+bbq_lite_json = LightevalTaskConfig(
+ name="bigbench:bbq_lite_json",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="bbq_lite_json",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+bridging_anaphora_resolution_barqa = LightevalTaskConfig(
+ name="bigbench:bridging_anaphora_resolution_barqa",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="bridging_anaphora_resolution_barqa",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+causal_judgment = LightevalTaskConfig(
+ name="bigbench:causal_judgment",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="causal_judgment",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+cause_and_effect = LightevalTaskConfig(
+ name="bigbench:cause_and_effect",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="cause_and_effect",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+checkmate_in_one = LightevalTaskConfig(
+ name="bigbench:checkmate_in_one",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="checkmate_in_one",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+chess_state_tracking = LightevalTaskConfig(
+ name="bigbench:chess_state_tracking",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="chess_state_tracking",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+chinese_remainder_theorem = LightevalTaskConfig(
+ name="bigbench:chinese_remainder_theorem",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="chinese_remainder_theorem",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+cifar10_classification = LightevalTaskConfig(
+ name="bigbench:cifar10_classification",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="cifar10_classification",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+code_line_description = LightevalTaskConfig(
+ name="bigbench:code_line_description",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_linefeed_before_and_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="code_line_description",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+codenames = LightevalTaskConfig(
+ name="bigbench:codenames",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="codenames",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.rouge_t5, Metrics.bleu],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+color = LightevalTaskConfig(
+ name="bigbench:color",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="color",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[
+ Metrics.rouge_t5,
+ Metrics.bleu,
+ Metrics.loglikelihood_acc,
+ Metrics.exact_match(sample_params={"strip_strings": False}),
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+common_morpheme = LightevalTaskConfig(
+ name="bigbench:common_morpheme",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="common_morpheme",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+conceptual_combinations = LightevalTaskConfig(
+ name="bigbench:conceptual_combinations",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="conceptual_combinations",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+conlang_translation = LightevalTaskConfig(
+ name="bigbench:conlang_translation",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="conlang_translation",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=[".", ";", "!", "?"],
+ version=0,
+)
+
+contextual_parametric_knowledge_conflicts = LightevalTaskConfig(
+ name="bigbench:contextual_parametric_knowledge_conflicts",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="contextual_parametric_knowledge_conflicts",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.rouge_t5, Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+crash_blossom = LightevalTaskConfig(
+ name="bigbench:crash_blossom",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="crash_blossom",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+crass_ai = LightevalTaskConfig(
+ name="bigbench:crass_ai",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="crass_ai",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+cryobiology_spanish = LightevalTaskConfig(
+ name="bigbench:cryobiology_spanish",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="cryobiology_spanish",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+cryptonite = LightevalTaskConfig(
+ name="bigbench:cryptonite",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="cryptonite",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+cs_algorithms = LightevalTaskConfig(
+ name="bigbench:cs_algorithms",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="cs_algorithms",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+dark_humor_detection = LightevalTaskConfig(
+ name="bigbench:dark_humor_detection",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="dark_humor_detection",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+date_understanding = LightevalTaskConfig(
+ name="bigbench:date_understanding",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="date_understanding",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+disambiguation_qa = LightevalTaskConfig(
+ name="bigbench:disambiguation_qa",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="disambiguation_qa",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+discourse_marker_prediction = LightevalTaskConfig(
+ name="bigbench:discourse_marker_prediction",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="discourse_marker_prediction",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+disfl_qa = LightevalTaskConfig(
+ name="bigbench:disfl_qa",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="disfl_qa",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+dyck_languages = LightevalTaskConfig(
+ name="bigbench:dyck_languages",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="dyck_languages",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+elementary_math_qa = LightevalTaskConfig(
+ name="bigbench:elementary_math_qa",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="elementary_math_qa",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+emoji_movie = LightevalTaskConfig(
+ name="bigbench:emoji_movie",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="emoji_movie",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[
+ Metrics.rouge_t5,
+ Metrics.bleu,
+ Metrics.loglikelihood_acc,
+ Metrics.exact_match(sample_params={"strip_strings": False}),
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+emojis_emotion_prediction = LightevalTaskConfig(
+ name="bigbench:emojis_emotion_prediction",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="emojis_emotion_prediction",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+empirical_judgments = LightevalTaskConfig(
+ name="bigbench:empirical_judgments",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="empirical_judgments",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+english_proverbs = LightevalTaskConfig(
+ name="bigbench:english_proverbs",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="english_proverbs",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+english_russian_proverbs = LightevalTaskConfig(
+ name="bigbench:english_russian_proverbs",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="english_russian_proverbs",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+entailed_polarity = LightevalTaskConfig(
+ name="bigbench:entailed_polarity",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="entailed_polarity",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+entailed_polarity_hindi = LightevalTaskConfig(
+ name="bigbench:entailed_polarity_hindi",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="entailed_polarity_hindi",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+epistemic_reasoning = LightevalTaskConfig(
+ name="bigbench:epistemic_reasoning",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="epistemic_reasoning",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+evaluating_information_essentiality = LightevalTaskConfig(
+ name="bigbench:evaluating_information_essentiality",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="evaluating_information_essentiality",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+fact_checker = LightevalTaskConfig(
+ name="bigbench:fact_checker",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="fact_checker",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+fantasy_reasoning = LightevalTaskConfig(
+ name="bigbench:fantasy_reasoning",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="fantasy_reasoning",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+few_shot_nlg = LightevalTaskConfig(
+ name="bigbench:few_shot_nlg",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="few_shot_nlg",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.bleu, Metrics.bleurt],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+figure_of_speech_detection = LightevalTaskConfig(
+ name="bigbench:figure_of_speech_detection",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="figure_of_speech_detection",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+formal_fallacies_syllogisms_negation = LightevalTaskConfig(
+ name="bigbench:formal_fallacies_syllogisms_negation",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="formal_fallacies_syllogisms_negation",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+gem = LightevalTaskConfig(
+ name="bigbench:gem",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="gem",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.bleu, Metrics.rouge_t5],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+gender_inclusive_sentences_german = LightevalTaskConfig(
+ name="bigbench:gender_inclusive_sentences_german",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="gender_inclusive_sentences_german",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+general_knowledge = LightevalTaskConfig(
+ name="bigbench:general_knowledge",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="general_knowledge",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+geometric_shapes = LightevalTaskConfig(
+ name="bigbench:geometric_shapes",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="geometric_shapes",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[
+ Metrics.rouge_t5,
+ Metrics.bleu,
+ Metrics.loglikelihood_acc,
+ Metrics.exact_match(sample_params={"strip_strings": False}),
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+goal_step_wikihow = LightevalTaskConfig(
+ name="bigbench:goal_step_wikihow",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="goal_step_wikihow",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+gre_reading_comprehension = LightevalTaskConfig(
+ name="bigbench:gre_reading_comprehension",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="gre_reading_comprehension",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+hhh_alignment = LightevalTaskConfig(
+ name="bigbench:hhh_alignment",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="hhh_alignment",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+hindi_question_answering = LightevalTaskConfig(
+ name="bigbench:hindi_question_answering",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="hindi_question_answering",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+hindu_knowledge = LightevalTaskConfig(
+ name="bigbench:hindu_knowledge",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="hindu_knowledge",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+hinglish_toxicity = LightevalTaskConfig(
+ name="bigbench:hinglish_toxicity",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="hinglish_toxicity",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+human_organs_senses = LightevalTaskConfig(
+ name="bigbench:human_organs_senses",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="human_organs_senses",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+hyperbaton = LightevalTaskConfig(
+ name="bigbench:hyperbaton",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="hyperbaton",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+identify_math_theorems = LightevalTaskConfig(
+ name="bigbench:identify_math_theorems",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="identify_math_theorems",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+identify_odd_metaphor = LightevalTaskConfig(
+ name="bigbench:identify_odd_metaphor",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="identify_odd_metaphor",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+implicatures = LightevalTaskConfig(
+ name="bigbench:implicatures",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="implicatures",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+implicit_relations = LightevalTaskConfig(
+ name="bigbench:implicit_relations",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="implicit_relations",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+intent_recognition = LightevalTaskConfig(
+ name="bigbench:intent_recognition",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="intent_recognition",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+international_phonetic_alphabet_nli = LightevalTaskConfig(
+ name="bigbench:international_phonetic_alphabet_nli",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="international_phonetic_alphabet_nli",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+international_phonetic_alphabet_transliterate = LightevalTaskConfig(
+ name="bigbench:international_phonetic_alphabet_transliterate",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="international_phonetic_alphabet_transliterate",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+intersect_geometry = LightevalTaskConfig(
+ name="bigbench:intersect_geometry",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="intersect_geometry",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+irony_identification = LightevalTaskConfig(
+ name="bigbench:irony_identification",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="irony_identification",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+kanji_ascii = LightevalTaskConfig(
+ name="bigbench:kanji_ascii",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="kanji_ascii",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+kannada = LightevalTaskConfig(
+ name="bigbench:kannada",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="kannada",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+key_value_maps = LightevalTaskConfig(
+ name="bigbench:key_value_maps",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="key_value_maps",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+known_unknowns = LightevalTaskConfig(
+ name="bigbench:known_unknowns",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="known_unknowns",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+language_games = LightevalTaskConfig(
+ name="bigbench:language_games",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="language_games",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+language_identification = LightevalTaskConfig(
+ name="bigbench:language_identification",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="language_identification",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+linguistic_mappings = LightevalTaskConfig(
+ name="bigbench:linguistic_mappings",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="linguistic_mappings",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+linguistics_puzzles = LightevalTaskConfig(
+ name="bigbench:linguistics_puzzles",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="linguistics_puzzles",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=None,
+ version=0,
+)
+
+logic_grid_puzzle = LightevalTaskConfig(
+ name="bigbench:logic_grid_puzzle",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="logic_grid_puzzle",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+logical_args = LightevalTaskConfig(
+ name="bigbench:logical_args",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="logical_args",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+logical_deduction = LightevalTaskConfig(
+ name="bigbench:logical_deduction",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="logical_deduction",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+logical_fallacy_detection = LightevalTaskConfig(
+ name="bigbench:logical_fallacy_detection",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="logical_fallacy_detection",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+logical_sequence = LightevalTaskConfig(
+ name="bigbench:logical_sequence",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="logical_sequence",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mathematical_induction = LightevalTaskConfig(
+ name="bigbench:mathematical_induction",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="mathematical_induction",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+matrixshapes = LightevalTaskConfig(
+ name="bigbench:matrixshapes",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="matrixshapes",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+metaphor_boolean = LightevalTaskConfig(
+ name="bigbench:metaphor_boolean",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="metaphor_boolean",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+metaphor_understanding = LightevalTaskConfig(
+ name="bigbench:metaphor_understanding",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="metaphor_understanding",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+minute_mysteries_qa = LightevalTaskConfig(
+ name="bigbench:minute_mysteries_qa",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="minute_mysteries_qa",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc, Metrics.rouge_t5],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+misconceptions = LightevalTaskConfig(
+ name="bigbench:misconceptions",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="misconceptions",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+misconceptions_russian = LightevalTaskConfig(
+ name="bigbench:misconceptions_russian",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="misconceptions_russian",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mnist_ascii = LightevalTaskConfig(
+ name="bigbench:mnist_ascii",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="mnist_ascii",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+modified_arithmetic = LightevalTaskConfig(
+ name="bigbench:modified_arithmetic",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="modified_arithmetic",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+moral_permissibility = LightevalTaskConfig(
+ name="bigbench:moral_permissibility",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="moral_permissibility",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+movie_dialog_same_or_different = LightevalTaskConfig(
+ name="bigbench:movie_dialog_same_or_different",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="movie_dialog_same_or_different",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+movie_recommendation = LightevalTaskConfig(
+ name="bigbench:movie_recommendation",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="movie_recommendation",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mult_data_wrangling = LightevalTaskConfig(
+ name="bigbench:mult_data_wrangling",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="mult_data_wrangling",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+navigate = LightevalTaskConfig(
+ name="bigbench:navigate",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="navigate",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+nonsense_words_grammar = LightevalTaskConfig(
+ name="bigbench:nonsense_words_grammar",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="nonsense_words_grammar",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+novel_concepts = LightevalTaskConfig(
+ name="bigbench:novel_concepts",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="novel_concepts",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+object_counting = LightevalTaskConfig(
+ name="bigbench:object_counting",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="object_counting",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+odd_one_out = LightevalTaskConfig(
+ name="bigbench:odd_one_out",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="odd_one_out",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+operators = LightevalTaskConfig(
+ name="bigbench:operators",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="operators",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+paragraph_segmentation = LightevalTaskConfig(
+ name="bigbench:paragraph_segmentation",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="paragraph_segmentation",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+parsinlu_qa = LightevalTaskConfig(
+ name="bigbench:parsinlu_qa",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="parsinlu_qa",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+parsinlu_reading_comprehension = LightevalTaskConfig(
+ name="bigbench:parsinlu_reading_comprehension",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="parsinlu_reading_comprehension",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=None,
+ version=0,
+)
+
+penguins_in_a_table = LightevalTaskConfig(
+ name="bigbench:penguins_in_a_table",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="penguins_in_a_table",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+periodic_elements = LightevalTaskConfig(
+ name="bigbench:periodic_elements",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="periodic_elements",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+persian_idioms = LightevalTaskConfig(
+ name="bigbench:persian_idioms",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="persian_idioms",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+phrase_relatedness = LightevalTaskConfig(
+ name="bigbench:phrase_relatedness",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="phrase_relatedness",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+physical_intuition = LightevalTaskConfig(
+ name="bigbench:physical_intuition",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="physical_intuition",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+physics = LightevalTaskConfig(
+ name="bigbench:physics",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="physics",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+physics_questions = LightevalTaskConfig(
+ name="bigbench:physics_questions",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="physics_questions",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+play_dialog_same_or_different = LightevalTaskConfig(
+ name="bigbench:play_dialog_same_or_different",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="play_dialog_same_or_different",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+polish_sequence_labeling = LightevalTaskConfig(
+ name="bigbench:polish_sequence_labeling",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="polish_sequence_labeling",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.f1_score],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+presuppositions_as_nli = LightevalTaskConfig(
+ name="bigbench:presuppositions_as_nli",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="presuppositions_as_nli",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+qa_wikidata = LightevalTaskConfig(
+ name="bigbench:qa_wikidata",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="qa_wikidata",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[
+ Metrics.bleurt,
+ Metrics.bleu,
+ Metrics.rouge_t5,
+ Metrics.exact_match(sample_params={"strip_strings": False}),
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+question_selection = LightevalTaskConfig(
+ name="bigbench:question_selection",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="question_selection",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+real_or_fake_text = LightevalTaskConfig(
+ name="bigbench:real_or_fake_text",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="real_or_fake_text",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+reasoning_about_colored_objects = LightevalTaskConfig(
+ name="bigbench:reasoning_about_colored_objects",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="reasoning_about_colored_objects",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+repeat_copy_logic = LightevalTaskConfig(
+ name="bigbench:repeat_copy_logic",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="repeat_copy_logic",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+rephrase = LightevalTaskConfig(
+ name="bigbench:rephrase",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="rephrase",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[
+ Metrics.rouge_t5,
+ Metrics.bleu,
+ Metrics.loglikelihood_acc,
+ Metrics.exact_match(sample_params={"strip_strings": False}),
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+rhyming = LightevalTaskConfig(
+ name="bigbench:rhyming",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="rhyming",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+riddle_sense = LightevalTaskConfig(
+ name="bigbench:riddle_sense",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="riddle_sense",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+ruin_names = LightevalTaskConfig(
+ name="bigbench:ruin_names",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="ruin_names",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+salient_translation_error_detection = LightevalTaskConfig(
+ name="bigbench:salient_translation_error_detection",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="salient_translation_error_detection",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+scientific_press_release = LightevalTaskConfig(
+ name="bigbench:scientific_press_release",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="scientific_press_release",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+semantic_parsing_in_context_sparc = LightevalTaskConfig(
+ name="bigbench:semantic_parsing_in_context_sparc",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="semantic_parsing_in_context_sparc",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+semantic_parsing_spider = LightevalTaskConfig(
+ name="bigbench:semantic_parsing_spider",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="semantic_parsing_spider",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+sentence_ambiguity = LightevalTaskConfig(
+ name="bigbench:sentence_ambiguity",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="sentence_ambiguity",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+similarities_abstraction = LightevalTaskConfig(
+ name="bigbench:similarities_abstraction",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="similarities_abstraction",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+simp_turing_concept = LightevalTaskConfig(
+ name="bigbench:simp_turing_concept",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="simp_turing_concept",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+simple_arithmetic_json = LightevalTaskConfig(
+ name="bigbench:simple_arithmetic_json",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="simple_arithmetic_json",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+simple_arithmetic_json_multiple_choice = LightevalTaskConfig(
+ name="bigbench:simple_arithmetic_json_multiple_choice",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="simple_arithmetic_json_multiple_choice",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+simple_arithmetic_json_subtasks = LightevalTaskConfig(
+ name="bigbench:simple_arithmetic_json_subtasks",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="simple_arithmetic_json_subtasks",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+simple_arithmetic_multiple_targets_json = LightevalTaskConfig(
+ name="bigbench:simple_arithmetic_multiple_targets_json",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="simple_arithmetic_multiple_targets_json",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+simple_ethical_questions = LightevalTaskConfig(
+ name="bigbench:simple_ethical_questions",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="simple_ethical_questions",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+simple_text_editing = LightevalTaskConfig(
+ name="bigbench:simple_text_editing",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="simple_text_editing",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+snarks = LightevalTaskConfig(
+ name="bigbench:snarks",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="snarks",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+social_iqa = LightevalTaskConfig(
+ name="bigbench:social_iqa",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="social_iqa",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+social_support = LightevalTaskConfig(
+ name="bigbench:social_support",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="social_support",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.f1_score_macro],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+sports_understanding = LightevalTaskConfig(
+ name="bigbench:sports_understanding",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="sports_understanding",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+strange_stories = LightevalTaskConfig(
+ name="bigbench:strange_stories",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="strange_stories",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+strategyqa = LightevalTaskConfig(
+ name="bigbench:strategyqa",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="strategyqa",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+sufficient_information = LightevalTaskConfig(
+ name="bigbench:sufficient_information",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="sufficient_information",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+suicide_risk = LightevalTaskConfig(
+ name="bigbench:suicide_risk",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="suicide_risk",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+swahili_english_proverbs = LightevalTaskConfig(
+ name="bigbench:swahili_english_proverbs",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="swahili_english_proverbs",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+swedish_to_german_proverbs = LightevalTaskConfig(
+ name="bigbench:swedish_to_german_proverbs",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="swedish_to_german_proverbs",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+symbol_interpretation = LightevalTaskConfig(
+ name="bigbench:symbol_interpretation",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="symbol_interpretation",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+tellmewhy = LightevalTaskConfig(
+ name="bigbench:tellmewhy",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="tellmewhy",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.bleu, Metrics.rouge_t5],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+temporal_sequences = LightevalTaskConfig(
+ name="bigbench:temporal_sequences",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="temporal_sequences",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+tense = LightevalTaskConfig(
+ name="bigbench:tense",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="tense",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+timedial = LightevalTaskConfig(
+ name="bigbench:timedial",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="timedial",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+topical_chat = LightevalTaskConfig(
+ name="bigbench:topical_chat",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="topical_chat",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.loglikelihood_acc, Metrics.bleurt],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+tracking_shuffled_objects = LightevalTaskConfig(
+ name="bigbench:tracking_shuffled_objects",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="tracking_shuffled_objects",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+understanding_fables = LightevalTaskConfig(
+ name="bigbench:understanding_fables",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="understanding_fables",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+undo_permutation = LightevalTaskConfig(
+ name="bigbench:undo_permutation",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="undo_permutation",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+unit_conversion = LightevalTaskConfig(
+ name="bigbench:unit_conversion",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="unit_conversion",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+unit_interpretation = LightevalTaskConfig(
+ name="bigbench:unit_interpretation",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="unit_interpretation",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+unnatural_in_context_learning = LightevalTaskConfig(
+ name="bigbench:unnatural_in_context_learning",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="unnatural_in_context_learning",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+vitaminc_fact_verification = LightevalTaskConfig(
+ name="bigbench:vitaminc_fact_verification",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="vitaminc_fact_verification",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+what_is_the_tao = LightevalTaskConfig(
+ name="bigbench:what_is_the_tao",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="what_is_the_tao",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+which_wiki_edit = LightevalTaskConfig(
+ name="bigbench:which_wiki_edit",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="which_wiki_edit",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+winowhy = LightevalTaskConfig(
+ name="bigbench:winowhy",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench_whitespace_after_query,
+ hf_repo="tasksource/bigbench",
+ hf_subset="winowhy",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+word_sorting = LightevalTaskConfig(
+ name="bigbench:word_sorting",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="word_sorting",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+word_unscrambling = LightevalTaskConfig(
+ name="bigbench:word_unscrambling",
+ suite=["lighteval"],
+ prompt_function=prompt.bigbench,
+ hf_repo="tasksource/bigbench",
+ hf_subset="word_unscrambling",
+ hf_avail_splits=["default", "train", "validation"],
+ evaluation_splits=["default"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
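+# TASKS_TABLE is what LightEval imports from this module, so a config defined above
+# is only discoverable if it has an entry below. A quick consistency check one could
+# run after the table is built (illustrative sketch only; it compares task names so
+# it does not depend on LightevalTaskConfig being hashable):
+#
+#   _defined = sorted(v.name for v in globals().values() if isinstance(v, LightevalTaskConfig))
+#   _registered = sorted(t.name for t in TASKS_TABLE)
+#   assert _defined == _registered, "a config defined above is missing from TASKS_TABLE"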
+TASKS_TABLE = [
+ abstract_narrative_understanding,
+ anachronisms,
+ analogical_similarity,
+ moral_permissibility,
+ movie_dialog_same_or_different,
+ movie_recommendation,
+    mult_data_wrangling,
+    novel_concepts,
+    object_counting,
+    odd_one_out,
+    operators,
+    paragraph_segmentation,
+    parsinlu_qa,
+    parsinlu_reading_comprehension,
+    penguins_in_a_table,
+    periodic_elements,
+    persian_idioms,
+    phrase_relatedness,
+    physical_intuition,
+    physics,
+    physics_questions,
+    play_dialog_same_or_different,
+    polish_sequence_labeling,
+    presuppositions_as_nli,
+    qa_wikidata,
+    question_selection,
+    real_or_fake_text,
+    reasoning_about_colored_objects,
+    repeat_copy_logic,
+    rephrase,
+    rhyming,
+    riddle_sense,
+    ruin_names,
+    salient_translation_error_detection,
+    scientific_press_release,
+    semantic_parsing_in_context_sparc,
+    semantic_parsing_spider,
+    sentence_ambiguity,
+    similarities_abstraction,
+    simp_turing_concept,
+    simple_arithmetic_json,
+    simple_arithmetic_json_multiple_choice,
+    simple_arithmetic_json_subtasks,
+    simple_arithmetic_multiple_targets_json,
+    simple_ethical_questions,
+ simple_text_editing,
+ snarks,
+ social_iqa,
+ social_support,
+ sports_understanding,
+ strange_stories,
+ strategyqa,
+ sufficient_information,
+ suicide_risk,
+ swahili_english_proverbs,
+ swedish_to_german_proverbs,
+ symbol_interpretation,
+ tellmewhy,
+ temporal_sequences,
+ tense,
+ timedial,
+ topical_chat,
+ tracking_shuffled_objects,
+ understanding_fables,
+ undo_permutation,
+ unit_conversion,
+ unit_interpretation,
+ unnatural_in_context_learning,
+ vitaminc_fact_verification,
+ what_is_the_tao,
+ which_wiki_edit,
+ winowhy,
+ word_sorting,
+ word_unscrambling,
+]
diff --git a/src/lighteval/tasks/tasks/bigbench_hard.py b/src/lighteval/tasks/tasks/bigbench_hard.py
new file mode 100644
index 000000000..f17781c2b
--- /dev/null
+++ b/src/lighteval/tasks/tasks/bigbench_hard.py
@@ -0,0 +1,330 @@
+"""
+name:
+Bigbench Hard
+
+dataset:
+lighteval/bbh
+
+abstract:
+BIG-Bench Hard (BBH) is a suite of 23 challenging BIG-Bench tasks on which
+language models evaluated at the time of its release did not outperform the
+average human rater.
+
+languages:
+english
+
+tags:
+reasoning
+
+paper:
+https://arxiv.org/abs/2210.09261
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
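+# The 18 task configs below are identical apart from their BBH subset: same repo,
+# splits, prompt function, metric and stop sequence. A minimal sketch of how such a
+# table could be generated programmatically (illustrative only -- this module spells
+# each config out explicitly, the subset list here is truncated, and causal_judgment
+# is left out of the sample because its task name and hf_subset spell "judg(e)ment"
+# differently):
+#
+#   _BBH_SUBSETS = ["date_understanding", "disambiguation_qa", "geometric_shapes"]
+#   _bbh_tasks = [
+#       LightevalTaskConfig(
+#           name=f"bigbench_hard:{subset}",
+#           suite=["lighteval"],
+#           prompt_function=prompt.bbh_lighteval,
+#           hf_repo="lighteval/bbh",
+#           hf_subset=subset,
+#           hf_avail_splits=["train"],
+#           evaluation_splits=["train"],
+#           generation_size=-1,
+#           metrics=[Metrics.loglikelihood_acc],
+#           stop_sequence=["</s>", "Q=", "\n\n"],
+#           version=0,
+#       )
+#       for subset in _BBH_SUBSETS
+#   ]
+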
+causal_judgment = LightevalTaskConfig(
+ name="bigbench_hard:causal_judgment",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="causal_judgement",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+date_understanding = LightevalTaskConfig(
+ name="bigbench_hard:date_understanding",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="date_understanding",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+disambiguation_qa = LightevalTaskConfig(
+ name="bigbench_hard:disambiguation_qa",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="disambiguation_qa",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+geometric_shapes = LightevalTaskConfig(
+ name="bigbench_hard:geometric_shapes",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="geometric_shapes",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+logical_deduction_five_objects = LightevalTaskConfig(
+ name="bigbench_hard:logical_deduction_five_objects",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="logical_deduction_five_objects",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+logical_deduction_seven_objects = LightevalTaskConfig(
+ name="bigbench_hard:logical_deduction_seven_objects",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="logical_deduction_seven_objects",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+logical_deduction_three_objects = LightevalTaskConfig(
+ name="bigbench_hard:logical_deduction_three_objects",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="logical_deduction_three_objects",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+movie_recommendation = LightevalTaskConfig(
+ name="bigbench_hard:movie_recommendation",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="movie_recommendation",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+navigate = LightevalTaskConfig(
+ name="bigbench_hard:navigate",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="navigate",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+reasoning_about_colored_objects = LightevalTaskConfig(
+ name="bigbench_hard:reasoning_about_colored_objects",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="reasoning_about_colored_objects",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+ruin_names = LightevalTaskConfig(
+ name="bigbench_hard:ruin_names",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="ruin_names",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+salient_translation_error_detection = LightevalTaskConfig(
+ name="bigbench_hard:salient_translation_error_detection",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="salient_translation_error_detection",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+snarks = LightevalTaskConfig(
+ name="bigbench_hard:snarks",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="snarks",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+sports_understanding = LightevalTaskConfig(
+ name="bigbench_hard:sports_understanding",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="sports_understanding",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+temporal_sequences = LightevalTaskConfig(
+ name="bigbench_hard:temporal_sequences",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="temporal_sequences",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+tracking_shuffled_objects_five_objects = LightevalTaskConfig(
+ name="bigbench_hard:tracking_shuffled_objects_five_objects",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="tracking_shuffled_objects_five_objects",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+tracking_shuffled_objects_seven_objects = LightevalTaskConfig(
+ name="bigbench_hard:tracking_shuffled_objects_seven_objects",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="tracking_shuffled_objects_seven_objects",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+tracking_shuffled_objects_three_objects = LightevalTaskConfig(
+ name="bigbench_hard:tracking_shuffled_objects_three_objects",
+ suite=["lighteval"],
+ prompt_function=prompt.bbh_lighteval,
+ hf_repo="lighteval/bbh",
+ hf_subset="tracking_shuffled_objects_three_objects",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["", "Q=", "\n\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ causal_judgment,
+ date_understanding,
+ disambiguation_qa,
+ geometric_shapes,
+ logical_deduction_five_objects,
+ logical_deduction_seven_objects,
+ logical_deduction_three_objects,
+ movie_recommendation,
+ navigate,
+ reasoning_about_colored_objects,
+ ruin_names,
+ salient_translation_error_detection,
+ snarks,
+ sports_understanding,
+ temporal_sequences,
+ tracking_shuffled_objects_five_objects,
+ tracking_shuffled_objects_seven_objects,
+ tracking_shuffled_objects_three_objects,
+]
diff --git a/src/lighteval/tasks/tasks/blimp.py b/src/lighteval/tasks/tasks/blimp.py
new file mode 100644
index 000000000..822122bda
--- /dev/null
+++ b/src/lighteval/tasks/tasks/blimp.py
@@ -0,0 +1,1141 @@
+"""
+name:
+Blimp
+
+dataset:
+nyu-mll/blimp
+
+abstract:
+BLiMP is a challenge set for evaluating what language models (LMs) know
+about major grammatical phenomena in English. BLiMP consists of 67
+sub-datasets, each containing 1000 minimal pairs isolating specific
+contrasts in syntax, morphology, or semantics. The data is automatically
+generated according to expert-crafted grammars.
+
+languages:
+english
+
+tags:
+language-modeling
+
+paper:
+https://arxiv.org/abs/1912.00582
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
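+# BLiMP is scored by comparing the log-probability a model assigns to the grammatical
+# sentence of each minimal pair against its ungrammatical twin, which is why every
+# config below uses Metrics.loglikelihood_acc rather than a generative metric. A
+# minimal sketch of how a pair maps onto a two-choice Doc (illustrative only; the
+# real mapping is prompt.blimp, and the field names assume the nyu-mll/blimp schema
+# with `sentence_good` / `sentence_bad`):
+#
+#   from lighteval.tasks.requests import Doc
+#
+#   def blimp_pair_to_doc(line, task_name: str = None):
+#       return Doc(
+#           task_name=task_name,
+#           query="",
+#           choices=[line["sentence_good"], line["sentence_bad"]],
+#           gold_index=0,  # the grammatical sentence
+#       )
+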
+blimp_adjunct_island = LightevalTaskConfig(
+ name="blimp:adjunct_island",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="adjunct_island",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_anaphor_gender_agreement = LightevalTaskConfig(
+ name="blimp:anaphor_gender_agreement",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="anaphor_gender_agreement",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_anaphor_number_agreement = LightevalTaskConfig(
+ name="blimp:anaphor_number_agreement",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="anaphor_number_agreement",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_animate_subject_passive = LightevalTaskConfig(
+ name="blimp:animate_subject_passive",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="animate_subject_passive",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_animate_subject_trans = LightevalTaskConfig(
+ name="blimp:animate_subject_trans",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="animate_subject_trans",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_causative = LightevalTaskConfig(
+ name="blimp:causative",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="causative",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_complex_NP_island = LightevalTaskConfig(
+ name="blimp:complex_NP_island",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="complex_NP_island",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_coordinate_structure_constraint_complex_left_branch = LightevalTaskConfig(
+ name="blimp:coordinate_structure_constraint_complex_left_branch",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="coordinate_structure_constraint_complex_left_branch",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_coordinate_structure_constraint_object_extraction = LightevalTaskConfig(
+ name="blimp:coordinate_structure_constraint_object_extraction",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="coordinate_structure_constraint_object_extraction",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_determiner_noun_agreement_1 = LightevalTaskConfig(
+ name="blimp:determiner_noun_agreement_1",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="determiner_noun_agreement_1",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_determiner_noun_agreement_2 = LightevalTaskConfig(
+ name="blimp:determiner_noun_agreement_2",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="determiner_noun_agreement_2",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_determiner_noun_agreement_irregular_1 = LightevalTaskConfig(
+ name="blimp:determiner_noun_agreement_irregular_1",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="determiner_noun_agreement_irregular_1",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_determiner_noun_agreement_irregular_2 = LightevalTaskConfig(
+ name="blimp:determiner_noun_agreement_irregular_2",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="determiner_noun_agreement_irregular_2",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_determiner_noun_agreement_with_adj_2 = LightevalTaskConfig(
+ name="blimp:determiner_noun_agreement_with_adj_2",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="determiner_noun_agreement_with_adj_2",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_determiner_noun_agreement_with_adj_irregular_1 = LightevalTaskConfig(
+ name="blimp:determiner_noun_agreement_with_adj_irregular_1",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="determiner_noun_agreement_with_adj_irregular_1",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_determiner_noun_agreement_with_adj_irregular_2 = LightevalTaskConfig(
+ name="blimp:determiner_noun_agreement_with_adj_irregular_2",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="determiner_noun_agreement_with_adj_irregular_2",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_determiner_noun_agreement_with_adjective_1 = LightevalTaskConfig(
+ name="blimp:determiner_noun_agreement_with_adjective_1",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="determiner_noun_agreement_with_adjective_1",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_distractor_agreement_relational_noun = LightevalTaskConfig(
+ name="blimp:distractor_agreement_relational_noun",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="distractor_agreement_relational_noun",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_distractor_agreement_relative_clause = LightevalTaskConfig(
+ name="blimp:distractor_agreement_relative_clause",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="distractor_agreement_relative_clause",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_drop_argument = LightevalTaskConfig(
+ name="blimp:drop_argument",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="drop_argument",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_ellipsis_n_bar_1 = LightevalTaskConfig(
+ name="blimp:ellipsis_n_bar_1",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="ellipsis_n_bar_1",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_ellipsis_n_bar_2 = LightevalTaskConfig(
+ name="blimp:ellipsis_n_bar_2",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="ellipsis_n_bar_2",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_existential_there_object_raising = LightevalTaskConfig(
+ name="blimp:existential_there_object_raising",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="existential_there_object_raising",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_existential_there_quantifiers_1 = LightevalTaskConfig(
+ name="blimp:existential_there_quantifiers_1",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="existential_there_quantifiers_1",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_existential_there_quantifiers_2 = LightevalTaskConfig(
+ name="blimp:existential_there_quantifiers_2",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="existential_there_quantifiers_2",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_existential_there_subject_raising = LightevalTaskConfig(
+ name="blimp:existential_there_subject_raising",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="existential_there_subject_raising",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_expletive_it_object_raising = LightevalTaskConfig(
+ name="blimp:expletive_it_object_raising",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="expletive_it_object_raising",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_inchoative = LightevalTaskConfig(
+ name="blimp:inchoative",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="inchoative",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_intransitive = LightevalTaskConfig(
+ name="blimp:intransitive",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="intransitive",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_irregular_past_participle_adjectives = LightevalTaskConfig(
+ name="blimp:irregular_past_participle_adjectives",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="irregular_past_participle_adjectives",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_irregular_past_participle_verbs = LightevalTaskConfig(
+ name="blimp:irregular_past_participle_verbs",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="irregular_past_participle_verbs",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_irregular_plural_subject_verb_agreement_1 = LightevalTaskConfig(
+ name="blimp:irregular_plural_subject_verb_agreement_1",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="irregular_plural_subject_verb_agreement_1",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_irregular_plural_subject_verb_agreement_2 = LightevalTaskConfig(
+ name="blimp:irregular_plural_subject_verb_agreement_2",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="irregular_plural_subject_verb_agreement_2",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_left_branch_island_echo_question = LightevalTaskConfig(
+ name="blimp:left_branch_island_echo_question",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="left_branch_island_echo_question",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_left_branch_island_simple_question = LightevalTaskConfig(
+ name="blimp:left_branch_island_simple_question",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="left_branch_island_simple_question",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_matrix_question_npi_licensor_present = LightevalTaskConfig(
+ name="blimp:matrix_question_npi_licensor_present",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="matrix_question_npi_licensor_present",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_npi_present_1 = LightevalTaskConfig(
+ name="blimp:npi_present_1",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="npi_present_1",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_npi_present_2 = LightevalTaskConfig(
+ name="blimp:npi_present_2",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="npi_present_2",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_only_npi_licensor_present = LightevalTaskConfig(
+ name="blimp:only_npi_licensor_present",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="only_npi_licensor_present",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_only_npi_scope = LightevalTaskConfig(
+ name="blimp:only_npi_scope",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="only_npi_scope",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_passive_1 = LightevalTaskConfig(
+ name="blimp:passive_1",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="passive_1",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_passive_2 = LightevalTaskConfig(
+ name="blimp:passive_2",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="passive_2",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_principle_A_c_command = LightevalTaskConfig(
+ name="blimp:principle_A_c_command",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="principle_A_c_command",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_principle_A_case_1 = LightevalTaskConfig(
+ name="blimp:principle_A_case_1",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="principle_A_case_1",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_principle_A_case_2 = LightevalTaskConfig(
+ name="blimp:principle_A_case_2",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="principle_A_case_2",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_principle_A_domain_1 = LightevalTaskConfig(
+ name="blimp:principle_A_domain_1",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="principle_A_domain_1",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_principle_A_domain_2 = LightevalTaskConfig(
+ name="blimp:principle_A_domain_2",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="principle_A_domain_2",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_principle_A_domain_3 = LightevalTaskConfig(
+ name="blimp:principle_A_domain_3",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="principle_A_domain_3",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_principle_A_reconstruction = LightevalTaskConfig(
+ name="blimp:principle_A_reconstruction",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="principle_A_reconstruction",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_regular_plural_subject_verb_agreement_1 = LightevalTaskConfig(
+ name="blimp:regular_plural_subject_verb_agreement_1",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="regular_plural_subject_verb_agreement_1",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_regular_plural_subject_verb_agreement_2 = LightevalTaskConfig(
+ name="blimp:regular_plural_subject_verb_agreement_2",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="regular_plural_subject_verb_agreement_2",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_sentential_negation_npi_licensor_present = LightevalTaskConfig(
+ name="blimp:sentential_negation_npi_licensor_present",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="sentential_negation_npi_licensor_present",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_sentential_negation_npi_scope = LightevalTaskConfig(
+ name="blimp:sentential_negation_npi_scope",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="sentential_negation_npi_scope",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_sentential_subject_island = LightevalTaskConfig(
+ name="blimp:sentential_subject_island",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="sentential_subject_island",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_superlative_quantifiers_1 = LightevalTaskConfig(
+ name="blimp:superlative_quantifiers_1",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="superlative_quantifiers_1",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_superlative_quantifiers_2 = LightevalTaskConfig(
+ name="blimp:superlative_quantifiers_2",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="superlative_quantifiers_2",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_tough_vs_raising_1 = LightevalTaskConfig(
+ name="blimp:tough_vs_raising_1",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="tough_vs_raising_1",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_tough_vs_raising_2 = LightevalTaskConfig(
+ name="blimp:tough_vs_raising_2",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="tough_vs_raising_2",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_transitive = LightevalTaskConfig(
+ name="blimp:transitive",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="transitive",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_wh_island = LightevalTaskConfig(
+ name="blimp:wh_island",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="wh_island",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_wh_questions_object_gap = LightevalTaskConfig(
+ name="blimp:wh_questions_object_gap",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="wh_questions_object_gap",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_wh_questions_subject_gap = LightevalTaskConfig(
+ name="blimp:wh_questions_subject_gap",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="wh_questions_subject_gap",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_wh_questions_subject_gap_long_distance = LightevalTaskConfig(
+ name="blimp:wh_questions_subject_gap_long_distance",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="wh_questions_subject_gap_long_distance",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_wh_vs_that_no_gap = LightevalTaskConfig(
+ name="blimp:wh_vs_that_no_gap",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="wh_vs_that_no_gap",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_wh_vs_that_no_gap_long_distance = LightevalTaskConfig(
+ name="blimp:wh_vs_that_no_gap_long_distance",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="wh_vs_that_no_gap_long_distance",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_wh_vs_that_with_gap = LightevalTaskConfig(
+ name="blimp:wh_vs_that_with_gap",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="wh_vs_that_with_gap",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+blimp_wh_vs_that_with_gap_long_distance = LightevalTaskConfig(
+ name="blimp:wh_vs_that_with_gap_long_distance",
+ suite=["lighteval"],
+ prompt_function=prompt.blimp,
+ hf_repo="nyu-mll/blimp",
+ hf_subset="wh_vs_that_with_gap_long_distance",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ blimp_adjunct_island,
+ blimp_anaphor_gender_agreement,
+ blimp_anaphor_number_agreement,
+ blimp_animate_subject_passive,
+ blimp_animate_subject_trans,
+ blimp_causative,
+ blimp_complex_NP_island,
+ blimp_drop_argument,
+ blimp_ellipsis_n_bar_1,
+ blimp_ellipsis_n_bar_2,
+ blimp_existential_there_object_raising,
+ blimp_inchoative,
+ blimp_intransitive,
+ blimp_irregular_past_participle_adjectives,
+    blimp_irregular_past_participle_verbs,
+    blimp_left_branch_island_echo_question,
+    blimp_left_branch_island_simple_question,
+    blimp_matrix_question_npi_licensor_present,
+    blimp_npi_present_1,
+    blimp_npi_present_2,
+    blimp_only_npi_licensor_present,
+    blimp_only_npi_scope,
+    blimp_passive_1,
+    blimp_passive_2,
+    blimp_principle_A_c_command,
+    blimp_principle_A_case_1,
+    blimp_principle_A_case_2,
+    blimp_principle_A_domain_1,
+    blimp_principle_A_domain_2,
+    blimp_principle_A_domain_3,
+    blimp_principle_A_reconstruction,
+ blimp_regular_plural_subject_verb_agreement_1,
+ blimp_regular_plural_subject_verb_agreement_2,
+ blimp_sentential_negation_npi_licensor_present,
+ blimp_sentential_negation_npi_scope,
+ blimp_sentential_subject_island,
+ blimp_superlative_quantifiers_1,
+ blimp_superlative_quantifiers_2,
+ blimp_tough_vs_raising_1,
+ blimp_tough_vs_raising_2,
+ blimp_transitive,
+ blimp_wh_island,
+ blimp_wh_questions_object_gap,
+ blimp_wh_questions_subject_gap,
+ blimp_wh_questions_subject_gap_long_distance,
+ blimp_wh_vs_that_no_gap,
+ blimp_wh_vs_that_no_gap_long_distance,
+ blimp_wh_vs_that_with_gap,
+ blimp_wh_vs_that_with_gap_long_distance,
+]
diff --git a/src/lighteval/tasks/tasks/bold.py b/src/lighteval/tasks/tasks/bold.py
new file mode 100644
index 000000000..f1345a533
--- /dev/null
+++ b/src/lighteval/tasks/tasks/bold.py
@@ -0,0 +1,130 @@
+"""
+name:
+BOLD
+
+dataset:
+lighteval/bold_helm
+
+abstract:
+The Bias in Open-Ended Language Generation Dataset (BOLD) measures biases and
+toxicity in open-ended language generation, with prompts grouped by profession,
+gender, race, religious ideology, and political ideology.
+
+languages:
+english
+
+tags:
+bias, generation
+
+paper:
+https://dl.acm.org/doi/10.1145/3442188.3445924
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+bold = LightevalTaskConfig(
+ name="bold",
+ suite=["lighteval"],
+ prompt_function=prompt.bold,
+ hf_repo="lighteval/bold_helm",
+ hf_subset="all",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.prediction_perplexity],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+bold_gender = LightevalTaskConfig(
+ name="bold:gender",
+ suite=["lighteval"],
+ prompt_function=prompt.bold,
+ hf_repo="lighteval/bold_helm",
+ hf_subset="gender",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.prediction_perplexity],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+bold_political_ideology = LightevalTaskConfig(
+ name="bold:political_ideology",
+ suite=["lighteval"],
+ prompt_function=prompt.bold,
+ hf_repo="lighteval/bold_helm",
+ hf_subset="political_ideology",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.prediction_perplexity],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+bold_profession = LightevalTaskConfig(
+ name="bold:profession",
+ suite=["lighteval"],
+ prompt_function=prompt.bold,
+ hf_repo="lighteval/bold_helm",
+ hf_subset="profession",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.prediction_perplexity],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+bold_race = LightevalTaskConfig(
+ name="bold:race",
+ suite=["lighteval"],
+ prompt_function=prompt.bold,
+ hf_repo="lighteval/bold_helm",
+ hf_subset="race",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.prediction_perplexity],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+bold_religious_ideology = LightevalTaskConfig(
+ name="bold:religious_ideology",
+ suite=["lighteval"],
+ prompt_function=prompt.bold,
+ hf_repo="lighteval/bold_helm",
+ hf_subset="religious_ideology",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.prediction_perplexity],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ bold,
+ bold_gender,
+ bold_political_ideology,
+ bold_profession,
+ bold_race,
+ bold_religious_ideology,
+]
diff --git a/src/lighteval/tasks/tasks/boolq.py b/src/lighteval/tasks/tasks/boolq.py
new file mode 100644
index 000000000..b086ab1cb
--- /dev/null
+++ b/src/lighteval/tasks/tasks/boolq.py
@@ -0,0 +1,66 @@
+"""
+name:
+BoolQ
+
+dataset:
+lighteval/boolq_helm
+
+abstract:
+The BoolQ benchmark for binary (yes/no) question answering.
+
+languages:
+english
+
+tags:
+qa
+
+paper:
+https://arxiv.org/abs/1905.10044
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+boolq = LightevalTaskConfig(
+ name="boolq",
+ suite=["lighteval"],
+ prompt_function=prompt.boolq_helm,
+ hf_repo="lighteval/boolq_helm",
+ hf_subset="default",
+ hf_avail_splits=["train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+boolq_contrastset = LightevalTaskConfig(
+ name="boolq:contrastset",
+ suite=["lighteval"],
+ prompt_function=prompt.boolq_helm_contrastset,
+ hf_repo="lighteval/boolq_helm",
+ hf_subset="default",
+ hf_avail_splits=["validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ boolq,
+ boolq_contrastset,
+]
diff --git a/src/lighteval/tasks/tasks/civil_comments.py b/src/lighteval/tasks/tasks/civil_comments.py
new file mode 100644
index 000000000..608ab097c
--- /dev/null
+++ b/src/lighteval/tasks/tasks/civil_comments.py
@@ -0,0 +1,180 @@
+"""
+name:
+Civil Comments
+
+dataset:
+lighteval/civil_comments_helm
+
+abstract:
+The CivilComments benchmark for toxicity detection.
+
+languages:
+english
+
+tags:
+bias, classification
+
+paper:
+https://arxiv.org/abs/1903.04561
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+civil_comments = LightevalTaskConfig(
+ name="civil_comments",
+ suite=["lighteval"],
+ prompt_function=prompt.civil_comments,
+ hf_repo="lighteval/civil_comments_helm",
+ hf_subset="all",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+civil_comments_LGBTQ = LightevalTaskConfig(
+ name="civil_comments:LGBTQ",
+ suite=["lighteval"],
+ prompt_function=prompt.civil_comments,
+ hf_repo="lighteval/civil_comments_helm",
+ hf_subset="LGBTQ",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+civil_comments_black = LightevalTaskConfig(
+ name="civil_comments:black",
+ suite=["lighteval"],
+ prompt_function=prompt.civil_comments,
+ hf_repo="lighteval/civil_comments_helm",
+ hf_subset="black",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+civil_comments_christian = LightevalTaskConfig(
+ name="civil_comments:christian",
+ suite=["lighteval"],
+ prompt_function=prompt.civil_comments,
+ hf_repo="lighteval/civil_comments_helm",
+ hf_subset="christian",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+civil_comments_female = LightevalTaskConfig(
+ name="civil_comments:female",
+ suite=["lighteval"],
+ prompt_function=prompt.civil_comments,
+ hf_repo="lighteval/civil_comments_helm",
+ hf_subset="female",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+civil_comments_male = LightevalTaskConfig(
+ name="civil_comments:male",
+ suite=["lighteval"],
+ prompt_function=prompt.civil_comments,
+ hf_repo="lighteval/civil_comments_helm",
+ hf_subset="male",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+civil_comments_muslim = LightevalTaskConfig(
+ name="civil_comments:muslim",
+ suite=["lighteval"],
+ prompt_function=prompt.civil_comments,
+ hf_repo="lighteval/civil_comments_helm",
+ hf_subset="muslim",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+civil_comments_other_religions = LightevalTaskConfig(
+ name="civil_comments:other_religions",
+ suite=["lighteval"],
+ prompt_function=prompt.civil_comments,
+ hf_repo="lighteval/civil_comments_helm",
+ hf_subset="other_religions",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+civil_comments_white = LightevalTaskConfig(
+ name="civil_comments:white",
+ suite=["lighteval"],
+ prompt_function=prompt.civil_comments,
+ hf_repo="lighteval/civil_comments_helm",
+ hf_subset="white",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ civil_comments,
+ civil_comments_LGBTQ,
+ civil_comments_black,
+ civil_comments_christian,
+ civil_comments_female,
+ civil_comments_male,
+ civil_comments_muslim,
+ civil_comments_other_religions,
+ civil_comments_white,
+]
diff --git a/src/lighteval/tasks/tasks/commonsenseqa.py b/src/lighteval/tasks/tasks/commonsenseqa.py
new file mode 100644
index 000000000..8c6f6c6de
--- /dev/null
+++ b/src/lighteval/tasks/tasks/commonsenseqa.py
@@ -0,0 +1,49 @@
+"""
+name:
+CommonsenseQA
+
+dataset:
+tau/commonsense_qa
+
+abstract:
+CommonsenseQA is a multiple-choice question answering dataset that requires
+different types of commonsense knowledge to predict the correct answers. It
+contains 12,102 questions with one correct answer and four distractor answers.
+The dataset is provided in two major training/validation/testing set splits:
+"Random split", which is the main evaluation split, and "Question token split";
+see the paper for details.
+
+languages:
+english
+
+tags:
+commonsense, multiple-choice, qa
+
+paper:
+https://arxiv.org/abs/1811.00937
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+commonsenseqa = LightevalTaskConfig(
+ name="commonsenseqa",
+ suite=["lighteval"],
+ prompt_function=prompt.commonsense_qa,
+ hf_repo="tau/commonsense_qa",
+ hf_subset="default",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ commonsenseqa,
+]
diff --git a/src/lighteval/tasks/tasks/coqa.py b/src/lighteval/tasks/tasks/coqa.py
new file mode 100644
index 000000000..a11b6a7a1
--- /dev/null
+++ b/src/lighteval/tasks/tasks/coqa.py
@@ -0,0 +1,45 @@
+"""
+name:
+CoQA
+
+dataset:
+stanfordnlp/coqa
+
+abstract:
+CoQA is a large-scale dataset for building Conversational Question Answering
+systems. The goal of the CoQA challenge is to measure the ability of machines to
+understand a text passage and answer a series of interconnected questions that
+appear in a conversation.
+
+languages:
+english
+
+tags:
+dialog, qa
+
+paper:
+https://arxiv.org/abs/1808.07042
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+coqa_first_question = LightevalTaskConfig(
+ name="coqa",
+ prompt_function=prompt.coqa,
+ suite=["lighteval"],
+ hf_repo="stanfordnlp/coqa",
+ hf_subset="default",
+ hf_avail_splits=["train", "validation"],
+ evaluation_splits=["validation"],
+ stop_sequence=["\n", "Question:", "question:"],
+ generation_size=100,
+ version=1,
+ metrics=[Metrics.exact_match],
+)
+
+TASKS_TABLE = [
+ coqa_first_question,
+]
diff --git a/src/lighteval/tasks/tasks/covid_dialogue.py b/src/lighteval/tasks/tasks/covid_dialogue.py
new file mode 100644
index 000000000..bce5e17ce
--- /dev/null
+++ b/src/lighteval/tasks/tasks/covid_dialogue.py
@@ -0,0 +1,45 @@
+"""
+name:
+Covid Dialogue
+
+dataset:
+lighteval/covid_dialogue
+
+abstract:
+The COVID-19 Dialogue dataset is a collection of 500+ dialogues between
+doctors and patients during the COVID-19 pandemic.
+
+languages:
+english
+
+tags:
+dialog, medical
+
+paper:
+https://arxiv.org/abs/2004.06561
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+covid_dialogue = LightevalTaskConfig(
+ name="covid_dialogue",
+ suite=["lighteval"],
+ prompt_function=prompt.covid_dialogue,
+ hf_repo="lighteval/covid_dialogue",
+ hf_subset="default",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=128,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ covid_dialogue,
+]
diff --git a/community_tasks/custom_task_classification_grammar_task.py b/src/lighteval/tasks/tasks/custom_task_classification_grammar_task.py
similarity index 86%
rename from community_tasks/custom_task_classification_grammar_task.py
rename to src/lighteval/tasks/tasks/custom_task_classification_grammar_task.py
index 5b248093b..04a715149 100644
--- a/community_tasks/custom_task_classification_grammar_task.py
+++ b/src/lighteval/tasks/tasks/custom_task_classification_grammar_task.py
@@ -1,59 +1,21 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-# ruff: noqa: F405, F403, F401
-"""Emotion Classification Task with Grammar Constraints using LightEval
-
-This module demonstrates how to create a classification task in LightEval with JSON grammar-constrained generation for structured responses.
-
-
-The task performs emotion classification on the 'emotion' dataset from HuggingFace Hub,
-classifying text into one of six emotion categories: sadness, joy, love, anger, fear, surprise.
-
-Example usage:
- TGI endpoint evaluation:
- ```bash
- uv run --active --extra litellm --extra tgi lighteval endpoint tgi examples/model_configs/tgi_model.yaml "custom|emotion_classification|0"
- --custom-tasks examples/custom_tasks_templates/custom_task_classification_grammar_task.py
- --output-dir results
- --save-details
- --no-public-run
- ```
-
-Dataset:
- The task uses the 'emotion' dataset from HuggingFace Hub, which contains
- English Twitter messages labeled with one of six emotions. The dataset
- includes train/validation/test splits with the following distribution:
- - Total samples: ~416k (train: ~16k, validation: ~2k, test: ~2k)
- - Labels: sadness, joy, love, anger, fear, surprise
- - Text format: Short social media posts in English
-
-Customization:
- To adapt this task for other classification problems:
- 1. Update EMOTION_LABELS with your target labels
- 2. Modify prompt_emotion_classification() for your use case
- 3. Update the grammar schema in get_emotion_classification_grammar()
- 4. Adjust the HuggingFace dataset reference in EMOTION_CLASSIFICATION_TASK
- 5. Update metric calculations in emotion_classification_metric() if needed
+"""
+name:
+Emotion Classification
+
+dataset:
+dair-ai/emotion
+
+abstract:
+This task performs emotion classification with grammar-constrained generation,
+assigning each text to one of six emotion categories: sadness, joy, love,
+anger, fear, surprise.
+
+languages:
+english
+
+tags:
+emotion, classification, multiple-choice
+
+paper:
"""
import json
diff --git a/src/lighteval/tasks/tasks/drop_qa.py b/src/lighteval/tasks/tasks/drop_qa.py
new file mode 100644
index 000000000..9e4b23bd7
--- /dev/null
+++ b/src/lighteval/tasks/tasks/drop_qa.py
@@ -0,0 +1,68 @@
+"""
+name:
+DROP
+
+dataset:
+lighteval/drop_harness
+
+abstract:
+DROP (Discrete Reasoning Over Paragraphs) is a reading-comprehension benchmark
+whose questions require discrete reasoning over the passage content, such as
+addition, counting, or sorting.
+
+languages:
+english
+
+tags:
+math, qa, reasoning
+
+paper:
+https://arxiv.org/abs/1903.00161
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+drop_qa = LightevalTaskConfig(
+ name="drop",
+ prompt_function=get_qa_prompt_function(
+ Language.ENGLISH,
+ lambda line: {
+ "context": line["passage"],
+ "question": line["question"],
+ "choices": list(
+ filter(
+ lambda x: x,
+ [line["answer"].get("number")]
+ + line["answer"]["spans"]
+ + [prompt.get_drop_date(line["answer"].get("date"))],
+ )
+ ),
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="lighteval/drop_harness",
+ hf_subset="default",
+ hf_filter=lambda line: list(
+ filter(
+ lambda x: x,
+ [line["answer"].get("number")]
+ + line["answer"]["spans"]
+ + [prompt.get_drop_date(line["answer"].get("date"))],
+ )
+ ),
+ evaluation_splits=("validation",),
+ few_shots_split="train",
+ generation_size=250,
+ stop_sequence=["Question:", "question:", "\n"],
+ metrics=[Metrics.exact_match],
+ version=1,
+)
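+
+# A DROP gold answer can be a number, one or more text spans, or a date; the prompt
+# formatter and hf_filter above keep whichever of these fields are non-empty, so rows
+# without any usable gold answer are dropped before evaluation.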
+
+TASKS_TABLE = [
+ drop_qa,
+]
diff --git a/src/lighteval/tasks/tasks/dyck_language.py b/src/lighteval/tasks/tasks/dyck_language.py
new file mode 100644
index 000000000..ff2e536ea
--- /dev/null
+++ b/src/lighteval/tasks/tasks/dyck_language.py
@@ -0,0 +1,80 @@
+"""
+name:
+Dyck Language
+
+dataset:
+lighteval/DyckLanguage
+
+abstract:
+Scenario testing hierarchical reasoning through the Dyck formal languages.
+
+languages:
+english
+
+tags:
+reasoning
+
+paper:
+https://aclanthology.org/W19-3905/
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+dyck_language_2 = LightevalTaskConfig(
+ name="dyck_language:2",
+ suite=["lighteval"],
+ prompt_function=prompt.dyck_language,
+ hf_repo="lighteval/DyckLanguage",
+ hf_subset="2",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+dyck_language_3 = LightevalTaskConfig(
+ name="dyck_language:3",
+ suite=["lighteval"],
+ prompt_function=prompt.dyck_language,
+ hf_repo="lighteval/DyckLanguage",
+ hf_subset="3",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+dyck_language_4 = LightevalTaskConfig(
+ name="dyck_language:4",
+ suite=["lighteval"],
+ prompt_function=prompt.dyck_language,
+ hf_repo="lighteval/DyckLanguage",
+ hf_subset="4",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ dyck_language_2,
+ dyck_language_3,
+ dyck_language_4,
+]
diff --git a/src/lighteval/tasks/tasks/entity_data_imputation.py b/src/lighteval/tasks/tasks/entity_data_imputation.py
new file mode 100644
index 000000000..309e0585d
--- /dev/null
+++ b/src/lighteval/tasks/tasks/entity_data_imputation.py
@@ -0,0 +1,66 @@
+"""
+name:
+Entity Data Imputation
+
+dataset:
+lighteval/Buy, lighteval/Restaurant
+
+abstract:
+Scenario that tests the ability to impute missing values in a data table.
+
+languages:
+english
+
+tags:
+reasoning
+
+paper:
+https://ieeexplore.ieee.org/document/9458712
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+entity_data_imputation_Buy = LightevalTaskConfig(
+ name="entity_data_imputation:Buy",
+ suite=["lighteval"],
+ prompt_function=prompt.entity_data_imputation,
+ hf_repo="lighteval/Buy",
+ hf_subset="default",
+ hf_avail_splits=["train", "test", "valid"],
+ evaluation_splits=["valid", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+entity_data_imputation_Restaurant = LightevalTaskConfig(
+ name="entity_data_imputation:Restaurant",
+ suite=["lighteval"],
+ prompt_function=prompt.entity_data_imputation,
+ hf_repo="lighteval/Restaurant",
+ hf_subset="default",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ entity_data_imputation_Buy,
+ entity_data_imputation_Restaurant,
+]
diff --git a/src/lighteval/tasks/tasks/entitymatching.py b/src/lighteval/tasks/tasks/entitymatching.py
new file mode 100644
index 000000000..c251244b2
--- /dev/null
+++ b/src/lighteval/tasks/tasks/entitymatching.py
@@ -0,0 +1,248 @@
+"""
+name:
+Entity Matching
+
+dataset:
+lighteval/EntityMatching
+
+abstract:
+Entity matching benchmark: given two structured records, decide whether they
+refer to the same real-world entity.
+
+languages:
+english
+
+tags:
+classification, reasoning
+
+paper:
+https://dl.acm.org/doi/10.14778/3007263.3007314
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+entity_matching_Abt_Buy = LightevalTaskConfig(
+ name="entity_matching:Abt_Buy",
+ suite=["lighteval"],
+ prompt_function=prompt.entity_matching,
+ hf_repo="lighteval/EntityMatching",
+ hf_subset="Abt_Buy",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+entity_matching_Amazon_Google = LightevalTaskConfig(
+ name="entity_matching:Amazon_Google",
+ suite=["lighteval"],
+ prompt_function=prompt.entity_matching,
+ hf_repo="lighteval/EntityMatching",
+ hf_subset="Amazon_Google",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+entity_matching_Beer = LightevalTaskConfig(
+ name="entity_matching:Beer",
+ suite=["lighteval"],
+ prompt_function=prompt.entity_matching,
+ hf_repo="lighteval/EntityMatching",
+ hf_subset="Beer",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+entity_matching_Company = LightevalTaskConfig(
+ name="entity_matching:Company",
+ suite=["lighteval"],
+ prompt_function=prompt.entity_matching,
+ hf_repo="lighteval/EntityMatching",
+ hf_subset="Company",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+entity_matching_DBLP_ACM = LightevalTaskConfig(
+ name="entity_matching:DBLP_ACM",
+ suite=["lighteval"],
+ prompt_function=prompt.entity_matching,
+ hf_repo="lighteval/EntityMatching",
+ hf_subset="DBLP_ACM",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+entity_matching_DBLP_GoogleScholar = LightevalTaskConfig(
+ name="entity_matching:DBLP_GoogleScholar",
+ suite=["lighteval"],
+ prompt_function=prompt.entity_matching,
+ hf_repo="lighteval/EntityMatching",
+ hf_subset="DBLP_GoogleScholar",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+entity_matching_Dirty_DBLP_ACM = LightevalTaskConfig(
+ name="entity_matching:Dirty_DBLP_ACM",
+ suite=["lighteval"],
+ prompt_function=prompt.entity_matching,
+ hf_repo="lighteval/EntityMatching",
+ hf_subset="Dirty_DBLP_ACM",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+entity_matching_Dirty_DBLP_GoogleScholar = LightevalTaskConfig(
+ name="entity_matching:Dirty_DBLP_GoogleScholar",
+ suite=["lighteval"],
+ prompt_function=prompt.entity_matching,
+ hf_repo="lighteval/EntityMatching",
+ hf_subset="Dirty_DBLP_GoogleScholar",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+entity_matching_Dirty_Walmart_Amazon = LightevalTaskConfig(
+ name="entity_matching:Dirty_Walmart_Amazon",
+ suite=["lighteval"],
+ prompt_function=prompt.entity_matching,
+ hf_repo="lighteval/EntityMatching",
+ hf_subset="Dirty_Walmart_Amazon",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+entity_matching_Dirty_iTunes_Amazon = LightevalTaskConfig(
+ name="entity_matching:Dirty_iTunes_Amazon",
+ suite=["lighteval"],
+ prompt_function=prompt.entity_matching,
+ hf_repo="lighteval/EntityMatching",
+ hf_subset="Dirty_iTunes_Amazon",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+entity_matching_Fodors_Zagats = LightevalTaskConfig(
+    name="entity_matching:Fodors_Zagats",
+ suite=["lighteval"],
+ prompt_function=prompt.entity_matching,
+ hf_repo="lighteval/EntityMatching",
+ hf_subset="Fodors_Zagats",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+entity_matching_Walmart_Amazon = LightevalTaskConfig(
+ name="entity_matching:Walmart_Amazon",
+ suite=["lighteval"],
+ prompt_function=prompt.entity_matching,
+ hf_repo="lighteval/EntityMatching",
+ hf_subset="Walmart_Amazon",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+entity_matching_iTunes_Amazon = LightevalTaskConfig(
+ name="entity_matching:iTunes_Amazon",
+ suite=["lighteval"],
+ prompt_function=prompt.entity_matching,
+ hf_repo="lighteval/EntityMatching",
+ hf_subset="iTunes_Amazon",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ entity_matching_Abt_Buy,
+ entity_matching_Amazon_Google,
+ entity_matching_Beer,
+ entity_matching_Company,
+ entity_matching_DBLP_ACM,
+ entity_matching_DBLP_GoogleScholar,
+ entity_matching_Dirty_DBLP_ACM,
+ entity_matching_Dirty_DBLP_GoogleScholar,
+ entity_matching_Dirty_Walmart_Amazon,
+ entity_matching_Dirty_iTunes_Amazon,
+ entity_matching_Fodors_Zagats,
+ entity_matching_Walmart_Amazon,
+ entity_matching_iTunes_Amazon,
+]
diff --git a/src/lighteval/tasks/tasks/ethics.py b/src/lighteval/tasks/tasks/ethics.py
new file mode 100644
index 000000000..bb45a2f2e
--- /dev/null
+++ b/src/lighteval/tasks/tasks/ethics.py
@@ -0,0 +1,113 @@
+"""
+name:
+Ethics
+
+dataset:
+lighteval/hendrycks_ethics
+
+abstract:
+The Ethics benchmark for evaluating the ability of language models to reason about
+ethical issues.
+
+languages:
+english
+
+tags:
+classification, ethics, justice, morality, utilitarianism, virtue
+
+paper:
+https://arxiv.org/abs/2008.02275
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+ethics_commonsense = LightevalTaskConfig(
+ name="ethics:commonsense",
+ suite=["lighteval"],
+ prompt_function=prompt.ethics_commonsense,
+ hf_repo="lighteval/hendrycks_ethics",
+ hf_subset="commonsense",
+ hf_avail_splits=["train", "validation", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+ethics_deontology = LightevalTaskConfig(
+ name="ethics:deontology",
+ suite=["lighteval"],
+ prompt_function=prompt.ethics_deontology,
+ hf_repo="lighteval/hendrycks_ethics",
+ hf_subset="deontology",
+ hf_avail_splits=["train", "validation", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+ethics_justice = LightevalTaskConfig(
+ name="ethics:justice",
+ suite=["lighteval"],
+ prompt_function=prompt.ethics_justice,
+ hf_repo="lighteval/hendrycks_ethics",
+ hf_subset="justice",
+ hf_avail_splits=["train", "validation", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+ethics_utilitarianism = LightevalTaskConfig(
+ name="ethics:utilitarianism",
+ suite=["lighteval"],
+ prompt_function=prompt.ethics_utilitarianism,
+ hf_repo="lighteval/hendrycks_ethics",
+ hf_subset="utilitarianism",
+ hf_avail_splits=["train", "validation", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+ethics_virtue = LightevalTaskConfig(
+ name="ethics:virtue",
+ suite=["lighteval"],
+ prompt_function=prompt.ethics_virtue,
+ hf_repo="lighteval/hendrycks_ethics",
+ hf_subset="virtue",
+ hf_avail_splits=["train", "validation", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ ethics_commonsense,
+ ethics_deontology,
+ ethics_justice,
+ ethics_utilitarianism,
+ ethics_virtue,
+]
diff --git a/src/lighteval/tasks/tasks/glue.py b/src/lighteval/tasks/tasks/glue.py
new file mode 100644
index 000000000..69b9c0dc3
--- /dev/null
+++ b/src/lighteval/tasks/tasks/glue.py
@@ -0,0 +1,317 @@
+"""
+name:
+GLUE
+
+dataset:
+nyu-mll/glue, aps/super_glue
+
+abstract:
+The General Language Understanding Evaluation (GLUE) benchmark is a collection
+of resources for training, evaluating, and analyzing natural language
+understanding systems.
+
+languages:
+english
+
+tags:
+classification, language-understanding
+
+paper:
+https://arxiv.org/abs/1804.07461
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+glue_cola = LightevalTaskConfig(
+ name="glue:cola",
+ suite=["lighteval"],
+ prompt_function=prompt.cola,
+ hf_repo="nyu-mll/glue",
+ hf_subset="cola",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc, Metrics.mcc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+glue_mnli = LightevalTaskConfig(
+ name="glue:mnli",
+ suite=["lighteval"],
+ prompt_function=prompt.mnli,
+ hf_repo="nyu-mll/glue",
+ hf_subset="mnli_matched",
+ hf_avail_splits=["train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+glue_mnli_mismatched = LightevalTaskConfig(
+ name="glue:mnli_mismatched",
+ suite=["lighteval"],
+ prompt_function=prompt.mnli,
+ hf_repo="nyu-mll/glue",
+ hf_subset="mnli_mismatched",
+ hf_avail_splits=["train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+glue_mrpc = LightevalTaskConfig(
+ name="glue:mrpc",
+ suite=["lighteval"],
+ prompt_function=prompt.mrpc,
+ hf_repo="nyu-mll/glue",
+ hf_subset="mrpc",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_f1],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+glue_qnli = LightevalTaskConfig(
+ name="glue:qnli",
+ suite=["lighteval"],
+ prompt_function=prompt.qnli,
+ hf_repo="nyu-mll/glue",
+ hf_subset="qnli",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+glue_qqp = LightevalTaskConfig(
+ name="glue:qqp",
+ suite=["lighteval"],
+ prompt_function=prompt.qqp,
+ hf_repo="nyu-mll/glue",
+ hf_subset="qqp",
+ hf_avail_splits=["train", "validation", "test"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_f1],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+glue_rte = LightevalTaskConfig(
+ name="glue:rte",
+ suite=["lighteval"],
+ prompt_function=prompt.rte,
+ hf_repo="nyu-mll/glue",
+ hf_subset="rte",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+glue_sst2 = LightevalTaskConfig(
+ name="glue:sst2",
+ suite=["lighteval"],
+ prompt_function=prompt.sst,
+ hf_repo="nyu-mll/glue",
+ hf_subset="sst2",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+glue_stsb = LightevalTaskConfig(
+ name="glue:stsb",
+ suite=["lighteval"],
+ prompt_function=prompt.stsb,
+ hf_repo="nyu-mll/glue",
+ hf_subset="stsb",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+glue_wnli = LightevalTaskConfig(
+ name="glue:wnli",
+ suite=["lighteval"],
+ prompt_function=prompt.wnli,
+ hf_repo="nyu-mll/glue",
+ hf_subset="wnli",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
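+# The SuperGLUE configs below are scored with loglikelihood_acc only; generation_size=-1
+# is assumed here to mean that no free-form generation is performed for them (assumption,
+# not verified against the runner).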
+super_glue_boolq = LightevalTaskConfig(
+ name="super_glue:boolq",
+ suite=["lighteval"],
+ prompt_function=prompt.boolq_harness,
+ hf_repo="aps/super_glue",
+ hf_subset="boolq",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+super_glue_cb = LightevalTaskConfig(
+ name="super_glue:cb",
+ suite=["lighteval"],
+ prompt_function=prompt.cb,
+ hf_repo="aps/super_glue",
+ hf_subset="cb",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc, Metrics.multi_f1_numeric],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+super_glue_copa = LightevalTaskConfig(
+ name="super_glue:copa",
+ suite=["lighteval"],
+ prompt_function=prompt.copa,
+ hf_repo="aps/super_glue",
+ hf_subset="copa",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+super_glue_rte = LightevalTaskConfig(
+ name="super_glue:rte",
+ suite=["lighteval"],
+ prompt_function=prompt.rte,
+ hf_repo="aps/super_glue",
+ hf_subset="rte",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+super_glue_multirc = LightevalTaskConfig(
+ name="super_glue:multirc",
+ suite=["lighteval"],
+ prompt_function=prompt.multirc,
+ hf_repo="aps/super_glue",
+ hf_subset="multirc",
+ hf_avail_splits=["train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+super_glue_wic = LightevalTaskConfig(
+ name="super_glue:wic",
+ suite=["lighteval"],
+ prompt_function=prompt.wic,
+ hf_repo="aps/super_glue",
+ hf_subset="wic",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+super_glue_wsc = LightevalTaskConfig(
+ name="super_glue:wsc",
+ suite=["lighteval"],
+ prompt_function=prompt.wsc,
+ hf_repo="aps/super_glue",
+ hf_subset="wsc",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ glue_cola,
+ glue_mnli,
+ glue_mnli_mismatched,
+ glue_mrpc,
+ glue_qnli,
+ glue_qqp,
+ glue_rte,
+ glue_sst2,
+ glue_stsb,
+ glue_wnli,
+ super_glue_boolq,
+ super_glue_cb,
+ super_glue_copa,
+ super_glue_rte,
+ super_glue_multirc,
+ super_glue_wic,
+ super_glue_wsc,
+]
diff --git a/src/lighteval/tasks/tasks/gpqa.py b/src/lighteval/tasks/tasks/gpqa.py
new file mode 100644
index 000000000..5d0e67bda
--- /dev/null
+++ b/src/lighteval/tasks/tasks/gpqa.py
@@ -0,0 +1,100 @@
+"""
+name:
+GPQA
+
+dataset:
+Idavidrein/gpqa
+
+abstract:
+GPQA is a dataset of 448 expert-written multiple-choice questions in biology,
+physics, and chemistry, designed to test graduate-level reasoning. The questions
+are extremely difficult—PhD-level experts score about 65%, skilled non-experts
+34% (even with web access), and GPT-4 around 39%. GPQA aims to support research
+on scalable oversight, helping humans evaluate and trust AI systems that may
+exceed human expertise.
+
+languages:
+english
+
+tags:
+biology, chemistry, graduate-level, multiple-choice, physics, qa, reasoning, science
+
+paper:
+https://arxiv.org/abs/2311.12022
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+gpqa = LightevalTaskConfig(
+ name="gpqa:mc",
+ suite=["lighteval"],
+ prompt_function=prompt.gpqa,
+ hf_repo="Idavidrein/gpqa",
+ hf_subset="gpqa_main",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+gpqa_diamond_instruct = LightevalTaskConfig(
+ name="gpqa:diamond",
+ suite=["lighteval"],
+ prompt_function=prompt.gpqa_instruct,
+ hf_repo="Idavidrein/gpqa",
+ hf_subset="gpqa_diamond",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=32768, # needed for reasoning models like R1
+ metrics=[Metrics.gpqa_instruct_pass_at_k(sample_params={"k": 1})],
+ stop_sequence=[], # no stop sequence, will use eos token
+ version=1,
+)
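+
+# gpqa:diamond is scored with a pass@k metric at k=1 over sampled answers, while the
+# extended and main variants below use the single-response gpqa_instruct_metric.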
+
+gpqa_extended_instruct = LightevalTaskConfig(
+ name="gpqa:extended",
+ suite=["lighteval"],
+ prompt_function=prompt.gpqa_instruct,
+ hf_repo="Idavidrein/gpqa",
+ hf_subset="gpqa_extended",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=32768, # needed for reasoning models like R1
+ metrics=[Metrics.gpqa_instruct_metric],
+ stop_sequence=[], # no stop sequence, will use eos token
+ version=0,
+)
+
+gpqa_main_instruct = LightevalTaskConfig(
+ name="gpqa:main",
+ suite=["lighteval"],
+ prompt_function=prompt.gpqa_instruct,
+ hf_repo="Idavidrein/gpqa",
+ hf_subset="gpqa_main",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=32768, # needed for reasoning models like R1
+ metrics=[Metrics.gpqa_instruct_metric],
+ stop_sequence=[], # no stop sequence, will use eos token
+ version=0,
+)
+
+TASKS_TABLE = [
+ gpqa,
+ gpqa_diamond_instruct,
+ gpqa_extended_instruct,
+ gpqa_main_instruct,
+]
diff --git a/src/lighteval/tasks/tasks/gsm8k.py b/src/lighteval/tasks/tasks/gsm8k.py
new file mode 100644
index 000000000..c4b5a51a6
--- /dev/null
+++ b/src/lighteval/tasks/tasks/gsm8k.py
@@ -0,0 +1,46 @@
+"""
+name:
+GSM8K
+
+dataset:
+openai/gsm8k
+
+abstract:
+GSM8K is a dataset of 8.5K high-quality, linguistically diverse grade school
+math word problems that require multi-step reasoning to solve.
+
+languages:
+english
+
+tags:
+math, reasoning
+
+paper:
+https://arxiv.org/abs/2110.14168
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+gsm8k = LightevalTaskConfig(
+ name="gsm8k",
+ suite=["lighteval"],
+ prompt_function=prompt.gsm8k,
+ hf_repo="openai/gsm8k",
+ hf_subset="main",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select="random_sampling_from_train",
+ generation_size=256,
+ metrics=[
+ Metrics.expr_gold_metric,
+ ],
+ stop_sequence=["Question:"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ gsm8k,
+]
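+
+# A registered task is referenced by suite, name, and few-shot count, e.g. "lighteval|gsm8k|0";
+# the exact task-spec and CLI syntax depend on the lighteval version and backend in use.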
diff --git a/src/lighteval/tasks/tasks/gsm_plus.py b/src/lighteval/tasks/tasks/gsm_plus.py
new file mode 100644
index 000000000..65afadef2
--- /dev/null
+++ b/src/lighteval/tasks/tasks/gsm_plus.py
@@ -0,0 +1,46 @@
+"""
+name:
+GSM-Plus
+
+dataset:
+qintongli/GSM-Plus
+
+abstract:
+GSM-Plus is an adversarial extension of GSM8K that tests the robustness of LLMs'
+mathematical reasoning by introducing varied perturbations to grade-school math
+problems.
+
+languages:
+english
+
+tags:
+math, reasoning
+
+paper:
+https://arxiv.org/abs/2402.19255
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+gsm_plus = LightevalTaskConfig(
+ name="gsm_plus",
+ suite=["lighteval"],
+ prompt_function=prompt.gsm_plus,
+ hf_repo="qintongli/GSM-Plus",
+ hf_subset="default",
+ hf_avail_splits=["test", "testmini"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.expr_gold_metric],
+ stop_sequence=None,
+ version=0,
+)
+
+TASKS_TABLE = [
+ gsm_plus,
+]
diff --git a/src/lighteval/tasks/tasks/headqa.py b/src/lighteval/tasks/tasks/headqa.py
new file mode 100644
index 000000000..2d7eb36ea
--- /dev/null
+++ b/src/lighteval/tasks/tasks/headqa.py
@@ -0,0 +1,70 @@
+"""
+name:
+HEAD-QA
+
+dataset:
+lighteval/headqa_harness
+
+abstract:
+HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams used
+to gain access to specialized positions in the Spanish healthcare system, and are
+challenging even for highly specialized humans. They are designed by the
+Ministerio de Sanidad, Consumo y Bienestar Social, which also provides direct
+access to the exams of the last five years.
+
+languages:
+english, spanish
+
+tags:
+health, medical, multiple-choice, qa
+
+paper:
+https://arxiv.org/abs/1906.04701
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+headqa_en = LightevalTaskConfig(
+ name="headqa:en",
+ suite=["lighteval"],
+ prompt_function=prompt.headqa,
+ hf_repo="lighteval/headqa_harness",
+ hf_subset="en",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+headqa_es = LightevalTaskConfig(
+ name="headqa:es",
+ suite=["lighteval"],
+ prompt_function=prompt.headqa,
+ hf_repo="lighteval/headqa_harness",
+ hf_subset="es",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ headqa_en,
+ headqa_es,
+]
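+
+# Scoring note (sketch): Metrics.loglikelihood_acc ranks the answer choices by
+# the log-probability the model assigns to each one instead of generating text,
+# which is why generation_size is left at -1 here. Conceptually (pseudo-code,
+# field names taken from the Doc API):
+#
+#   pred_idx = max(range(len(doc.choices)), key=lambda i: choice_logprobs[i])
+#   accuracy = float(pred_idx == doc.gold_index)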
diff --git a/src/lighteval/tasks/tasks/hellaswag.py b/src/lighteval/tasks/tasks/hellaswag.py
new file mode 100644
index 000000000..76e02fee0
--- /dev/null
+++ b/src/lighteval/tasks/tasks/hellaswag.py
@@ -0,0 +1,47 @@
+"""
+name:
+HellaSwag
+
+dataset:
+Rowan/hellaswag
+
+abstract:
+HellaSwag is a commonsense inference benchmark designed to challenge language
+models with adversarially filtered multiple-choice questions.
+
+languages:
+english
+
+tags:
+multiple-choice, narrative, reasoning
+
+paper:
+https://arxiv.org/abs/1905.07830
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+hellaswag = LightevalTaskConfig(
+ name="hellaswag",
+ suite=["lighteval"],
+ prompt_function=prompt.hellaswag_generative,
+ hf_repo="Rowan/hellaswag",
+ hf_subset="default",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ hellaswag,
+]
diff --git a/src/lighteval/tasks/extended/hle/main.py b/src/lighteval/tasks/tasks/hle/main.py
similarity index 85%
rename from src/lighteval/tasks/extended/hle/main.py
rename to src/lighteval/tasks/tasks/hle/main.py
index 1e2540984..c22dcaf72 100644
--- a/src/lighteval/tasks/extended/hle/main.py
+++ b/src/lighteval/tasks/tasks/hle/main.py
@@ -1,25 +1,25 @@
-# MIT License
+"""
+name:
+Humanity's Last Exam
-# Copyright (c) 2024 The HuggingFace Team
+dataset:
+cais/hle
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
+abstract:
+Humanity's Last Exam (HLE) is a global collaborative effort, with questions from
+nearly 1,000 subject-expert contributors affiliated with over 500 institutions
+across 50 countries, composed mostly of professors, researchers, and graduate
+degree holders.
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+languages:
+english
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
+tags:
+qa, reasoning, general-knowledge
+paper:
+https://arxiv.org/abs/2501.14249
+"""
import logging
import math
@@ -47,8 +47,7 @@ class ExtractedAnswer(BaseModel):
strict: Literal[True] # 100% reliability
-"""Adaptation from https://github.com/centerforaisafety/hle/blob/main/hle_eval/run_judge_results.py
-"""
+# Adaptation from https://github.com/centerforaisafety/hle/blob/main/hle_eval/run_judge_results.py
def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
diff --git a/src/lighteval/tasks/extended/ifbench/evaluation_lib.py b/src/lighteval/tasks/tasks/ifbench/evaluation_lib.py
similarity index 98%
rename from src/lighteval/tasks/extended/ifbench/evaluation_lib.py
rename to src/lighteval/tasks/tasks/ifbench/evaluation_lib.py
index 493362866..2c4b761e8 100644
--- a/src/lighteval/tasks/extended/ifbench/evaluation_lib.py
+++ b/src/lighteval/tasks/tasks/ifbench/evaluation_lib.py
@@ -20,7 +20,7 @@
import json
from typing import Dict, Optional, Union
-import lighteval.tasks.extended.ifbench.instructions_registry as instructions_registry
+import lighteval.tasks.tasks.ifbench.instructions_registry as instructions_registry
@dataclasses.dataclass
diff --git a/src/lighteval/tasks/extended/ifbench/instructions.py b/src/lighteval/tasks/tasks/ifbench/instructions.py
similarity index 99%
rename from src/lighteval/tasks/extended/ifbench/instructions.py
rename to src/lighteval/tasks/tasks/ifbench/instructions.py
index 0c4f0a9a0..f691a26f8 100644
--- a/src/lighteval/tasks/extended/ifbench/instructions.py
+++ b/src/lighteval/tasks/tasks/ifbench/instructions.py
@@ -23,7 +23,6 @@
import unicodedata
from collections import Counter
-import emoji
import nltk
from lighteval.utils.imports import is_package_available, requires
@@ -35,7 +34,10 @@
if is_package_available("spacy"):
import spacy
-import lighteval.tasks.extended.ifeval.instructions_utils as instructions_util
+if is_package_available("emoji"):
+ import emoji
+
+import lighteval.tasks.tasks.ifeval.instructions_utils as instructions_util
logger = logging.getLogger(__name__)
diff --git a/src/lighteval/tasks/extended/ifbench/instructions_registry.py b/src/lighteval/tasks/tasks/ifbench/instructions_registry.py
similarity index 98%
rename from src/lighteval/tasks/extended/ifbench/instructions_registry.py
rename to src/lighteval/tasks/tasks/ifbench/instructions_registry.py
index b47494dd2..b146bd06d 100644
--- a/src/lighteval/tasks/extended/ifbench/instructions_registry.py
+++ b/src/lighteval/tasks/tasks/ifbench/instructions_registry.py
@@ -14,7 +14,7 @@
"""Registry of all instructions."""
-import lighteval.tasks.extended.ifbench.instructions as instructions
+import lighteval.tasks.tasks.ifbench.instructions as instructions
INSTRUCTION_DICT = {
diff --git a/src/lighteval/tasks/extended/ifbench/main.py b/src/lighteval/tasks/tasks/ifbench/main.py
similarity index 75%
rename from src/lighteval/tasks/extended/ifbench/main.py
rename to src/lighteval/tasks/tasks/ifbench/main.py
index 6f948203a..419c86600 100644
--- a/src/lighteval/tasks/extended/ifbench/main.py
+++ b/src/lighteval/tasks/tasks/ifbench/main.py
@@ -1,25 +1,22 @@
-# MIT License
+"""
+name:
+IFBench
-# Copyright (c) 2024 The HuggingFace Team
+dataset:
+allenai/IFBench_test, allenai/IFBench_multi-turn
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
+abstract:
+A challenging benchmark for precise instruction following.
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+languages:
+english
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
+tags:
+instruction-following
+paper:
+https://arxiv.org/abs/2507.02833
+"""
import numpy as np
from aenum import extend_enum
@@ -30,9 +27,9 @@
SampleLevelMetricGrouping,
)
from lighteval.models.model_output import ModelResponse
-from lighteval.tasks.extended.ifbench import evaluation_lib
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc, SamplingMethod
+from lighteval.tasks.tasks.ifbench import evaluation_lib
def ifbench_prompt(line, task_name: str = ""):
@@ -104,7 +101,7 @@ def agg_inst_level_acc(items):
ifbench_test = LightevalTaskConfig(
name="ifbench_test",
prompt_function=ifbench_prompt,
- suite=["extended"],
+ suite=["lighteval"],
hf_repo="allenai/IFBench_test",
hf_subset="default",
metrics=[ifbench_metrics],
@@ -121,7 +118,7 @@ def agg_inst_level_acc(items):
ifbench_multiturn = LightevalTaskConfig(
name="ifbench_multiturn",
prompt_function=ifbench_prompt,
- suite=["extended"],
+ suite=["lighteval"],
hf_repo="allenai/IFBench_multi-turn",
hf_subset="default",
metrics=[ifbench_metrics],
diff --git a/src/lighteval/tasks/extended/ifeval/instructions.py b/src/lighteval/tasks/tasks/ifeval/instructions.py
similarity index 99%
rename from src/lighteval/tasks/extended/ifeval/instructions.py
rename to src/lighteval/tasks/tasks/ifeval/instructions.py
index 06b7cf85c..70a87e893 100644
--- a/src/lighteval/tasks/extended/ifeval/instructions.py
+++ b/src/lighteval/tasks/tasks/ifeval/instructions.py
@@ -27,7 +27,7 @@
if is_package_available("langdetect"):
import langdetect
-import lighteval.tasks.extended.ifeval.instructions_utils as instructions_util
+import lighteval.tasks.tasks.ifeval.instructions_utils as instructions_util
logger = logging.getLogger(__name__)
diff --git a/src/lighteval/tasks/extended/ifeval/instructions_registry.py b/src/lighteval/tasks/tasks/ifeval/instructions_registry.py
similarity index 99%
rename from src/lighteval/tasks/extended/ifeval/instructions_registry.py
rename to src/lighteval/tasks/tasks/ifeval/instructions_registry.py
index 62becfbaa..4dada73d4 100644
--- a/src/lighteval/tasks/extended/ifeval/instructions_registry.py
+++ b/src/lighteval/tasks/tasks/ifeval/instructions_registry.py
@@ -14,7 +14,7 @@
"""Registry of all instructions."""
-import lighteval.tasks.extended.ifeval.instructions as instructions
+import lighteval.tasks.tasks.ifeval.instructions as instructions
_KEYWORD = "keywords:"
diff --git a/src/lighteval/tasks/extended/ifeval/instructions_utils.py b/src/lighteval/tasks/tasks/ifeval/instructions_utils.py
similarity index 100%
rename from src/lighteval/tasks/extended/ifeval/instructions_utils.py
rename to src/lighteval/tasks/tasks/ifeval/instructions_utils.py
diff --git a/src/lighteval/tasks/extended/ifeval/main.py b/src/lighteval/tasks/tasks/ifeval/main.py
similarity index 79%
rename from src/lighteval/tasks/extended/ifeval/main.py
rename to src/lighteval/tasks/tasks/ifeval/main.py
index ae7d42809..2922e5fb6 100644
--- a/src/lighteval/tasks/extended/ifeval/main.py
+++ b/src/lighteval/tasks/tasks/ifeval/main.py
@@ -1,29 +1,27 @@
-# MIT License
+"""
+name:
+IFEval
-# Copyright (c) 2024 The HuggingFace Team
+dataset:
+google/IFEval
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
+abstract:
+A task with no single reference output: instead of comparing against a gold
+answer, we test whether the model's response obeys a set of verifiable
+formatting and content instructions.
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+languages:
+english
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
+tags:
+instruction-following
+paper:
+https://arxiv.org/abs/2311.07911
+"""
import numpy as np
-import lighteval.tasks.extended.ifeval.instructions_registry as instructions_registry
+import lighteval.tasks.tasks.ifeval.instructions_registry as instructions_registry
from lighteval.metrics.metrics_sample import SampleLevelComputation
from lighteval.metrics.utils.metric_utils import (
SampleLevelMetricGrouping,
@@ -149,7 +147,7 @@ def agg_inst_level_acc(items):
ifeval = LightevalTaskConfig(
name="ifeval",
prompt_function=ifeval_prompt,
- suite=["extended"],
+ suite=["lighteval"],
hf_repo="google/IFEval",
hf_subset="default",
metrics=[ifeval_metrics],
diff --git a/src/lighteval/tasks/tasks/imdb.py b/src/lighteval/tasks/tasks/imdb.py
new file mode 100644
index 000000000..e7073699e
--- /dev/null
+++ b/src/lighteval/tasks/tasks/imdb.py
@@ -0,0 +1,67 @@
+"""
+name:
+IMDB
+
+dataset:
+lighteval/IMDB_helm
+
+abstract:
+The IMDB benchmark for sentiment analysis of movie reviews, introduced in
+"Learning Word Vectors for Sentiment Analysis".
+
+languages:
+english
+
+tags:
+classification
+
+paper:
+https://aclanthology.org/P11-1015/
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+imdb = LightevalTaskConfig(
+ name="imdb",
+ suite=["lighteval"],
+ prompt_function=prompt.imdb,
+ hf_repo="lighteval/IMDB_helm",
+ hf_subset="default",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+imdb_contrastset = LightevalTaskConfig(
+ name="imdb:contrastset",
+ suite=["lighteval"],
+ prompt_function=prompt.imdb_contrastset,
+ hf_repo="lighteval/IMDB_helm",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ imdb,
+ imdb_contrastset,
+]
diff --git a/src/lighteval/tasks/tasks/jeopardy.py b/src/lighteval/tasks/tasks/jeopardy.py
new file mode 100644
index 000000000..5044602fe
--- /dev/null
+++ b/src/lighteval/tasks/tasks/jeopardy.py
@@ -0,0 +1,48 @@
+"""
+name:
+Jeopardy
+
+dataset:
+openaccess-ai-collective/jeopardy
+
+abstract:
+Jeopardy is a dataset of questions and answers from the Jeopardy game show.
+
+languages:
+english
+
+tags:
+knowledge, qa
+
+paper:
+"""
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+jeopardy = LightevalTaskConfig(
+ name="jeopardy",
+ prompt_function=get_qa_prompt_function(
+ Language.ENGLISH,
+ lambda line: {
+ "question": line["question"],
+ "choices": [line["answer"]],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="openaccess-ai-collective/jeopardy",
+ hf_subset="default",
+ evaluation_splits=("train",),
+ few_shots_split="train",
+ generation_size=250,
+ stop_sequence=["\n", "Question:", "question:"],
+ metrics=[Metrics.exact_match],
+ version=1,
+)
+
+TASKS_TABLE = [
+ jeopardy,
+]
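+
+# The get_qa_prompt_function template above adapts raw dataset columns to a
+# generic QA prompt. A dataset with different column names would only need a
+# different adapter lambda (sketch; "query" and "gold" are made-up column names):
+#
+#   prompt_function=get_qa_prompt_function(
+#       Language.ENGLISH,
+#       lambda line: {"question": line["query"], "choices": [line["gold"]]},
+#   )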
diff --git a/src/lighteval/tasks/tasks/lambada.py b/src/lighteval/tasks/tasks/lambada.py
new file mode 100644
index 000000000..3a7292a3f
--- /dev/null
+++ b/src/lighteval/tasks/tasks/lambada.py
@@ -0,0 +1,65 @@
+"""
+name:
+LAMBADA
+
+dataset:
+cimec/lambada
+
+abstract:
+LAMBADA is a benchmark for testing language models’ ability to understand broad
+narrative context. Each passage requires predicting its final word—easy for
+humans given the full passage but impossible from just the last sentence.
+Success demands long-range discourse comprehension.
+
+languages:
+english
+
+tags:
+language-modeling
+
+paper:
+https://arxiv.org/abs/1606.06031
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+lambada_standard = LightevalTaskConfig(
+ name="lambada:standard",
+ suite=["lighteval"],
+ prompt_function=prompt.lambada,
+ hf_repo="cimec/lambada",
+ hf_subset="plain_text",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=10,
+ metrics=[Metrics.target_perplexity],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+lambada_standard_cloze = LightevalTaskConfig(
+ name="lambada:standard_cloze",
+ suite=["lighteval"],
+ prompt_function=prompt.lambada_cloze,
+ hf_repo="cimec/lambada",
+ hf_subset="plain_text",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=10,
+ metrics=[Metrics.target_perplexity],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ lambada_standard,
+ lambada_standard_cloze,
+]
diff --git a/src/lighteval/tasks/extended/lcb/codegen_metrics.py b/src/lighteval/tasks/tasks/lcb/codegen_metrics.py
similarity index 94%
rename from src/lighteval/tasks/extended/lcb/codegen_metrics.py
rename to src/lighteval/tasks/tasks/lcb/codegen_metrics.py
index 08246806a..e2617ed44 100644
--- a/src/lighteval/tasks/extended/lcb/codegen_metrics.py
+++ b/src/lighteval/tasks/tasks/lcb/codegen_metrics.py
@@ -1,28 +1,16 @@
-# MIT License
-
-# Copyright (c) 2025 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-"""This module contains helper functions copied and modified from
-https://github.com/LiveCodeBench/LiveCodeBench
-and
-https://github.com/QwenLM/Qwen2.5-Coder/tree/main/qwencoder-eval/instruct/livecode_bench
+"""
+name:
+Codegen Metrics
+
+dataset:
+
+abstract:
+Helper functions for scoring LiveCodeBench code generation, copied and modified
+from https://github.com/LiveCodeBench/LiveCodeBench and
+https://github.com/QwenLM/Qwen2.5-Coder/tree/main/qwencoder-eval/instruct/livecode_bench.
+
+languages:
+
+tags:
+
+paper:
"""
import ast
diff --git a/src/lighteval/tasks/extended/lcb/main.py b/src/lighteval/tasks/tasks/lcb/main.py
similarity index 75%
rename from src/lighteval/tasks/extended/lcb/main.py
rename to src/lighteval/tasks/tasks/lcb/main.py
index 299ae9073..0f2f5d52e 100644
--- a/src/lighteval/tasks/extended/lcb/main.py
+++ b/src/lighteval/tasks/tasks/lcb/main.py
@@ -1,32 +1,24 @@
-# MIT License
-
-# Copyright (c) 2025 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-"""Usage:
-lighteval vllm \
- "pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,dtype=bfloat16,data_parallel_size=8,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={temperature:0.6,top_p:0.95}" \
- "extended|lcb:codegeneration|0"
-
-lighteval vllm \
- "pretrained=Qwen/Qwen2.5-Coder-3B-Instruct,dtype=bfloat16,data_parallel_size=8,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={temperature:0.2,top_p:0.95}" \
- "extended|lcb:codegeneration|0"
+"""
+name:
+Live Code Bench
+
+dataset:
+lighteval/code_generation_lite
+
+abstract:
+LiveCodeBench collects problems from periodic contests on the LeetCode, AtCoder,
+and Codeforces platforms and uses them to construct a holistic benchmark for
+evaluating code LLMs across a variety of code-related scenarios continuously over
+time.
+
+languages:
+english
+
+tags:
+code-generation
+
+paper:
+https://livecodebench.github.io/
"""
import json
@@ -38,13 +30,13 @@
from lighteval.metrics.metrics import Metrics, SampleLevelMetric
from lighteval.metrics.metrics_sample import SampleLevelComputation
from lighteval.models.model_output import ModelResponse
-from lighteval.tasks.extended.lcb.codegen_metrics import (
+from lighteval.tasks.lighteval_task import Doc, LightevalTaskConfig
+from lighteval.tasks.requests import SamplingMethod
+from lighteval.tasks.tasks.lcb.codegen_metrics import (
codegen_metrics,
extract_code,
translate_private_test_cases,
)
-from lighteval.tasks.lighteval_task import Doc, LightevalTaskConfig
-from lighteval.tasks.requests import SamplingMethod
def prepare_prompt(line: dict[str, Any]) -> str:
@@ -154,7 +146,7 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> dict:
name = "lcb:codegeneration" if subset == "v4_v5" else f"lcb:codegeneration_{subset}"
task = LightevalTaskConfig(
name=name,
- suite=["extended"],
+ suite=["lighteval"],
prompt_function=lcb_codegeneration_prompt_fn,
hf_repo="lighteval/code_generation_lite",
hf_subset=subset, # https://github.com/LiveCodeBench/LiveCodeBench/tree/main?tab=readme-ov-file#dataset-versions
diff --git a/src/lighteval/tasks/tasks/legal_summarization.py b/src/lighteval/tasks/tasks/legal_summarization.py
new file mode 100644
index 000000000..3e31b67ba
--- /dev/null
+++ b/src/lighteval/tasks/tasks/legal_summarization.py
@@ -0,0 +1,102 @@
+"""
+name:
+Legal Summarization
+
+dataset:
+lighteval/legal_summarization
+
+abstract:
+LegalSummarization is a benchmark for summarizing legal documents, covering the
+BillSum, EurLexSum, and MultiLexSum subsets.
+
+languages:
+english
+
+tags:
+legal, summarization
+
+paper:
+https://arxiv.org/abs/2210.13448
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+legal_summarization_billsum = LightevalTaskConfig(
+ name="legal_summarization:billsum",
+ suite=["lighteval"],
+ prompt_function=prompt.legal_summarization,
+ hf_repo="lighteval/legal_summarization",
+ hf_subset="BillSum",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1024,
+ metrics=[
+ Metrics.rouge1,
+ Metrics.rouge2,
+ Metrics.rougeL,
+ Metrics.faithfulness,
+ Metrics.extractiveness,
+ Metrics.bert_score,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+legal_summarization_eurlexsum = LightevalTaskConfig(
+ name="legal_summarization:eurlexsum",
+ suite=["lighteval"],
+ prompt_function=prompt.legal_summarization,
+ hf_repo="lighteval/legal_summarization",
+ hf_subset="EurLexSum",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=2048,
+ metrics=[
+ Metrics.rouge1,
+ Metrics.rouge2,
+ Metrics.rougeL,
+ Metrics.faithfulness,
+ Metrics.extractiveness,
+ Metrics.bert_score,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+legal_summarization_multilexsum = LightevalTaskConfig(
+ name="legal_summarization:multilexsum",
+ suite=["lighteval"],
+ prompt_function=prompt.multilexsum,
+ hf_repo="lighteval/legal_summarization",
+ hf_subset="MultiLexSum",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=256,
+ metrics=[
+ Metrics.rouge1,
+ Metrics.rouge2,
+ Metrics.rougeL,
+ Metrics.faithfulness,
+ Metrics.extractiveness,
+ Metrics.bert_score,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ legal_summarization_billsum,
+ legal_summarization_eurlexsum,
+ legal_summarization_multilexsum,
+]
diff --git a/src/lighteval/tasks/tasks/legalsupport.py b/src/lighteval/tasks/tasks/legalsupport.py
new file mode 100644
index 000000000..82ea8c864
--- /dev/null
+++ b/src/lighteval/tasks/tasks/legalsupport.py
@@ -0,0 +1,43 @@
+"""
+name:
+LegalSupport
+
+dataset:
+lighteval/LegalSupport
+
+abstract:
+Measures fine-grained legal reasoning through reverse entailment.
+
+languages:
+english
+
+tags:
+legal
+
+paper:
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+legalsupport = LightevalTaskConfig(
+ name="legalsupport",
+ suite=["lighteval"],
+ prompt_function=prompt.legal_support,
+ hf_repo="lighteval/LegalSupport",
+ hf_subset="default",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ legalsupport,
+]
diff --git a/src/lighteval/tasks/tasks/lexglue.py b/src/lighteval/tasks/tasks/lexglue.py
new file mode 100644
index 000000000..4206225a3
--- /dev/null
+++ b/src/lighteval/tasks/tasks/lexglue.py
@@ -0,0 +1,146 @@
+"""
+name:
+LexGLUE
+
+dataset:
+lighteval/lexglue
+
+abstract:
+LexGLUE: A Benchmark Dataset for Legal Language Understanding in English
+
+languages:
+english
+
+tags:
+classification, legal
+
+paper:
+https://arxiv.org/abs/2110.00976
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+lexglue_case_hold = LightevalTaskConfig(
+ name="lexglue:case_hold",
+ suite=["lighteval"],
+ prompt_function=prompt.lex_glue_case_hold,
+ hf_repo="lighteval/lexglue",
+ hf_subset="case_hold",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lexglue_ecthr_a = LightevalTaskConfig(
+ name="lexglue:ecthr_a",
+ suite=["lighteval"],
+ prompt_function=prompt.lex_glue_ecthr_a,
+ hf_repo="lighteval/lexglue",
+ hf_subset="ecthr_a",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lexglue_ecthr_b = LightevalTaskConfig(
+ name="lexglue:ecthr_b",
+ suite=["lighteval"],
+ prompt_function=prompt.lex_glue_ecthr_b,
+ hf_repo="lighteval/lexglue",
+ hf_subset="ecthr_b",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lexglue_eurlex = LightevalTaskConfig(
+ name="lexglue:eurlex",
+ suite=["lighteval"],
+ prompt_function=prompt.lex_glue_eurlex,
+ hf_repo="lighteval/lexglue",
+ hf_subset="eurlex",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lexglue_ledgar = LightevalTaskConfig(
+ name="lexglue:ledgar",
+ suite=["lighteval"],
+ prompt_function=prompt.lex_glue_ledgar,
+ hf_repo="lighteval/lexglue",
+ hf_subset="ledgar",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lexglue_scotus = LightevalTaskConfig(
+ name="lexglue:scotus",
+ suite=["lighteval"],
+ prompt_function=prompt.lex_glue_scotus,
+ hf_repo="lighteval/lexglue",
+ hf_subset="scotus",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lexglue_unfair_tos = LightevalTaskConfig(
+ name="lexglue:unfair_tos",
+ suite=["lighteval"],
+ prompt_function=prompt.lex_glue_unfair_tos,
+ hf_repo="lighteval/lexglue",
+ hf_subset="unfair_tos",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ lexglue_case_hold,
+ lexglue_ecthr_a,
+ lexglue_ecthr_b,
+ lexglue_eurlex,
+ lexglue_ledgar,
+ lexglue_scotus,
+ lexglue_unfair_tos,
+]
diff --git a/src/lighteval/tasks/tasks/lextreme.py b/src/lighteval/tasks/tasks/lextreme.py
new file mode 100644
index 000000000..7ba9df453
--- /dev/null
+++ b/src/lighteval/tasks/tasks/lextreme.py
@@ -0,0 +1,333 @@
+"""
+name:
+LEXTREME
+
+dataset:
+lighteval/lextreme
+
+abstract:
+LEXTREME: A Multi-Lingual and Multi-Task Benchmark for the Legal Domain
+
+languages:
+bulgarian, czech, danish, german, greek, english, spanish, estonian, finnish, french, irish, croatian, hungarian, italian, lithuanian, latvian, maltese, dutch, polish, portuguese, romanian, slovak, slovenian, swedish
+
+tags:
+classification, legal
+
+paper:
+https://arxiv.org/abs/2301.13126
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+lextreme_brazilian_court_decisions_judgment = LightevalTaskConfig(
+ name="lextreme:brazilian_court_decisions_judgment",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_brazilian_court_decisions_judgment,
+ hf_repo="lighteval/lextreme",
+ hf_subset="brazilian_court_decisions_judgment",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lextreme_brazilian_court_decisions_unanimity = LightevalTaskConfig(
+ name="lextreme:brazilian_court_decisions_unanimity",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_brazilian_court_decisions_unanimity,
+ hf_repo="lighteval/lextreme",
+ hf_subset="brazilian_court_decisions_unanimity",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lextreme_covid19_emergency_event = LightevalTaskConfig(
+ name="lextreme:covid19_emergency_event",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_covid19_emergency_event,
+ hf_repo="lighteval/lextreme",
+ hf_subset="covid19_emergency_event",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=10,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lextreme_german_argument_mining = LightevalTaskConfig(
+ name="lextreme:german_argument_mining",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_german_argument_mining,
+ hf_repo="lighteval/lextreme",
+ hf_subset="german_argument_mining",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lextreme_greek_legal_code_chapter = LightevalTaskConfig(
+ name="lextreme:greek_legal_code_chapter",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_greek_legal_code_chapter,
+ hf_repo="lighteval/lextreme",
+ hf_subset="greek_legal_code_chapter",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lextreme_greek_legal_code_subject = LightevalTaskConfig(
+ name="lextreme:greek_legal_code_subject",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_greek_legal_code_subject,
+ hf_repo="lighteval/lextreme",
+ hf_subset="greek_legal_code_subject",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lextreme_greek_legal_code_volume = LightevalTaskConfig(
+ name="lextreme:greek_legal_code_volume",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_greek_legal_code_volume,
+ hf_repo="lighteval/lextreme",
+ hf_subset="greek_legal_code_volume",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lextreme_greek_legal_ner = LightevalTaskConfig(
+ name="lextreme:greek_legal_ner",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_greek_legal_ner,
+ hf_repo="lighteval/lextreme",
+ hf_subset="greek_legal_ner",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=430,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lextreme_legalnero = LightevalTaskConfig(
+ name="lextreme:legalnero",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_legalnero,
+ hf_repo="lighteval/lextreme",
+ hf_subset="legalnero",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=788,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lextreme_lener_br = LightevalTaskConfig(
+ name="lextreme:lener_br",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_lener_br,
+ hf_repo="lighteval/lextreme",
+ hf_subset="lener_br",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=338,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lextreme_mapa_coarse = LightevalTaskConfig(
+ name="lextreme:mapa_coarse",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_mapa_coarse,
+ hf_repo="lighteval/lextreme",
+ hf_subset="mapa_coarse",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=274,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lextreme_mapa_fine = LightevalTaskConfig(
+ name="lextreme:mapa_fine",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_mapa_fine,
+ hf_repo="lighteval/lextreme",
+ hf_subset="mapa_fine",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=274,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lextreme_multi_eurlex_level_1 = LightevalTaskConfig(
+ name="lextreme:multi_eurlex_level_1",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_multi_eurlex_level_1,
+ hf_repo="lighteval/lextreme",
+ hf_subset="multi_eurlex_level_1",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=10,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lextreme_multi_eurlex_level_2 = LightevalTaskConfig(
+ name="lextreme:multi_eurlex_level_2",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_multi_eurlex_level_2,
+ hf_repo="lighteval/lextreme",
+ hf_subset="multi_eurlex_level_2",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=10,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lextreme_multi_eurlex_level_3 = LightevalTaskConfig(
+ name="lextreme:multi_eurlex_level_3",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_multi_eurlex_level_3,
+ hf_repo="lighteval/lextreme",
+ hf_subset="multi_eurlex_level_3",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=10,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lextreme_online_terms_of_service_clause_topics = LightevalTaskConfig(
+ name="lextreme:online_terms_of_service_clause_topics",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_online_terms_of_service_clause_topics,
+ hf_repo="lighteval/lextreme",
+ hf_subset="online_terms_of_service_clause_topics",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=10,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lextreme_online_terms_of_service_unfairness_levels = LightevalTaskConfig(
+ name="lextreme:online_terms_of_service_unfairness_levels",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_online_terms_of_service_unfairness_levels,
+ hf_repo="lighteval/lextreme",
+ hf_subset="online_terms_of_service_unfairness_levels",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=10,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lextreme_swiss_judgment_prediction = LightevalTaskConfig(
+ name="lextreme:swiss_judgment_prediction",
+ suite=["lighteval"],
+ prompt_function=prompt.lextreme_swiss_judgment_prediction,
+ hf_repo="lighteval/lextreme",
+ hf_subset="swiss_judgment_prediction",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ lextreme_brazilian_court_decisions_judgment,
+ lextreme_brazilian_court_decisions_unanimity,
+ lextreme_covid19_emergency_event,
+ lextreme_german_argument_mining,
+ lextreme_greek_legal_code_chapter,
+ lextreme_greek_legal_code_subject,
+ lextreme_greek_legal_code_volume,
+ lextreme_greek_legal_ner,
+ lextreme_legalnero,
+ lextreme_lener_br,
+ lextreme_mapa_coarse,
+ lextreme_mapa_fine,
+ lextreme_multi_eurlex_level_1,
+ lextreme_multi_eurlex_level_2,
+ lextreme_multi_eurlex_level_3,
+ lextreme_online_terms_of_service_clause_topics,
+ lextreme_online_terms_of_service_unfairness_levels,
+ lextreme_swiss_judgment_prediction,
+]
diff --git a/src/lighteval/tasks/tasks/logiqa.py b/src/lighteval/tasks/tasks/logiqa.py
new file mode 100644
index 000000000..2439ddf69
--- /dev/null
+++ b/src/lighteval/tasks/tasks/logiqa.py
@@ -0,0 +1,48 @@
+"""
+name:
+LogiQA
+
+dataset:
+lighteval/logiqa_harness
+
+abstract:
+LogiQA is a machine reading comprehension dataset focused on testing logical
+reasoning abilities. It contains 8,678 expert-written multiple-choice questions
+covering various types of deductive reasoning. While humans perform strongly,
+state-of-the-art models lag far behind, making LogiQA a benchmark for advancing
+logical reasoning in NLP systems.
+
+languages:
+english
+
+tags:
+qa
+
+paper:
+https://arxiv.org/abs/2007.08124
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+logiqa = LightevalTaskConfig(
+ name="logiqa",
+ suite=["lighteval"],
+ prompt_function=prompt.logiqa,
+ hf_repo="lighteval/logiqa_harness",
+ hf_subset="logiqa",
+ hf_avail_splits=["train", "validation", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ logiqa,
+]
diff --git a/src/lighteval/tasks/tasks/lsat_qa.py b/src/lighteval/tasks/tasks/lsat_qa.py
new file mode 100644
index 000000000..8d14fb86b
--- /dev/null
+++ b/src/lighteval/tasks/tasks/lsat_qa.py
@@ -0,0 +1,111 @@
+"""
+name:
+LSAT QA
+
+dataset:
+lighteval/lsat_qa
+
+abstract:
+Questions from the Law School Admission Test (LSAT).
+
+languages:
+english
+
+tags:
+legal, qa
+
+paper:
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+lsat_qa = LightevalTaskConfig(
+ name="lsat_qa",
+ suite=["lighteval"],
+ prompt_function=prompt.lsat_qa,
+ hf_repo="lighteval/lsat_qa",
+ hf_subset="all",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lsat_qa_assignment = LightevalTaskConfig(
+ name="lsat_qa:assignment",
+ suite=["lighteval"],
+ prompt_function=prompt.lsat_qa,
+ hf_repo="lighteval/lsat_qa",
+ hf_subset="assignment",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lsat_qa_grouping = LightevalTaskConfig(
+ name="lsat_qa:grouping",
+ suite=["lighteval"],
+ prompt_function=prompt.lsat_qa,
+ hf_repo="lighteval/lsat_qa",
+ hf_subset="grouping",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lsat_qa_miscellaneous = LightevalTaskConfig(
+ name="lsat_qa:miscellaneous",
+ suite=["lighteval"],
+ prompt_function=prompt.lsat_qa,
+ hf_repo="lighteval/lsat_qa",
+ hf_subset="miscellaneous",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+lsat_qa_ordering = LightevalTaskConfig(
+ name="lsat_qa:ordering",
+ suite=["lighteval"],
+ prompt_function=prompt.lsat_qa,
+ hf_repo="lighteval/lsat_qa",
+ hf_subset="ordering",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ lsat_qa,
+ lsat_qa_assignment,
+ lsat_qa_grouping,
+ lsat_qa_miscellaneous,
+ lsat_qa_ordering,
+]
diff --git a/src/lighteval/tasks/tasks/math.py b/src/lighteval/tasks/tasks/math.py
new file mode 100644
index 000000000..8ae7bd243
--- /dev/null
+++ b/src/lighteval/tasks/tasks/math.py
@@ -0,0 +1,209 @@
+"""
+name:
+MATH
+
+dataset:
+DigitalLearningGmbH/MATH-lighteval
+
+abstract:
+MATH is a dataset of 12,500 challenging competition mathematics problems, each
+with a full step-by-step solution.
+
+languages:
+english
+
+tags:
+math, reasoning
+
+paper:
+https://arxiv.org/abs/2103.03874
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.normalizations import math_normalizer
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+math_algebra = LightevalTaskConfig(
+ name="math:algebra",
+ suite=["lighteval"],
+ prompt_function=prompt.math,
+ hf_repo="DigitalLearningGmbH/MATH-lighteval",
+ hf_subset="algebra",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=2048,
+ metrics=[
+ Metrics.maj_at_n(
+ sample_params={
+ "n": 4,
+ "strip_strings": True,
+ "normalize_pred": math_normalizer,
+ "normalize_gold": math_normalizer,
+ }
+ ),
+ ],
+ stop_sequence=["\n"],
+ version=1,
+)
+
+math_counting_and_probability = LightevalTaskConfig(
+ name="math:counting_and_probability",
+ suite=["lighteval"],
+ prompt_function=prompt.math,
+ hf_repo="DigitalLearningGmbH/MATH-lighteval",
+ hf_subset="counting_and_probability",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=2048,
+ metrics=[
+ Metrics.maj_at_n(
+ sample_params={
+ "n": 4,
+ "strip_strings": True,
+ "normalize_pred": math_normalizer,
+ "normalize_gold": math_normalizer,
+ }
+ ),
+ ],
+ stop_sequence=["\n"],
+ version=1,
+)
+
+math_geometry = LightevalTaskConfig(
+ name="math:geometry",
+ suite=["lighteval"],
+ prompt_function=prompt.math,
+ hf_repo="DigitalLearningGmbH/MATH-lighteval",
+ hf_subset="geometry",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=2048,
+ metrics=[
+ Metrics.maj_at_n(
+ sample_params={
+ "n": 4,
+ "strip_strings": True,
+ "normalize_pred": math_normalizer,
+ "normalize_gold": math_normalizer,
+ }
+ ),
+ ],
+ stop_sequence=["\n"],
+ version=1,
+)
+
+math_intermediate_algebra = LightevalTaskConfig(
+ name="math:intermediate_algebra",
+ suite=["lighteval"],
+ prompt_function=prompt.math,
+ hf_repo="DigitalLearningGmbH/MATH-lighteval",
+ hf_subset="intermediate_algebra",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=2048,
+ metrics=[
+ Metrics.maj_at_n(
+ sample_params={
+ "n": 4,
+ "strip_strings": True,
+ "normalize_pred": math_normalizer,
+ "normalize_gold": math_normalizer,
+ }
+ ),
+ ],
+ stop_sequence=["\n"],
+ version=1,
+)
+
+math_number_theory = LightevalTaskConfig(
+ name="math:number_theory",
+ suite=["lighteval"],
+ prompt_function=prompt.math,
+ hf_repo="DigitalLearningGmbH/MATH-lighteval",
+ hf_subset="number_theory",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=2048,
+ metrics=[
+ Metrics.maj_at_n(
+ sample_params={
+ "n": 4,
+ "strip_strings": True,
+ "normalize_pred": math_normalizer,
+ "normalize_gold": math_normalizer,
+ }
+ ),
+ ],
+ stop_sequence=["\n"],
+ version=1,
+)
+
+math_prealgebra = LightevalTaskConfig(
+ name="math:prealgebra",
+ suite=["lighteval"],
+ prompt_function=prompt.math,
+ hf_repo="DigitalLearningGmbH/MATH-lighteval",
+ hf_subset="prealgebra",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=2048,
+ metrics=[
+ Metrics.maj_at_n(
+ sample_params={
+ "n": 4,
+ "strip_strings": True,
+ "normalize_pred": math_normalizer,
+ "normalize_gold": math_normalizer,
+ }
+ ),
+ ],
+ stop_sequence=["\n"],
+ version=1,
+)
+
+math_precalculus = LightevalTaskConfig(
+ name="math:precalculus",
+ suite=["lighteval"],
+ prompt_function=prompt.math,
+ hf_repo="DigitalLearningGmbH/MATH-lighteval",
+ hf_subset="precalculus",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=2048,
+ metrics=[
+ Metrics.maj_at_n(
+ sample_params={
+ "n": 4,
+ "strip_strings": True,
+ "normalize_pred": math_normalizer,
+ "normalize_gold": math_normalizer,
+ }
+ ),
+ ],
+ stop_sequence=["\n"],
+ version=1,
+)
+
+TASKS_TABLE = [
+ math_algebra,
+ math_counting_and_probability,
+ math_geometry,
+ math_intermediate_algebra,
+ math_number_theory,
+ math_prealgebra,
+ math_precalculus,
+]
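+
+# The seven configs above differ only in hf_subset; they are presumably kept
+# explicit for readability. An equivalent loop-based construction would be
+# (sketch):
+#
+#   MATH_SUBSETS = [
+#       "algebra", "counting_and_probability", "geometry", "intermediate_algebra",
+#       "number_theory", "prealgebra", "precalculus",
+#   ]
+#   TASKS_TABLE = [
+#       LightevalTaskConfig(
+#           name=f"math:{subset}",
+#           suite=["lighteval"],
+#           prompt_function=prompt.math,
+#           hf_repo="DigitalLearningGmbH/MATH-lighteval",
+#           hf_subset=subset,
+#           hf_avail_splits=["train", "test"],
+#           evaluation_splits=["test"],
+#           generation_size=2048,
+#           metrics=[
+#               Metrics.maj_at_n(
+#                   sample_params={
+#                       "n": 4,
+#                       "strip_strings": True,
+#                       "normalize_pred": math_normalizer,
+#                       "normalize_gold": math_normalizer,
+#                   }
+#               ),
+#           ],
+#           stop_sequence=["\n"],
+#           version=1,
+#       )
+#       for subset in MATH_SUBSETS
+#   ]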
diff --git a/src/lighteval/tasks/tasks/math_500.py b/src/lighteval/tasks/tasks/math_500.py
new file mode 100644
index 000000000..961250b5d
--- /dev/null
+++ b/src/lighteval/tasks/tasks/math_500.py
@@ -0,0 +1,46 @@
+"""
+name:
+MATH-500
+
+dataset:
+HuggingFaceH4/MATH-500
+
+abstract:
+This dataset contains a subset of 500 problems from the MATH benchmark that
+OpenAI created in their Let's Verify Step by Step paper.
+
+languages:
+english
+
+tags:
+math, reasoning
+
+paper:
+https://arxiv.org/abs/2305.20050
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+math_500 = LightevalTaskConfig(
+ name="math_500",
+ suite=["lighteval"],
+ prompt_function=prompt.math_500,
+ hf_repo="HuggingFaceH4/MATH-500",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=32768,
+ metrics=[
+ Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1}),
+ ],
+ version=2,
+)
+
+TASKS_TABLE = [
+ math_500,
+]
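+
+# Metric note (sketch): pass_at_k_math with k=1 and n=1 scores the single sampled
+# solution directly. Assuming the metric follows the standard unbiased pass@k
+# estimator (Chen et al., 2021), with n samples of which c are correct the
+# estimate is:
+#
+#   from math import comb
+#
+#   def pass_at_k(n: int, c: int, k: int) -> float:
+#       """Unbiased pass@k estimate from n samples with c correct."""
+#       if n - c < k:
+#           return 1.0
+#       return 1.0 - comb(n - c, k) / comb(n, k)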
diff --git a/src/lighteval/tasks/tasks/mathqa.py b/src/lighteval/tasks/tasks/mathqa.py
new file mode 100644
index 000000000..4eccd9a75
--- /dev/null
+++ b/src/lighteval/tasks/tasks/mathqa.py
@@ -0,0 +1,47 @@
+"""
+name:
+MathQA
+
+dataset:
+allenai/math_qa
+
+abstract:
+MathQA is a large-scale dataset of math word problems, built by annotating the
+AQuA-RAT dataset with fully-specified operational programs in a new
+representation language. AQuA-RAT provides the questions, options, rationales,
+and correct answers.
+
+languages:
+english
+
+tags:
+math, qa, reasoning
+
+paper:
+https://arxiv.org/abs/1905.13319
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+mathqa = LightevalTaskConfig(
+ name="mathqa",
+ suite=["lighteval"],
+ prompt_function=prompt.mathqa,
+ hf_repo="allenai/math_qa",
+ hf_subset="default",
+ hf_avail_splits=["train", "validation", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ mathqa,
+]
diff --git a/src/lighteval/tasks/tasks/med.py b/src/lighteval/tasks/tasks/med.py
new file mode 100644
index 000000000..49496dae3
--- /dev/null
+++ b/src/lighteval/tasks/tasks/med.py
@@ -0,0 +1,86 @@
+"""
+name:
+Med
+
+dataset:
+lighteval/med_mcqa, lighteval/med_paragraph_simplification, bigbio/med_qa
+
+abstract:
+Medical-domain tasks: MedMCQA (a large-scale, multi-subject multiple-choice
+question answering dataset), MedQA, and medical paragraph simplification.
+
+languages:
+english
+
+tags:
+health, medical
+
+paper:
+https://medmcqa.github.io/
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+med_mcqa = LightevalTaskConfig(
+ name="med_mcqa",
+ suite=["lighteval"],
+ prompt_function=prompt.med_mcqa,
+ hf_repo="lighteval/med_mcqa",
+ hf_subset="default",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+med_paragraph_simplification = LightevalTaskConfig(
+ name="med_paragraph_simplification",
+ suite=["lighteval"],
+ prompt_function=prompt.med_paragraph_simplification,
+ hf_repo="lighteval/med_paragraph_simplification",
+ hf_subset="default",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=512,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+med_qa = LightevalTaskConfig(
+ name="med_qa",
+ suite=["lighteval"],
+ prompt_function=prompt.med_qa,
+ hf_repo="bigbio/med_qa",
+ hf_subset="med_qa_en_source",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ med_mcqa,
+ med_paragraph_simplification,
+ med_qa,
+]
diff --git a/src/lighteval/tasks/tasks/med_dialog.py b/src/lighteval/tasks/tasks/med_dialog.py
new file mode 100644
index 000000000..70a7c08ee
--- /dev/null
+++ b/src/lighteval/tasks/tasks/med_dialog.py
@@ -0,0 +1,65 @@
+"""
+name:
+Med Dialog
+
+dataset:
+lighteval/med_dialog
+
+abstract:
+A collection of medical dialogue datasets.
+
+languages:
+english
+
+tags:
+dialog, health, medical
+
+paper:
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+med_dialog_healthcaremagic = LightevalTaskConfig(
+ name="med_dialog:healthcaremagic",
+ suite=["lighteval"],
+ prompt_function=prompt.med_dialog,
+ hf_repo="lighteval/med_dialog",
+ hf_subset="healthcaremagic",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=128,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+med_dialog_icliniq = LightevalTaskConfig(
+ name="med_dialog:icliniq",
+ suite=["lighteval"],
+ prompt_function=prompt.med_dialog,
+ hf_repo="lighteval/med_dialog",
+ hf_subset="icliniq",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=128,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ med_dialog_healthcaremagic,
+ med_dialog_icliniq,
+]
diff --git a/src/lighteval/tasks/tasks/mgsm.py b/src/lighteval/tasks/tasks/mgsm.py
new file mode 100644
index 000000000..e6391ec01
--- /dev/null
+++ b/src/lighteval/tasks/tasks/mgsm.py
@@ -0,0 +1,217 @@
+"""
+name:
+Mgsm
+
+dataset:
+juletxara/mgsm
+
+abstract:
+Multilingual Grade School Math (MGSM) is a benchmark of grade-school math word
+problems. The same 250 problems from GSM8K were each translated by human
+annotators into 10 other languages.
+
+languages:
+english, spanish, french, german, russian, chinese, japanese, thai, swahili, bengali, telugu
+
+tags:
+math, multilingual, reasoning
+
+paper:
+https://arxiv.org/abs/2210.03057
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
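+# One config per MGSM language; each uses a language-specific prompt function
+# (prompt.mgsm_<lang>) and scores exact match on the "test" split.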
+mgsm_en = LightevalTaskConfig(
+ name="mgsm:en",
+ suite=["lighteval"],
+ prompt_function=prompt.mgsm_en,
+ hf_repo="juletxara/mgsm",
+ hf_subset="en",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.exact_match],
+ stop_sequence=None,
+ version=0,
+)
+
+mgsm_es = LightevalTaskConfig(
+ name="mgsm:es",
+ suite=["lighteval"],
+ prompt_function=prompt.mgsm_es,
+ hf_repo="juletxara/mgsm",
+ hf_subset="es",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.exact_match],
+ stop_sequence=None,
+ version=0,
+)
+
+mgsm_fr = LightevalTaskConfig(
+ name="mgsm:fr",
+ suite=["lighteval"],
+ prompt_function=prompt.mgsm_fr,
+ hf_repo="juletxara/mgsm",
+ hf_subset="fr",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.exact_match],
+ stop_sequence=None,
+ version=0,
+)
+
+mgsm_de = LightevalTaskConfig(
+ name="mgsm:de",
+ suite=["lighteval"],
+ prompt_function=prompt.mgsm_de,
+ hf_repo="juletxara/mgsm",
+ hf_subset="de",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.exact_match],
+ stop_sequence=None,
+ version=0,
+)
+
+mgsm_ru = LightevalTaskConfig(
+ name="mgsm:ru",
+ suite=["lighteval"],
+ prompt_function=prompt.mgsm_ru,
+ hf_repo="juletxara/mgsm",
+ hf_subset="ru",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.exact_match],
+ stop_sequence=None,
+ version=0,
+)
+
+mgsm_zh = LightevalTaskConfig(
+ name="mgsm:zh",
+ suite=["lighteval"],
+ prompt_function=prompt.mgsm_zh,
+ hf_repo="juletxara/mgsm",
+ hf_subset="zh",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.exact_match],
+ stop_sequence=None,
+ version=0,
+)
+
+mgsm_ja = LightevalTaskConfig(
+ name="mgsm:ja",
+ suite=["lighteval"],
+ prompt_function=prompt.mgsm_ja,
+ hf_repo="juletxara/mgsm",
+ hf_subset="ja",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.exact_match],
+ stop_sequence=None,
+ version=0,
+)
+
+mgsm_th = LightevalTaskConfig(
+ name="mgsm:th",
+ suite=["lighteval"],
+ prompt_function=prompt.mgsm_th,
+ hf_repo="juletxara/mgsm",
+ hf_subset="th",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.exact_match],
+ stop_sequence=None,
+ version=0,
+)
+
+mgsm_sw = LightevalTaskConfig(
+ name="mgsm:sw",
+ suite=["lighteval"],
+ prompt_function=prompt.mgsm_sw,
+ hf_repo="juletxara/mgsm",
+ hf_subset="sw",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.exact_match],
+ stop_sequence=None,
+ version=0,
+)
+
+mgsm_bn = LightevalTaskConfig(
+ name="mgsm:bn",
+ suite=["lighteval"],
+ prompt_function=prompt.mgsm_bn,
+ hf_repo="juletxara/mgsm",
+ hf_subset="bn",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.exact_match],
+ stop_sequence=None,
+ version=0,
+)
+
+mgsm_te = LightevalTaskConfig(
+ name="mgsm:te",
+ suite=["lighteval"],
+ prompt_function=prompt.mgsm_te,
+ hf_repo="juletxara/mgsm",
+ hf_subset="te",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.exact_match],
+ stop_sequence=None,
+ version=0,
+)
+
+TASKS_TABLE = [
+ mgsm_en,
+ mgsm_es,
+ mgsm_fr,
+ mgsm_de,
+ mgsm_ru,
+ mgsm_zh,
+ mgsm_ja,
+ mgsm_th,
+ mgsm_sw,
+ mgsm_bn,
+ mgsm_te,
+]
diff --git a/src/lighteval/tasks/extended/mix_eval/judge_prompts.py b/src/lighteval/tasks/tasks/mix_eval/judge_prompts.py
similarity index 91%
rename from src/lighteval/tasks/extended/mix_eval/judge_prompts.py
rename to src/lighteval/tasks/tasks/mix_eval/judge_prompts.py
index ab2a03405..48850b820 100644
--- a/src/lighteval/tasks/extended/mix_eval/judge_prompts.py
+++ b/src/lighteval/tasks/tasks/mix_eval/judge_prompts.py
@@ -1,26 +1,4 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-from lighteval.tasks.extended.mix_eval.prompts import parse_options
+from lighteval.tasks.tasks.mix_eval.prompts import parse_options
def flow_judge_for_freeform_template(question, options, answer, gold):
diff --git a/src/lighteval/tasks/extended/mix_eval/main.py b/src/lighteval/tasks/tasks/mix_eval/main.py
similarity index 83%
rename from src/lighteval/tasks/extended/mix_eval/main.py
rename to src/lighteval/tasks/tasks/mix_eval/main.py
index e57faa1bd..2b65ab817 100644
--- a/src/lighteval/tasks/extended/mix_eval/main.py
+++ b/src/lighteval/tasks/tasks/mix_eval/main.py
@@ -1,24 +1,26 @@
-# MIT License
+"""
+name:
+Mix Eval
-# Copyright (c) 2024 The HuggingFace Team
+dataset:
+MixEval/MixEval
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
+abstract:
+A ground-truth-based, dynamic benchmark derived from off-the-shelf benchmark
+mixtures. It ranks LLMs with a 0.96 correlation to Chatbot Arena while running
+locally and cheaply (about 6% of the time and cost of MMLU), and its queries
+are updated every month to limit contamination.
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+languages:
+english
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
+tags:
+general-knowledge, reasoning, qa
+
+paper:
+https://mixeval.github.io/
+"""
import logging
import re
@@ -27,15 +29,15 @@
from lighteval.metrics.metrics_sample import JudgeLLMMixEval
from lighteval.metrics.utils.metric_utils import SampleLevelMetricGrouping
-from lighteval.tasks.extended.mix_eval.judge_prompts import (
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc, SamplingMethod
+from lighteval.tasks.tasks.mix_eval.judge_prompts import (
flow_judge_for_freeform_template,
flow_judge_for_multichoice_template,
gpt_judge_for_closeended_freeform,
gpt_judge_for_closeended_multiplechoice,
)
-from lighteval.tasks.extended.mix_eval.prompts import construct_prompt_freeform, construct_prompt_multichoice
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc, SamplingMethod
+from lighteval.tasks.tasks.mix_eval.prompts import construct_prompt_freeform, construct_prompt_multichoice
logger = logging.getLogger(__name__)
@@ -178,7 +180,7 @@ def mean_dv_5(x):
mixeval_freeform_easy = LightevalTaskConfig(
name="mixeval_easy:freeform",
prompt_function=mixeval_freeform_prompt,
- suite=["extended"],
+ suite=["lighteval"],
hf_repo="MixEval/MixEval",
hf_subset="MixEval",
metrics=[llm_judge_mixeval_freeform_flow_judge, llm_judge_mixeval_freeform_gpt_judge],
@@ -195,7 +197,7 @@ def mean_dv_5(x):
mixeval_multichoice_easy = LightevalTaskConfig(
name="mixeval_easy:multichoice",
prompt_function=mixeval_multichoice_prompt,
- suite=["extended"],
+ suite=["lighteval"],
hf_repo="MixEval/MixEval",
hf_subset="MixEval",
metrics=[llm_judge_mixeval_multichoice_flow_judge, llm_judge_mixeval_multichoice_gpt_judge],
@@ -211,7 +213,7 @@ def mean_dv_5(x):
mixeval_freeform_hard = LightevalTaskConfig(
name="mixeval_hard:freeform",
prompt_function=mixeval_freeform_prompt,
- suite=["extended"],
+ suite=["lighteval"],
hf_repo="MixEval/MixEval",
hf_subset="MixEval_Hard",
metrics=[llm_judge_mixeval_freeform_flow_judge, llm_judge_mixeval_freeform_gpt_judge],
@@ -228,7 +230,7 @@ def mean_dv_5(x):
mixeval_multichoice_hard = LightevalTaskConfig(
name="mixeval_hard:multichoice",
prompt_function=mixeval_multichoice_prompt,
- suite=["extended"],
+ suite=["lighteval"],
hf_repo="MixEval/MixEval",
hf_subset="MixEval_Hard",
metrics=[llm_judge_mixeval_multichoice_flow_judge, llm_judge_mixeval_multichoice_gpt_judge],
diff --git a/src/lighteval/tasks/extended/mix_eval/prompts.py b/src/lighteval/tasks/tasks/mix_eval/prompts.py
similarity index 88%
rename from src/lighteval/tasks/extended/mix_eval/prompts.py
rename to src/lighteval/tasks/tasks/mix_eval/prompts.py
index d5cb2f06b..bd859a967 100644
--- a/src/lighteval/tasks/extended/mix_eval/prompts.py
+++ b/src/lighteval/tasks/tasks/mix_eval/prompts.py
@@ -1,25 +1,3 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team and MixEval team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
MULTI_CHOICE_PROMPT = "Answer with the option letter from the given choices directly."
FREE_FORM_PROMPT = "Answer the question shortly."
# FREE_FORM_PROMPT_QUAC = "Answer the question using a short excerpt (span) from the given text."
diff --git a/src/lighteval/tasks/tasks/mmlu.py b/src/lighteval/tasks/tasks/mmlu.py
new file mode 100644
index 000000000..2791b6e4c
--- /dev/null
+++ b/src/lighteval/tasks/tasks/mmlu.py
@@ -0,0 +1,996 @@
+"""
+name:
+Mmlu
+
+dataset:
+lighteval/mmlu
+
+abstract:
+MMLU (Massive Multitask Language Understanding) is a multiple-choice benchmark
+of general knowledge and reasoning covering 57 subjects, from elementary
+mathematics to professional law and medicine.
+
+languages:
+english
+
+tags:
+general-knowledge, knowledge, multiple-choice
+
+paper:
+https://arxiv.org/abs/2009.03300
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
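+# Each of the 57 MMLU subject configs below is identical except for its
+# hf_subset: HELM-style prompting, few-shot examples drawn from the "dev"
+# split, and exact-match scoring on a short (up to 5 tokens) generation.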
+mmlu_abstract_algebra = LightevalTaskConfig(
+ name="mmlu:abstract_algebra",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="abstract_algebra",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_anatomy = LightevalTaskConfig(
+ name="mmlu:anatomy",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="anatomy",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_astronomy = LightevalTaskConfig(
+ name="mmlu:astronomy",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="astronomy",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_business_ethics = LightevalTaskConfig(
+ name="mmlu:business_ethics",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="business_ethics",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_clinical_knowledge = LightevalTaskConfig(
+ name="mmlu:clinical_knowledge",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="clinical_knowledge",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_college_biology = LightevalTaskConfig(
+ name="mmlu:college_biology",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="college_biology",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_college_chemistry = LightevalTaskConfig(
+ name="mmlu:college_chemistry",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="college_chemistry",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_college_computer_science = LightevalTaskConfig(
+ name="mmlu:college_computer_science",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="college_computer_science",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_college_mathematics = LightevalTaskConfig(
+ name="mmlu:college_mathematics",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="college_mathematics",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_college_medicine = LightevalTaskConfig(
+ name="mmlu:college_medicine",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="college_medicine",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_college_physics = LightevalTaskConfig(
+ name="mmlu:college_physics",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="college_physics",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_computer_security = LightevalTaskConfig(
+ name="mmlu:computer_security",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="computer_security",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_conceptual_physics = LightevalTaskConfig(
+ name="mmlu:conceptual_physics",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="conceptual_physics",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_econometrics = LightevalTaskConfig(
+ name="mmlu:econometrics",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="econometrics",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_electrical_engineering = LightevalTaskConfig(
+ name="mmlu:electrical_engineering",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="electrical_engineering",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_elementary_mathematics = LightevalTaskConfig(
+ name="mmlu:elementary_mathematics",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="elementary_mathematics",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_formal_logic = LightevalTaskConfig(
+ name="mmlu:formal_logic",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="formal_logic",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_global_facts = LightevalTaskConfig(
+ name="mmlu:global_facts",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="global_facts",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_high_school_biology = LightevalTaskConfig(
+ name="mmlu:high_school_biology",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="high_school_biology",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_high_school_chemistry = LightevalTaskConfig(
+ name="mmlu:high_school_chemistry",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="high_school_chemistry",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_high_school_computer_science = LightevalTaskConfig(
+ name="mmlu:high_school_computer_science",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="high_school_computer_science",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_high_school_european_history = LightevalTaskConfig(
+ name="mmlu:high_school_european_history",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="high_school_european_history",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_high_school_geography = LightevalTaskConfig(
+ name="mmlu:high_school_geography",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="high_school_geography",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_high_school_government_and_politics = LightevalTaskConfig(
+ name="mmlu:high_school_government_and_politics",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="high_school_government_and_politics",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_high_school_macroeconomics = LightevalTaskConfig(
+ name="mmlu:high_school_macroeconomics",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="high_school_macroeconomics",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_high_school_mathematics = LightevalTaskConfig(
+ name="mmlu:high_school_mathematics",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="high_school_mathematics",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_high_school_microeconomics = LightevalTaskConfig(
+ name="mmlu:high_school_microeconomics",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="high_school_microeconomics",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_high_school_physics = LightevalTaskConfig(
+ name="mmlu:high_school_physics",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="high_school_physics",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_high_school_psychology = LightevalTaskConfig(
+ name="mmlu:high_school_psychology",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="high_school_psychology",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_high_school_statistics = LightevalTaskConfig(
+ name="mmlu:high_school_statistics",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="high_school_statistics",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_high_school_us_history = LightevalTaskConfig(
+ name="mmlu:high_school_us_history",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="high_school_us_history",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_high_school_world_history = LightevalTaskConfig(
+ name="mmlu:high_school_world_history",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="high_school_world_history",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_human_aging = LightevalTaskConfig(
+ name="mmlu:human_aging",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="human_aging",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_human_sexuality = LightevalTaskConfig(
+ name="mmlu:human_sexuality",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="human_sexuality",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_international_law = LightevalTaskConfig(
+ name="mmlu:international_law",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="international_law",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_jurisprudence = LightevalTaskConfig(
+ name="mmlu:jurisprudence",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="jurisprudence",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_logical_fallacies = LightevalTaskConfig(
+ name="mmlu:logical_fallacies",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="logical_fallacies",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_machine_learning = LightevalTaskConfig(
+ name="mmlu:machine_learning",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="machine_learning",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_management = LightevalTaskConfig(
+ name="mmlu:management",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="management",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_marketing = LightevalTaskConfig(
+ name="mmlu:marketing",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="marketing",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_medical_genetics = LightevalTaskConfig(
+ name="mmlu:medical_genetics",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="medical_genetics",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_miscellaneous = LightevalTaskConfig(
+ name="mmlu:miscellaneous",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="miscellaneous",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_moral_disputes = LightevalTaskConfig(
+ name="mmlu:moral_disputes",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="moral_disputes",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_moral_scenarios = LightevalTaskConfig(
+ name="mmlu:moral_scenarios",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="moral_scenarios",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_nutrition = LightevalTaskConfig(
+ name="mmlu:nutrition",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="nutrition",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_philosophy = LightevalTaskConfig(
+ name="mmlu:philosophy",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="philosophy",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_prehistory = LightevalTaskConfig(
+ name="mmlu:prehistory",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="prehistory",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_professional_accounting = LightevalTaskConfig(
+ name="mmlu:professional_accounting",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="professional_accounting",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_professional_law = LightevalTaskConfig(
+ name="mmlu:professional_law",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="professional_law",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_professional_medicine = LightevalTaskConfig(
+ name="mmlu:professional_medicine",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="professional_medicine",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_professional_psychology = LightevalTaskConfig(
+ name="mmlu:professional_psychology",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="professional_psychology",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_public_relations = LightevalTaskConfig(
+ name="mmlu:public_relations",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="public_relations",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_security_studies = LightevalTaskConfig(
+ name="mmlu:security_studies",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="security_studies",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_sociology = LightevalTaskConfig(
+ name="mmlu:sociology",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="sociology",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_us_foreign_policy = LightevalTaskConfig(
+ name="mmlu:us_foreign_policy",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="us_foreign_policy",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_virology = LightevalTaskConfig(
+ name="mmlu:virology",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="virology",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mmlu_world_religions = LightevalTaskConfig(
+ name="mmlu:world_religions",
+ suite=["lighteval"],
+ prompt_function=prompt.mmlu_helm,
+ hf_repo="lighteval/mmlu",
+ hf_subset="world_religions",
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+ evaluation_splits=["test"],
+ few_shots_split="dev",
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ mmlu_abstract_algebra,
+ mmlu_anatomy,
+ mmlu_astronomy,
+ mmlu_business_ethics,
+ mmlu_clinical_knowledge,
+ mmlu_college_biology,
+ mmlu_college_chemistry,
+ mmlu_college_computer_science,
+ mmlu_college_mathematics,
+ mmlu_college_medicine,
+ mmlu_college_physics,
+ mmlu_computer_security,
+ mmlu_conceptual_physics,
+ mmlu_econometrics,
+ mmlu_electrical_engineering,
+ mmlu_elementary_mathematics,
+ mmlu_formal_logic,
+ mmlu_global_facts,
+ mmlu_high_school_biology,
+ mmlu_high_school_chemistry,
+ mmlu_high_school_computer_science,
+ mmlu_high_school_european_history,
+ mmlu_high_school_geography,
+ mmlu_high_school_government_and_politics,
+ mmlu_high_school_macroeconomics,
+ mmlu_high_school_mathematics,
+ mmlu_high_school_microeconomics,
+ mmlu_high_school_physics,
+ mmlu_high_school_psychology,
+ mmlu_high_school_statistics,
+ mmlu_high_school_us_history,
+ mmlu_high_school_world_history,
+ mmlu_human_aging,
+ mmlu_human_sexuality,
+ mmlu_international_law,
+ mmlu_jurisprudence,
+ mmlu_logical_fallacies,
+ mmlu_machine_learning,
+ mmlu_management,
+ mmlu_marketing,
+ mmlu_medical_genetics,
+ mmlu_miscellaneous,
+ mmlu_moral_disputes,
+ mmlu_moral_scenarios,
+ mmlu_nutrition,
+ mmlu_philosophy,
+ mmlu_prehistory,
+ mmlu_professional_accounting,
+ mmlu_professional_law,
+ mmlu_professional_medicine,
+ mmlu_professional_psychology,
+ mmlu_public_relations,
+ mmlu_security_studies,
+ mmlu_sociology,
+ mmlu_us_foreign_policy,
+ mmlu_virology,
+ mmlu_world_religions,
+]
diff --git a/src/lighteval/tasks/tasks/mmlu_redux.py b/src/lighteval/tasks/tasks/mmlu_redux.py
new file mode 100644
index 000000000..2a29afd12
--- /dev/null
+++ b/src/lighteval/tasks/tasks/mmlu_redux.py
@@ -0,0 +1,107 @@
+"""
+name:
+Mmlu Redux
+
+dataset:
+edinburgh-dawg/mmlu-redux-2.0
+
+abstract:
+MMLU-Redux is a subset of 5,700 manually re-annotated questions across 57 MMLU subjects.
+
+languages:
+english
+
+tags:
+general-knowledge, knowledge, multiple-choice
+
+paper:
+https://arxiv.org/abs/2406.04127
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+_MMLU_REDUX_2_SUBSETS = [
+ "abstract_algebra",
+ "anatomy",
+ "astronomy",
+ "business_ethics",
+ "clinical_knowledge",
+ "college_biology",
+ "college_chemistry",
+ "college_computer_science",
+ "college_mathematics",
+ "college_medicine",
+ "college_physics",
+ "computer_security",
+ "conceptual_physics",
+ "econometrics",
+ "electrical_engineering",
+ "elementary_mathematics",
+ "formal_logic",
+ "global_facts",
+ "high_school_biology",
+ "high_school_chemistry",
+ "high_school_computer_science",
+ "high_school_european_history",
+ "high_school_geography",
+ "high_school_government_and_politics",
+ "high_school_macroeconomics",
+ "high_school_mathematics",
+ "high_school_microeconomics",
+ "high_school_physics",
+ "high_school_psychology",
+ "high_school_statistics",
+ "high_school_us_history",
+ "high_school_world_history",
+ "human_aging",
+ "human_sexuality",
+ "international_law",
+ "jurisprudence",
+ "logical_fallacies",
+ "machine_learning",
+ "management",
+ "marketing",
+ "medical_genetics",
+ "miscellaneous",
+ "moral_disputes",
+ "moral_scenarios",
+ "nutrition",
+ "philosophy",
+ "prehistory",
+ "professional_accounting",
+ "professional_law",
+ "professional_medicine",
+ "professional_psychology",
+ "public_relations",
+ "security_studies",
+ "sociology",
+ "us_foreign_policy",
+ "virology",
+ "world_religions",
+]
+
+
+TASKS_TABLE = [
+ LightevalTaskConfig(
+ name=f"mmlu_redux_2:{subset}",
+ suite=["lighteval"],
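+        # Binding the current subset via the default argument `s=subset` avoids
+        # Python's late-binding closures: otherwise every lambda in this
+        # comprehension would see only the last value of `subset`.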
+ prompt_function=lambda line, task_name=None, s=subset: prompt.mmlu_redux_2(line, s, task_name),
+ hf_repo="edinburgh-dawg/mmlu-redux-2.0",
+ hf_subset=subset,
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ Metrics.pass_at_k_letters(sample_params={"k": 1}),
+ ],
+ stop_sequence=["\n"],
+ version=0,
+ )
+ for subset in _MMLU_REDUX_2_SUBSETS
+]
diff --git a/src/lighteval/tasks/tasks/mmmu_pro.py b/src/lighteval/tasks/tasks/mmmu_pro.py
new file mode 100644
index 000000000..3a71a9061
--- /dev/null
+++ b/src/lighteval/tasks/tasks/mmmu_pro.py
@@ -0,0 +1,80 @@
+"""
+name:
+Mmmu Pro
+
+dataset:
+MMMU/MMMU_pro
+
+abstract:
+MMMU-Pro is a more robust version of the MMMU multimodal benchmark: candidate
+options are expanded from 4 to 10, questions answerable without the images are
+filtered out, and a vision-only setting embeds the question inside the image.
+
+languages:
+english
+
+tags:
+general-knowledge, knowledge, multimodal, multiple-choice
+
+paper:
+https://arxiv.org/abs/2409.02813
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+mmmu_pro_standard_4_options = LightevalTaskConfig(
+ name="mmmu_pro:standard-4",
+ suite=["lighteval"],
+ prompt_function=prompt.mmmu_pro,
+ hf_repo="MMMU/MMMU_pro",
+ hf_subset="standard (4 options)",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+    generation_size=30, # expects an answer in the format 'Answer: B'
+ metrics=[Metrics.gpqa_instruct_metric],
+ stop_sequence=None,
+ version=0,
+)
+
+
+mmmu_pro_standard_10_options = LightevalTaskConfig(
+ name="mmmu_pro:standard-10",
+ suite=["lighteval"],
+ prompt_function=prompt.mmmu_pro,
+ hf_repo="MMMU/MMMU_pro",
+ hf_subset="standard (10 options)",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+    generation_size=30, # expects an answer in the format 'Answer: B'
+ metrics=[Metrics.gpqa_instruct_metric],
+ stop_sequence=None,
+ version=0,
+)
+
+
+mmmu_pro_vision = LightevalTaskConfig(
+ name="mmmu_pro:vision",
+ suite=["lighteval"],
+ prompt_function=prompt.mmmu_pro_vision,
+ hf_repo="MMMU/MMMU_pro",
+ hf_subset="vision",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+    generation_size=30, # expects an answer in the format 'Answer: B'
+ metrics=[Metrics.gpqa_instruct_metric],
+ stop_sequence=None,
+ version=0,
+)
+
+
+TASKS_TABLE = [
+ mmmu_pro_standard_4_options,
+ mmmu_pro_standard_10_options,
+ mmmu_pro_vision,
+]
diff --git a/src/lighteval/tasks/extended/mt_bench/judge_prompt_templates.py b/src/lighteval/tasks/tasks/mt_bench/judge_prompt_templates.py
similarity index 82%
rename from src/lighteval/tasks/extended/mt_bench/judge_prompt_templates.py
rename to src/lighteval/tasks/tasks/mt_bench/judge_prompt_templates.py
index ea3ca41f4..e76de1b2d 100644
--- a/src/lighteval/tasks/extended/mt_bench/judge_prompt_templates.py
+++ b/src/lighteval/tasks/tasks/mt_bench/judge_prompt_templates.py
@@ -1,26 +1,3 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-
def flow_judge_prompt_mt_bench_without_ref(question, options, answer, gold):
return [
{
diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/tasks/mt_bench/main.py
similarity index 64%
rename from src/lighteval/tasks/extended/mt_bench/main.py
rename to src/lighteval/tasks/tasks/mt_bench/main.py
index e32194747..bed7239dd 100644
--- a/src/lighteval/tasks/extended/mt_bench/main.py
+++ b/src/lighteval/tasks/tasks/mt_bench/main.py
@@ -1,36 +1,38 @@
-# MIT License
+"""
+name:
+Mt Bench
-# Copyright (c) 2024 The HuggingFace Team
+dataset:
+lighteval/mt-bench
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
+abstract:
+MT-Bench is a multi-turn conversational benchmark for evaluating language
+models. It consists of 80 high-quality multi-turn questions across 8 common
+categories (writing, roleplay, reasoning, math, coding, extraction, STEM,
+humanities). Model responses are evaluated by a judge LLM.
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+languages:
+english
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
+tags:
+conversational, generation, multi-turn
+
+paper:
+https://arxiv.org/abs/2402.14762
+"""
+
+import re
+
+import numpy as np
-# ruff: noqa: F405, F403, F401, I001
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.metrics.metrics_sample import JudgeLLMMTBench
from lighteval.metrics.utils.metric_utils import SampleLevelMetricGrouping
-from lighteval.tasks.extended.mt_bench.judge_prompt_templates import (
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc, SamplingMethod
+from lighteval.tasks.tasks.mt_bench.judge_prompt_templates import (
flow_judge_prompt_mt_bench_with_ref,
flow_judge_prompt_mt_bench_without_ref,
)
-import re
-import numpy as np
def mt_bench_prompt(line, task_name: str = ""):
@@ -80,7 +82,7 @@ def flow_judge_mt_bench_prompt(question, answer, options, gold):
task = LightevalTaskConfig(
name="mt_bench",
prompt_function=mt_bench_prompt, # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
- suite=["extended"],
+ suite=["lighteval"],
hf_repo="lighteval/mt-bench",
hf_subset="default",
hf_avail_splits=["train"],
diff --git a/src/lighteval/tasks/tasks/musr.py b/src/lighteval/tasks/tasks/musr.py
new file mode 100644
index 000000000..074e0ac6f
--- /dev/null
+++ b/src/lighteval/tasks/tasks/musr.py
@@ -0,0 +1,82 @@
+"""
+name:
+Musr
+
+dataset:
+TAUR-Lab/MuSR
+
+abstract:
+MuSR is a benchmark for evaluating multistep reasoning in natural language
+narratives. Built with a neurosymbolic synthetic-to-natural generation process,
+it features complex, realistic tasks such as long-form murder mysteries, object
+placement puzzles, and team allocation problems.
+
+languages:
+english
+
+tags:
+long-context, multiple-choice, reasoning
+
+paper:
+https://arxiv.org/abs/2310.16049
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+musr_murder_mysteries = LightevalTaskConfig(
+ name="musr:murder_mysteries",
+ suite=["lighteval"],
+ prompt_function=prompt.musr,
+ hf_repo="TAUR-Lab/MuSR",
+ hf_subset="default",
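+    # MuSR publishes each domain (murder_mysteries, object_placements,
+    # team_allocation) as a dataset split, so it is selected via the splits
+    # below rather than via hf_subset.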
+ hf_avail_splits=["murder_mysteries"],
+ evaluation_splits=["murder_mysteries"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+musr_object_placements = LightevalTaskConfig(
+ name="musr:object_placements",
+ suite=["lighteval"],
+ prompt_function=prompt.musr,
+ hf_repo="TAUR-Lab/MuSR",
+ hf_subset="default",
+ hf_avail_splits=["object_placements"],
+ evaluation_splits=["object_placements"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+musr_team_allocation = LightevalTaskConfig(
+ name="musr:team_allocation",
+ suite=["lighteval"],
+ prompt_function=prompt.musr,
+ hf_repo="TAUR-Lab/MuSR",
+ hf_subset="default",
+ hf_avail_splits=["team_allocation"],
+ evaluation_splits=["team_allocation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ musr_murder_mysteries,
+ musr_object_placements,
+ musr_team_allocation,
+]
diff --git a/src/lighteval/tasks/tasks/narrativeqa.py b/src/lighteval/tasks/tasks/narrativeqa.py
new file mode 100644
index 000000000..fbbd8239c
--- /dev/null
+++ b/src/lighteval/tasks/tasks/narrativeqa.py
@@ -0,0 +1,46 @@
+"""
+name:
+Narrativeqa
+
+dataset:
+lighteval/narrative_qa_helm
+
+abstract:
+NarrativeQA is a reading comprehension benchmark that tests deep understanding
+of full narratives—books and movie scripts—rather than shallow text matching. To
+answer its questions, models must integrate information across entire stories.
+
+languages:
+english
+
+tags:
+qa, reading-comprehension
+
+paper:
+https://aclanthology.org/Q18-1023/
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+narrativeqa = LightevalTaskConfig(
+ name="narrativeqa",
+ suite=["lighteval"],
+ prompt_function=prompt.narrativeqa,
+ hf_repo="lighteval/narrative_qa_helm",
+ hf_subset="default",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ narrativeqa,
+]
diff --git a/src/lighteval/tasks/tasks/natural_questions.py b/src/lighteval/tasks/tasks/natural_questions.py
new file mode 100644
index 000000000..47bbb4b3b
--- /dev/null
+++ b/src/lighteval/tasks/tasks/natural_questions.py
@@ -0,0 +1,48 @@
+"""
+name:
+Natural Questions
+
+dataset:
+lighteval/small_natural_questions
+
+abstract:
+A collection of question-answer pairs from the Natural Questions benchmark,
+which pairs real, anonymized Google search queries with answers annotated from
+Wikipedia.
+
+languages:
+english
+
+tags:
+general-knowledge, qa
+
+paper:
+https://ai.google.com/research/NaturalQuestions
+"""
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+natural_questions = LightevalTaskConfig(
+ name="natural_questions",
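+    # The QA template builds an English question prompt; the reference answer
+    # is passed as the single accepted choice for exact-match scoring.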
+ prompt_function=get_qa_prompt_function(
+ Language.ENGLISH,
+ lambda line: {"question": line["question"], "choices": [line["answer"]]},
+ ),
+ suite=("lighteval",),
+ hf_repo="lighteval/small_natural_questions",
+ hf_subset="default",
+ evaluation_splits=("test",),
+ few_shots_split="few_shot",
+ generation_size=250,
+ stop_sequence=["\n", "Question:", "question:"],
+ metrics=[Metrics.exact_match],
+ version=1,
+)
+
+TASKS_TABLE = [
+ natural_questions,
+]
diff --git a/src/lighteval/tasks/tasks/numeracy.py b/src/lighteval/tasks/tasks/numeracy.py
new file mode 100644
index 000000000..9a80d0b66
--- /dev/null
+++ b/src/lighteval/tasks/tasks/numeracy.py
@@ -0,0 +1,162 @@
+"""
+name:
+Numeracy
+
+dataset:
+lighteval/numeracy
+
+abstract:
+Numeracy is a synthetic benchmark that tests whether language models can infer
+and apply simple numerical relations (linear, parabola, paraboloid, and plane).
+
+languages:
+english
+
+tags:
+math, reasoning
+
+paper:
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+numeracy_linear_example = LightevalTaskConfig(
+ name="numeracy:linear_example",
+ suite=["lighteval"],
+ prompt_function=prompt.numeracy,
+ hf_repo="lighteval/numeracy",
+ hf_subset="linear_example",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+numeracy_linear_standard = LightevalTaskConfig(
+ name="numeracy:linear_standard",
+ suite=["lighteval"],
+ prompt_function=prompt.numeracy,
+ hf_repo="lighteval/numeracy",
+ hf_subset="linear_standard",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+numeracy_parabola_example = LightevalTaskConfig(
+ name="numeracy:parabola_example",
+ suite=["lighteval"],
+ prompt_function=prompt.numeracy,
+ hf_repo="lighteval/numeracy",
+ hf_subset="parabola_example",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+numeracy_parabola_standard = LightevalTaskConfig(
+ name="numeracy:parabola_standard",
+ suite=["lighteval"],
+ prompt_function=prompt.numeracy,
+ hf_repo="lighteval/numeracy",
+ hf_subset="parabola_standard",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+numeracy_paraboloid_example = LightevalTaskConfig(
+ name="numeracy:paraboloid_example",
+ suite=["lighteval"],
+ prompt_function=prompt.numeracy,
+ hf_repo="lighteval/numeracy",
+ hf_subset="paraboloid_example",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+numeracy_paraboloid_standard = LightevalTaskConfig(
+ name="numeracy:paraboloid_standard",
+ suite=["lighteval"],
+ prompt_function=prompt.numeracy,
+ hf_repo="lighteval/numeracy",
+ hf_subset="paraboloid_standard",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+numeracy_plane_example = LightevalTaskConfig(
+ name="numeracy:plane_example",
+ suite=["lighteval"],
+ prompt_function=prompt.numeracy,
+ hf_repo="lighteval/numeracy",
+ hf_subset="plane_example",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+numeracy_plane_standard = LightevalTaskConfig(
+ name="numeracy:plane_standard",
+ suite=["lighteval"],
+ prompt_function=prompt.numeracy,
+ hf_repo="lighteval/numeracy",
+ hf_subset="plane_standard",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ numeracy_linear_example,
+ numeracy_linear_standard,
+ numeracy_parabola_example,
+ numeracy_parabola_standard,
+ numeracy_paraboloid_example,
+ numeracy_paraboloid_standard,
+ numeracy_plane_example,
+ numeracy_plane_standard,
+]
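+
+# The eight configs above differ only in their name / hf_subset pair. A
+# loop-based equivalent (a sketch for illustration, not part of this patch)
+# could build the same table as:
+#
+#   _subsets = [
+#       "linear_example", "linear_standard", "parabola_example",
+#       "parabola_standard", "paraboloid_example", "paraboloid_standard",
+#       "plane_example", "plane_standard",
+#   ]
+#   TASKS_TABLE = [
+#       LightevalTaskConfig(
+#           name=f"numeracy:{subset}",
+#           suite=["lighteval"],
+#           prompt_function=prompt.numeracy,
+#           hf_repo="lighteval/numeracy",
+#           hf_subset=subset,
+#           hf_avail_splits=["train", "test"],
+#           evaluation_splits=["test"],
+#           few_shots_split=None,
+#           few_shots_select=None,
+#           generation_size=20,
+#           metrics=[Metrics.exact_match],
+#           stop_sequence=["\n"],
+#           version=0,
+#       )
+#       for subset in _subsets
+#   ]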
diff --git a/src/lighteval/tasks/extended/olympiade_bench/main.py b/src/lighteval/tasks/tasks/olympiade_bench/main.py
similarity index 88%
rename from src/lighteval/tasks/extended/olympiade_bench/main.py
rename to src/lighteval/tasks/tasks/olympiade_bench/main.py
index d9fe0d2bc..bd53d3dcf 100644
--- a/src/lighteval/tasks/extended/olympiade_bench/main.py
+++ b/src/lighteval/tasks/tasks/olympiade_bench/main.py
@@ -1,25 +1,23 @@
-# MIT License
+"""
+name:
+OlympiadBench
-# Copyright (c) 2024 The HuggingFace Team
+dataset:
+Hothan/OlympiadBench
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
+abstract:
+OlympiadBench is a bilingual (English and Chinese) benchmark for evaluating the
+performance of language models on olympiad-level mathematics and physics
+problems.
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+languages:
+english, chinese
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
+tags:
+math, reasoning, language
+paper:
+https://arxiv.org/abs/2402.14008
+"""
import numpy as np
@@ -224,7 +222,7 @@ def olympiad_bench_prompt(line, task_name: str = None):
LightevalTaskConfig(
name="olympiad_bench:" + subset,
prompt_function=olympiad_bench_prompt,
- suite=["extended"],
+ suite=["lighteval"],
hf_repo="Hothan/OlympiadBench",
hf_subset=subset,
metrics=[metric],
diff --git a/src/lighteval/tasks/tasks/openbookqa.py b/src/lighteval/tasks/tasks/openbookqa.py
new file mode 100644
index 000000000..eb0e547dc
--- /dev/null
+++ b/src/lighteval/tasks/tasks/openbookqa.py
@@ -0,0 +1,50 @@
+"""
+name:
+OpenBookQA
+
+dataset:
+allenai/openbookqa
+
+abstract:
+OpenBookQA is a question-answering dataset modeled after open-book exams for
+assessing human understanding of a subject. It contains multiple-choice
+questions that require combining facts from a given open book with broad common
+knowledge. The task tests language models' ability to leverage provided
+information and apply common sense reasoning.
+
+languages:
+english
+
+tags:
+multiple-choice, qa
+
+paper:
+https://arxiv.org/abs/1809.02789
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+openbookqa = LightevalTaskConfig(
+ name="openbookqa",
+ suite=["lighteval"],
+ prompt_function=prompt.openbookqa_helm,
+ hf_repo="allenai/openbookqa",
+ hf_subset="main",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ openbookqa,
+]
diff --git a/src/lighteval/tasks/tasks/piqa.py b/src/lighteval/tasks/tasks/piqa.py
new file mode 100644
index 000000000..76388fac1
--- /dev/null
+++ b/src/lighteval/tasks/tasks/piqa.py
@@ -0,0 +1,47 @@
+"""
+name:
+PIQA
+
+dataset:
+ybisk/piqa
+
+abstract:
+PIQA is a benchmark for testing physical commonsense reasoning. Each question
+describes an everyday goal and asks the model to choose the more sensible of
+two candidate solutions.
+
+languages:
+english
+
+tags:
+commonsense, multiple-choice, qa
+
+paper:
+https://arxiv.org/abs/1911.11641
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+piqa = LightevalTaskConfig(
+ name="piqa",
+ suite=["lighteval"],
+ prompt_function=prompt.piqa_helm,
+ hf_repo="ybisk/piqa",
+ hf_subset="plain_text",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ piqa,
+]
diff --git a/src/lighteval/tasks/tasks/prost.py b/src/lighteval/tasks/tasks/prost.py
new file mode 100644
index 000000000..92a0ad0ca
--- /dev/null
+++ b/src/lighteval/tasks/tasks/prost.py
@@ -0,0 +1,48 @@
+"""
+name:
+PROST
+
+dataset:
+lighteval/prost
+
+abstract:
+PROST is a benchmark for testing physical reasoning about objects through space
+and time. It includes 18,736 multiple-choice questions covering 10 core physics
+concepts, designed to probe models in zero-shot settings. Results show that even
+large pretrained models struggle with physical reasoning and are sensitive to
+question phrasing, underscoring their limited real-world understanding.
+
+languages:
+english
+
+tags:
+reasoning, qa, physical-commonsense
+
+paper:
+https://arxiv.org/abs/2106.03634
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+prost = LightevalTaskConfig(
+ name="prost",
+ suite=["lighteval"],
+ prompt_function=prompt.prost,
+ hf_repo="lighteval/prost",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ prost,
+]
diff --git a/src/lighteval/tasks/tasks/pubmedqa.py b/src/lighteval/tasks/tasks/pubmedqa.py
new file mode 100644
index 000000000..5cef802b4
--- /dev/null
+++ b/src/lighteval/tasks/tasks/pubmedqa.py
@@ -0,0 +1,46 @@
+"""
+name:
+PubMedQA
+
+dataset:
+pubmed_qa
+
+abstract:
+PubMedQA is a dataset for biomedical research question answering: given a
+research question and the corresponding PubMed abstract, the task is to answer
+yes, no, or maybe.
+
+languages:
+english
+
+tags:
+biomedical, health, medical, qa
+
+paper:
+https://pubmedqa.github.io/
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+pubmedqa = LightevalTaskConfig(
+ name="pubmedqa",
+ suite=["lighteval"],
+ prompt_function=prompt.pubmed_qa_helm,
+ hf_repo="pubmed_qa",
+ hf_subset="pqa_labeled",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ pubmedqa,
+]
diff --git a/src/lighteval/tasks/tasks/qa4mre.py b/src/lighteval/tasks/tasks/qa4mre.py
new file mode 100644
index 000000000..9120ae95c
--- /dev/null
+++ b/src/lighteval/tasks/tasks/qa4mre.py
@@ -0,0 +1,90 @@
+"""
+name:
+QA4MRE
+
+dataset:
+qa4mre
+
+abstract:
+QA4MRE is a machine reading comprehension benchmark from the CLEF 2011-2013
+challenges. It evaluates systems' ability to answer questions requiring deep
+understanding of short texts, supported by external background knowledge.
+Covering tasks like modality, negation, biomedical reading, and entrance exams,
+QA4MRE tests reasoning beyond surface-level text matching.
+
+languages:
+english
+
+tags:
+biomedical, health, qa
+
+paper:
+https://link.springer.com/chapter/10.1007/978-3-642-40802-1_29
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+qa4mre_2011 = LightevalTaskConfig(
+ name="qa4mre:2011",
+ suite=["lighteval"],
+ prompt_function=prompt.qa4mre,
+ hf_repo="qa4mre",
+ hf_subset="2011.main.EN",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+qa4mre_2012 = LightevalTaskConfig(
+ name="qa4mre:2012",
+ suite=["lighteval"],
+ prompt_function=prompt.qa4mre,
+ hf_repo="qa4mre",
+ hf_subset="2012.main.EN",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+qa4mre_2013 = LightevalTaskConfig(
+ name="qa4mre:2013",
+ suite=["lighteval"],
+ prompt_function=prompt.qa4mre,
+ hf_repo="qa4mre",
+ hf_subset="2013.main.EN",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[
+ Metrics.loglikelihood_acc,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ qa4mre_2011,
+ qa4mre_2012,
+ qa4mre_2013,
+]
diff --git a/src/lighteval/tasks/tasks/qasper.py b/src/lighteval/tasks/tasks/qasper.py
new file mode 100644
index 000000000..223fb35c8
--- /dev/null
+++ b/src/lighteval/tasks/tasks/qasper.py
@@ -0,0 +1,49 @@
+"""
+name:
+QASPER
+
+dataset:
+allenai/qasper
+
+abstract:
+QASPER is a dataset for question answering on scientific research papers. It
+consists of 5,049 questions over 1,585 Natural Language Processing papers. Each
+question is written by an NLP practitioner who read only the title and abstract
+of the corresponding paper, and the question seeks information present in the
+full text. The questions are then answered by a separate set of NLP
+practitioners who also provide supporting evidence to answers.
+
+languages:
+english
+
+tags:
+qa, scientific
+
+paper:
+https://arxiv.org/abs/2105.03011
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+qasper = LightevalTaskConfig(
+ name="qasper",
+ suite=["lighteval"],
+ prompt_function=prompt.qasper,
+ hf_repo="allenai/qasper",
+ hf_subset="qasper",
+ hf_avail_splits=["train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.f1_score],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ qasper,
+]
diff --git a/src/lighteval/tasks/tasks/quac.py b/src/lighteval/tasks/tasks/quac.py
new file mode 100644
index 000000000..8fd69d116
--- /dev/null
+++ b/src/lighteval/tasks/tasks/quac.py
@@ -0,0 +1,44 @@
+"""
+name:
+QuAC
+
+dataset:
+lighteval/quac_helm
+
+abstract:
+QuAC (Question Answering in Context) is a benchmark for question answering in
+information-seeking dialogues, where questions must be answered from a provided
+passage while taking the dialogue history into account.
+
+languages:
+english
+
+tags:
+dialog, qa
+
+paper:
+https://aclanthology.org/D18-1241/
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+quac = LightevalTaskConfig(
+ name="quac",
+ suite=["lighteval"],
+ prompt_function=prompt.quac,
+ hf_repo="lighteval/quac_helm",
+ hf_subset="default",
+ hf_avail_splits=["train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=100,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ quac,
+]
diff --git a/src/lighteval/tasks/tasks/race_high.py b/src/lighteval/tasks/tasks/race_high.py
new file mode 100644
index 000000000..4ac7e452a
--- /dev/null
+++ b/src/lighteval/tasks/tasks/race_high.py
@@ -0,0 +1,48 @@
+"""
+name:
+RACE (high)
+
+dataset:
+EleutherAI/race
+
+abstract:
+RACE is a large-scale reading comprehension dataset with more than 28,000
+passages and nearly 100,000 questions, collected from English examinations in
+China designed for middle-school and high-school students. It can serve as
+training and test data for machine reading comprehension; this task uses the
+high-school subset.
+
+languages:
+english
+
+tags:
+multiple-choice, reading-comprehension
+
+paper:
+https://aclanthology.org/D17-1082/
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+race_high = LightevalTaskConfig(
+ name="race:high",
+ suite=["lighteval"],
+ prompt_function=prompt.race,
+ hf_repo="EleutherAI/race",
+ hf_subset="high",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ race_high,
+]
diff --git a/src/lighteval/tasks/tasks/raft.py b/src/lighteval/tasks/tasks/raft.py
new file mode 100644
index 000000000..5e1a00553
--- /dev/null
+++ b/src/lighteval/tasks/tasks/raft.py
@@ -0,0 +1,237 @@
+"""
+name:
+RAFT
+
+dataset:
+ought/raft
+
+abstract:
+RAFT (Real-world Annotated Few-shot Tasks) is a meta-benchmark of 11 real-world
+text classification tasks, designed to measure how well models classify text
+when only a small number of labeled examples is available.
+
+languages:
+english
+
+tags:
+classification, reasoning
+
+paper:
+https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+raft_ade_corpus_v2 = LightevalTaskConfig(
+ name="raft:ade_corpus_v2",
+ suite=["lighteval"],
+ prompt_function=prompt.raft_ade_corpus_v2,
+ hf_repo="ought/raft",
+ hf_subset="ade_corpus_v2",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=30,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+raft_banking_77 = LightevalTaskConfig(
+ name="raft:banking_77",
+ suite=["lighteval"],
+ prompt_function=prompt.raft_banking_77,
+ hf_repo="ought/raft",
+ hf_subset="banking_77",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=30,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+raft_neurips_impact_statement_risks = LightevalTaskConfig(
+ name="raft:neurips_impact_statement_risks",
+ suite=["lighteval"],
+ prompt_function=prompt.raft_neurips_impact_statement_risks,
+ hf_repo="ought/raft",
+ hf_subset="neurips_impact_statement_risks",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=30,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+raft_one_stop_english = LightevalTaskConfig(
+ name="raft:one_stop_english",
+ suite=["lighteval"],
+ prompt_function=prompt.raft_one_stop_english,
+ hf_repo="ought/raft",
+ hf_subset="one_stop_english",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=30,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+raft_overruling = LightevalTaskConfig(
+ name="raft:overruling",
+ suite=["lighteval"],
+ prompt_function=prompt.raft_overruling,
+ hf_repo="ought/raft",
+ hf_subset="overruling",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=30,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+raft_semiconductor_org_types = LightevalTaskConfig(
+ name="raft:semiconductor_org_types",
+ suite=["lighteval"],
+ prompt_function=prompt.raft_semiconductor_org_types,
+ hf_repo="ought/raft",
+ hf_subset="semiconductor_org_types",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=30,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+raft_systematic_review_inclusion = LightevalTaskConfig(
+ name="raft:systematic_review_inclusion",
+ suite=["lighteval"],
+ prompt_function=prompt.raft_systematic_review_inclusion,
+ hf_repo="ought/raft",
+ hf_subset="systematic_review_inclusion",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=30,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+raft_tai_safety_research = LightevalTaskConfig(
+ name="raft:tai_safety_research",
+ suite=["lighteval"],
+ prompt_function=prompt.raft_tai_safety_research,
+ hf_repo="ought/raft",
+ hf_subset="tai_safety_research",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=30,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+raft_terms_of_service = LightevalTaskConfig(
+ name="raft:terms_of_service",
+ suite=["lighteval"],
+ prompt_function=prompt.raft_terms_of_service,
+ hf_repo="ought/raft",
+ hf_subset="terms_of_service",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=30,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+raft_tweet_eval_hate = LightevalTaskConfig(
+ name="raft:tweet_eval_hate",
+ suite=["lighteval"],
+ prompt_function=prompt.raft_tweet_eval_hate,
+ hf_repo="ought/raft",
+ hf_subset="tweet_eval_hate",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=30,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+raft_twitter_complaints = LightevalTaskConfig(
+ name="raft:twitter_complaints",
+ suite=["lighteval"],
+ prompt_function=prompt.raft_twitter_complaints,
+ hf_repo="ought/raft",
+ hf_subset="twitter_complaints",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=30,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ raft_ade_corpus_v2,
+ raft_banking_77,
+ raft_neurips_impact_statement_risks,
+ raft_one_stop_english,
+ raft_overruling,
+ raft_semiconductor_org_types,
+ raft_systematic_review_inclusion,
+ raft_tai_safety_research,
+ raft_terms_of_service,
+ raft_tweet_eval_hate,
+ raft_twitter_complaints,
+]
diff --git a/src/lighteval/tasks/tasks/real_toxicity_prompts.py b/src/lighteval/tasks/tasks/real_toxicity_prompts.py
new file mode 100644
index 000000000..726fda8fe
--- /dev/null
+++ b/src/lighteval/tasks/tasks/real_toxicity_prompts.py
@@ -0,0 +1,44 @@
+"""
+name:
+Real Toxicity Prompts
+
+dataset:
+allenai/real-toxicity-prompts
+
+abstract:
+RealToxicityPrompts is a dataset of sentence-level prompts drawn from web text,
+used for measuring the toxicity of model generations that complete these
+prompts.
+
+languages:
+english
+
+tags:
+generation, safety
+
+paper:
+https://aclanthology.org/2020.findings-emnlp.301/
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+real_toxicity_prompts = LightevalTaskConfig(
+ name="real_toxicity_prompts",
+ suite=["lighteval"],
+ prompt_function=prompt.real_toxicity_prompts,
+ hf_repo="allenai/real-toxicity-prompts",
+ hf_subset="default",
+ hf_avail_splits=["train"],
+ evaluation_splits=["train"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ real_toxicity_prompts,
+]
diff --git a/src/lighteval/tasks/tasks/sacrebleu.py b/src/lighteval/tasks/tasks/sacrebleu.py
new file mode 100644
index 000000000..b6387f2b7
--- /dev/null
+++ b/src/lighteval/tasks/tasks/sacrebleu.py
@@ -0,0 +1,2928 @@
+"""
+name:
+SacreBLEU
+
+dataset:
+lighteval/sacrebleu_manual, wmt14, wmt16
+
+abstract:
+Machine translation tasks built from the sacrebleu test sets (IWSLT17,
+MTNT2019, and the WMT news test sets), scored with BLEU, chrF, and TER.
+
+languages:
+english, arabic, chinese, czech, finnish, french, german, hindi, hungarian,
+italian, japanese, korean, russian, spanish
+
+tags:
+translation
+
+paper:
+https://github.com/mjpost/sacrebleu
+"""
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks import default_prompts as prompt
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
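+# Conventions observed in the configs below (a description, not added
+# behaviour): tasks are named "<testset>:<src>-<tgt>", and the prompt function
+# appears to follow the alphabetical order of the language pair in hf_subset --
+# wmt_alphabetical when the source language comes first alphabetically (e.g.
+# "iwslt17:ar-en"), wmt_reverse_alphabetical when it comes second (e.g.
+# "iwslt17:en-ar").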
+
+iwslt17_ar_en = LightevalTaskConfig(
+ name="iwslt17:ar-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="iwslt17_ar-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+iwslt17_de_en = LightevalTaskConfig(
+ name="iwslt17:de-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="iwslt17_de-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+iwslt17_en_ar = LightevalTaskConfig(
+ name="iwslt17:en-ar",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="iwslt17_ar-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+iwslt17_en_de = LightevalTaskConfig(
+ name="iwslt17:en-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="iwslt17_en-de",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+iwslt17_en_fr = LightevalTaskConfig(
+ name="iwslt17:en-fr",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="iwslt17_en-fr",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+iwslt17_en_ja = LightevalTaskConfig(
+ name="iwslt17:en-ja",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="iwslt17_en-ja",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+iwslt17_en_ko = LightevalTaskConfig(
+ name="iwslt17:en-ko",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="iwslt17_en-ko",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+iwslt17_en_zh = LightevalTaskConfig(
+ name="iwslt17:en-zh",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="iwslt17_en-zh",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+iwslt17_fr_en = LightevalTaskConfig(
+ name="iwslt17:fr-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="iwslt17_fr-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+iwslt17_ja_en = LightevalTaskConfig(
+ name="iwslt17:ja-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="iwslt17_ja-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+iwslt17_ko_en = LightevalTaskConfig(
+ name="iwslt17:ko-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="iwslt17_ko-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+iwslt17_zh_en = LightevalTaskConfig(
+ name="iwslt17:zh-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="iwslt17_zh-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mtnt2019_en_fr = LightevalTaskConfig(
+ name="mtnt2019:en-fr",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="mtnt2019_en-fr",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=200,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mtnt2019_en_ja = LightevalTaskConfig(
+ name="mtnt2019:en-ja",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="mtnt2019_en-ja",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=200,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mtnt2019_fr_en = LightevalTaskConfig(
+ name="mtnt2019:fr-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="mtnt2019_fr-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=200,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+mtnt2019_ja_en = LightevalTaskConfig(
+ name="mtnt2019:ja-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="mtnt2019_ja-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=200,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt08_cs_en = LightevalTaskConfig(
+ name="wmt08:cs-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt08_cs-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt08_de_en = LightevalTaskConfig(
+ name="wmt08:de-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt08_de-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt08_en_cs = LightevalTaskConfig(
+ name="wmt08:en-cs",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt08_en-cs",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt08_en_de = LightevalTaskConfig(
+ name="wmt08:en-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt08_en-de",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt08_en_es = LightevalTaskConfig(
+ name="wmt08:en-es",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt08_en-es",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt08_en_fr = LightevalTaskConfig(
+ name="wmt08:en-fr",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt08_en-fr",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt08_en_hu = LightevalTaskConfig(
+ name="wmt08:en-hu",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt08_en-hu",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt08_es_en = LightevalTaskConfig(
+ name="wmt08:es-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt08_es-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt08_fr_en = LightevalTaskConfig(
+ name="wmt08:fr-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt08_fr-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt08_hu_en = LightevalTaskConfig(
+ name="wmt08:hu-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt08_hu-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt09_cs_en = LightevalTaskConfig(
+ name="wmt09:cs-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt09_cs-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt09_de_en = LightevalTaskConfig(
+ name="wmt09:de-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt09_de-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt09_en_cs = LightevalTaskConfig(
+ name="wmt09:en-cs",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt09_en-cs",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt09_en_de = LightevalTaskConfig(
+ name="wmt09:en-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt09_en-de",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt09_en_es = LightevalTaskConfig(
+ name="wmt09:en-es",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt09_en-es",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt09_en_fr = LightevalTaskConfig(
+ name="wmt09:en-fr",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt09_en-fr",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt09_en_hu = LightevalTaskConfig(
+ name="wmt09:en-hu",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt09_en-hu",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt09_en_it = LightevalTaskConfig(
+ name="wmt09:en-it",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt09_en-it",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt09_es_en = LightevalTaskConfig(
+ name="wmt09:es-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt09_es-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt09_fr_en = LightevalTaskConfig(
+ name="wmt09:fr-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt09_fr-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt09_hu_en = LightevalTaskConfig(
+ name="wmt09:hu-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt09_hu-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt09_it_en = LightevalTaskConfig(
+ name="wmt09:it-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt09_it-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt10_cs_en = LightevalTaskConfig(
+ name="wmt10:cs-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt10_cs-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt10_de_en = LightevalTaskConfig(
+ name="wmt10:de-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt10_de-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt10_en_cs = LightevalTaskConfig(
+ name="wmt10:en-cs",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt10_en-cs",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt10_en_de = LightevalTaskConfig(
+ name="wmt10:en-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt10_en-de",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt10_en_es = LightevalTaskConfig(
+ name="wmt10:en-es",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt10_en-es",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt10_en_fr = LightevalTaskConfig(
+ name="wmt10:en-fr",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt10_en-fr",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt10_es_en = LightevalTaskConfig(
+ name="wmt10:es-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt10_es-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt10_fr_en = LightevalTaskConfig(
+ name="wmt10:fr-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt10_fr-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt11_cs_en = LightevalTaskConfig(
+ name="wmt11:cs-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt11_cs-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt11_de_en = LightevalTaskConfig(
+ name="wmt11:de-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt11_de-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt11_en_cs = LightevalTaskConfig(
+ name="wmt11:en-cs",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt11_en-cs",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt11_en_de = LightevalTaskConfig(
+ name="wmt11:en-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt11_en-de",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt11_en_es = LightevalTaskConfig(
+ name="wmt11:en-es",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt11_en-es",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt11_en_fr = LightevalTaskConfig(
+ name="wmt11:en-fr",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt11_en-fr",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt11_es_en = LightevalTaskConfig(
+ name="wmt11:es-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt11_es-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt11_fr_en = LightevalTaskConfig(
+ name="wmt11:fr-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt11_fr-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt12_cs_en = LightevalTaskConfig(
+ name="wmt12:cs-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt12_cs-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt12_de_en = LightevalTaskConfig(
+ name="wmt12:de-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt12_de-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt12_en_cs = LightevalTaskConfig(
+ name="wmt12:en-cs",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt12_en-cs",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt12_en_de = LightevalTaskConfig(
+ name="wmt12:en-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt12_en-de",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt12_en_es = LightevalTaskConfig(
+ name="wmt12:en-es",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt12_en-es",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt12_en_fr = LightevalTaskConfig(
+ name="wmt12:en-fr",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt12_en-fr",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt12_es_en = LightevalTaskConfig(
+ name="wmt12:es-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt12_es-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt12_fr_en = LightevalTaskConfig(
+ name="wmt12:fr-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt12_fr-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt13_cs_en = LightevalTaskConfig(
+ name="wmt13:cs-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt13_cs-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt13_de_en = LightevalTaskConfig(
+ name="wmt13:de-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt13_de-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt13_en_cs = LightevalTaskConfig(
+ name="wmt13:en-cs",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt13_en-cs",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt13_en_de = LightevalTaskConfig(
+ name="wmt13:en-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt13_en-de",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt13_en_es = LightevalTaskConfig(
+ name="wmt13:en-es",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt13_en-es",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt13_en_fr = LightevalTaskConfig(
+ name="wmt13:en-fr",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt13_en-fr",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt13_en_ru = LightevalTaskConfig(
+ name="wmt13:en-ru",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt13_en-ru",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt13_es_en = LightevalTaskConfig(
+ name="wmt13:es-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt13_es-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt13_fr_en = LightevalTaskConfig(
+ name="wmt13:fr-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt13_fr-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt13_ru_en = LightevalTaskConfig(
+ name="wmt13:ru-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt13_ru-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt14_cs_en = LightevalTaskConfig(
+ name="wmt14:cs-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt14_cs-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt14_de_en = LightevalTaskConfig(
+ name="wmt14:de-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt14_de-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt14_en_cs = LightevalTaskConfig(
+ name="wmt14:en-cs",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt14_en-cs",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt14_en_de = LightevalTaskConfig(
+ name="wmt14:en-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt14_en-de",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt14_en_fr = LightevalTaskConfig(
+ name="wmt14:en-fr",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="wmt14",
+ hf_subset="fr-en",
+ hf_avail_splits=["train", "validation", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+# NOTE: this redefinition of wmt14_en_fr (sacrebleu_manual data) shadows the
+# wmt14-repo config defined just above; the name ends up referring to this one.
+wmt14_en_fr = LightevalTaskConfig(
+ name="wmt14:en-fr",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt14_en-fr",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt14_en_hi = LightevalTaskConfig(
+ name="wmt14:en-hi",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt14_en-hi",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt14_en_ru = LightevalTaskConfig(
+ name="wmt14:en-ru",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt14_en-ru",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt14_fr_en = LightevalTaskConfig(
+ name="wmt14:fr-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="wmt14",
+ hf_subset="fr-en",
+ hf_avail_splits=["train", "validation", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+# NOTE: as with wmt14_en_fr above, this redefinition of wmt14_fr_en shadows the
+# wmt14-repo config defined just above it.
+wmt14_fr_en = LightevalTaskConfig(
+ name="wmt14:fr-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt14_fr-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt14_hi_en = LightevalTaskConfig(
+ name="wmt14:hi-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt14_hi-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt14_ru_en = LightevalTaskConfig(
+ name="wmt14:ru-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt14_ru-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt15_cs_en = LightevalTaskConfig(
+ name="wmt15:cs-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt15_cs-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt15_de_en = LightevalTaskConfig(
+ name="wmt15:de-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt15_de-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt15_en_cs = LightevalTaskConfig(
+ name="wmt15:en-cs",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt15_en-cs",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt15_en_de = LightevalTaskConfig(
+ name="wmt15:en-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt15_en-de",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt15_en_fi = LightevalTaskConfig(
+ name="wmt15:en-fi",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt15_en-fi",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt15_en_fr = LightevalTaskConfig(
+ name="wmt15:en-fr",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt15_en-fr",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt15_en_ru = LightevalTaskConfig(
+ name="wmt15:en-ru",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt15_en-ru",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt15_fi_en = LightevalTaskConfig(
+ name="wmt15:fi-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt15_fi-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt15_fr_en = LightevalTaskConfig(
+ name="wmt15:fr-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt15_fr-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt15_ru_en = LightevalTaskConfig(
+ name="wmt15:ru-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt15_ru-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt16_cs_en = LightevalTaskConfig(
+ name="wmt16:cs-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt16_cs-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt16_de_en = LightevalTaskConfig(
+ name="wmt16:de-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="wmt16",
+ hf_subset="de-en",
+ hf_avail_splits=["train", "validation", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt16_de_en = LightevalTaskConfig(
+ name="wmt16:de-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt16_de-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt16_en_cs = LightevalTaskConfig(
+ name="wmt16:en-cs",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt16_en-cs",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt16_en_de = LightevalTaskConfig(
+ name="wmt16:en-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="wmt16",
+ hf_subset="de-en",
+ hf_avail_splits=["train", "validation", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt16_en_de = LightevalTaskConfig(
+ name="wmt16:en-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt16_en-de",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt16_en_fi = LightevalTaskConfig(
+ name="wmt16:en-fi",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt16_en-fi",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt16_en_ro = LightevalTaskConfig(
+ name="wmt16:en-ro",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="wmt16",
+ hf_subset="ro-en",
+ hf_avail_splits=["train", "validation", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt16_en_ro = LightevalTaskConfig(
+ name="wmt16:en-ro",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt16_en-ro",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt16_en_ru = LightevalTaskConfig(
+ name="wmt16:en-ru",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt16_en-ru",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt16_en_tr = LightevalTaskConfig(
+ name="wmt16:en-tr",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt16_en-tr",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt16_fi_en = LightevalTaskConfig(
+ name="wmt16:fi-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt16_fi-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt16_ro_en = LightevalTaskConfig(
+ name="wmt16:ro-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="wmt16",
+ hf_subset="ro-en",
+ hf_avail_splits=["train", "validation", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt16_ro_en = LightevalTaskConfig(
+ name="wmt16:ro-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt16_ro-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt16_ru_en = LightevalTaskConfig(
+ name="wmt16:ru-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt16_ru-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt16_tr_en = LightevalTaskConfig(
+ name="wmt16:tr-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt16_tr-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt17_cs_en = LightevalTaskConfig(
+ name="wmt17:cs-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt17_cs-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt17_de_en = LightevalTaskConfig(
+ name="wmt17:de-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt17_de-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt17_en_cs = LightevalTaskConfig(
+ name="wmt17:en-cs",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt17_en-cs",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt17_en_de = LightevalTaskConfig(
+ name="wmt17:en-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt17_en-de",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt17_en_fi = LightevalTaskConfig(
+ name="wmt17:en-fi",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt17_en-fi",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt17_en_lv = LightevalTaskConfig(
+ name="wmt17:en-lv",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt17_en-lv",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt17_en_ru = LightevalTaskConfig(
+ name="wmt17:en-ru",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt17_en-ru",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt17_en_tr = LightevalTaskConfig(
+ name="wmt17:en-tr",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt17_en-tr",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt17_en_zh = LightevalTaskConfig(
+ name="wmt17:en-zh",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt17_en-zh",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt17_fi_en = LightevalTaskConfig(
+ name="wmt17:fi-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt17_fi-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt17_lv_en = LightevalTaskConfig(
+ name="wmt17:lv-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt17_lv-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt17_ru_en = LightevalTaskConfig(
+ name="wmt17:ru-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt17_ru-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt17_tr_en = LightevalTaskConfig(
+ name="wmt17:tr-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt17_tr-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt17_zh_en = LightevalTaskConfig(
+ name="wmt17:zh-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt17_zh-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt18_cs_en = LightevalTaskConfig(
+ name="wmt18:cs-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt18_cs-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt18_de_en = LightevalTaskConfig(
+ name="wmt18:de-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt18_de-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt18_en_cs = LightevalTaskConfig(
+ name="wmt18:en-cs",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt18_en-cs",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt18_en_de = LightevalTaskConfig(
+ name="wmt18:en-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt18_en-de",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt18_en_et = LightevalTaskConfig(
+ name="wmt18:en-et",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt18_en-et",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt18_en_fi = LightevalTaskConfig(
+ name="wmt18:en-fi",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt18_en-fi",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt18_en_ru = LightevalTaskConfig(
+ name="wmt18:en-ru",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt18_en-ru",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt18_en_tr = LightevalTaskConfig(
+ name="wmt18:en-tr",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt18_en-tr",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt18_en_zh = LightevalTaskConfig(
+ name="wmt18:en-zh",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt18_en-zh",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt18_et_en = LightevalTaskConfig(
+ name="wmt18:et-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt18_et-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt18_fi_en = LightevalTaskConfig(
+ name="wmt18:fi-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt18_fi-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt18_ru_en = LightevalTaskConfig(
+ name="wmt18:ru-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt18_ru-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt18_tr_en = LightevalTaskConfig(
+ name="wmt18:tr-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt18_tr-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt18_zh_en = LightevalTaskConfig(
+ name="wmt18:zh-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt18_zh-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_cs_de = LightevalTaskConfig(
+ name="wmt19:cs-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_cs-de",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_de_cs = LightevalTaskConfig(
+ name="wmt19:de-cs",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_de-cs",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_de_en = LightevalTaskConfig(
+ name="wmt19:de-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_de-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_de_fr = LightevalTaskConfig(
+ name="wmt19:de-fr",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_de-fr",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_en_cs = LightevalTaskConfig(
+ name="wmt19:en-cs",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_en-cs",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_en_de = LightevalTaskConfig(
+ name="wmt19:en-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_en-de",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_en_fi = LightevalTaskConfig(
+ name="wmt19:en-fi",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_en-fi",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_en_gu = LightevalTaskConfig(
+ name="wmt19:en-gu",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_en-gu",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_en_kk = LightevalTaskConfig(
+ name="wmt19:en-kk",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_en-kk",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_en_lt = LightevalTaskConfig(
+ name="wmt19:en-lt",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_en-lt",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_en_ru = LightevalTaskConfig(
+ name="wmt19:en-ru",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_en-ru",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_en_zh = LightevalTaskConfig(
+ name="wmt19:en-zh",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_en-zh",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_fi_en = LightevalTaskConfig(
+ name="wmt19:fi-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_fi-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_fr_de = LightevalTaskConfig(
+ name="wmt19:fr-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_fr-de",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_gu_en = LightevalTaskConfig(
+ name="wmt19:gu-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_gu-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_kk_en = LightevalTaskConfig(
+ name="wmt19:kk-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_kk-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_lt_en = LightevalTaskConfig(
+ name="wmt19:lt-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_lt-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_ru_en = LightevalTaskConfig(
+ name="wmt19:ru-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_ru-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt19_zh_en = LightevalTaskConfig(
+ name="wmt19:zh-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt19_zh-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_cs_en = LightevalTaskConfig(
+ name="wmt20:cs-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_cs-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_de_en = LightevalTaskConfig(
+ name="wmt20:de-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_de-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_de_fr = LightevalTaskConfig(
+ name="wmt20:de-fr",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_de-fr",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_en_cs = LightevalTaskConfig(
+ name="wmt20:en-cs",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_en-cs",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_en_de = LightevalTaskConfig(
+ name="wmt20:en-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_en-de",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_en_iu = LightevalTaskConfig(
+ name="wmt20:en-iu",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_en-iu",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_en_ja = LightevalTaskConfig(
+ name="wmt20:en-ja",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_en-ja",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_en_km = LightevalTaskConfig(
+ name="wmt20:en-km",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_en-km",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_en_pl = LightevalTaskConfig(
+ name="wmt20:en-pl",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_en-pl",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_en_ps = LightevalTaskConfig(
+ name="wmt20:en-ps",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_en-ps",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_en_ru = LightevalTaskConfig(
+ name="wmt20:en-ru",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_en-ru",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_en_ta = LightevalTaskConfig(
+ name="wmt20:en-ta",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_en-ta",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_en_zh = LightevalTaskConfig(
+ name="wmt20:en-zh",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_en-zh",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_fr_de = LightevalTaskConfig(
+ name="wmt20:fr-de",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_fr-de",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_iu_en = LightevalTaskConfig(
+ name="wmt20:iu-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_iu-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_ja_en = LightevalTaskConfig(
+ name="wmt20:ja-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_ja-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_km_en = LightevalTaskConfig(
+ name="wmt20:km-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_km-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_pl_en = LightevalTaskConfig(
+ name="wmt20:pl-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_pl-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_ps_en = LightevalTaskConfig(
+ name="wmt20:ps-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_ps-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_ru_en = LightevalTaskConfig(
+ name="wmt20:ru-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_ru-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_ta_en = LightevalTaskConfig(
+ name="wmt20:ta-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_ta-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wmt20_zh_en = LightevalTaskConfig(
+ name="wmt20:zh-en",
+ suite=["lighteval"],
+ prompt_function=prompt.wmt_reverse_alphabetical,
+ hf_repo="lighteval/sacrebleu_manual",
+ hf_subset="wmt20_zh-en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=None,
+ metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ wmt14_de_en,
+ wmt16_en_cs,
+ wmt19_en_cs,
+ wmt19_en_de,
+ wmt19_en_fi,
+ wmt19_en_gu,
+ wmt19_en_kk,
+ wmt19_en_lt,
+ wmt19_en_ru,
+ wmt19_en_zh,
+ wmt19_fi_en,
+ wmt19_fr_de,
+ wmt19_gu_en,
+ wmt19_kk_en,
+ wmt19_lt_en,
+ wmt19_ru_en,
+ wmt19_zh_en,
+ wmt20_cs_en,
+ wmt20_de_en,
+ wmt20_en_de,
+ wmt20_en_iu,
+ wmt20_en_ja,
+ wmt20_en_km,
+ wmt20_en_pl,
+ wmt20_en_ps,
+ wmt20_en_ru,
+ wmt20_en_ta,
+ wmt20_en_zh,
+ wmt20_fr_de,
+ wmt20_iu_en,
+ wmt20_ja_en,
+ wmt20_km_en,
+ wmt20_pl_en,
+ wmt20_ps_en,
+ wmt20_ru_en,
+ wmt20_ta_en,
+ wmt20_zh_en,
+]
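+
+
+# Illustrative sketch only (hypothetical helper, not wired into TASKS_TABLE):
+# every config above instantiates the same template. As in the configs above,
+# prompt.wmt_alphabetical is used when the source language code comes first
+# alphabetically in the pair, and prompt.wmt_reverse_alphabetical otherwise.
+def _wmt_pair_config(year: int, src: str, tgt: str) -> LightevalTaskConfig:
+    return LightevalTaskConfig(
+        name=f"wmt{year}:{src}-{tgt}",
+        suite=["lighteval"],
+        prompt_function=prompt.wmt_alphabetical if src < tgt else prompt.wmt_reverse_alphabetical,
+        hf_repo="lighteval/sacrebleu_manual",
+        hf_subset=f"wmt{year}_{src}-{tgt}",
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split=None,
+        few_shots_select=None,
+        generation_size=None,
+        metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter],
+        stop_sequence=["\n"],
+        version=0,
+    )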
diff --git a/src/lighteval/tasks/tasks/sciq.py b/src/lighteval/tasks/tasks/sciq.py
new file mode 100644
index 000000000..ed4285101
--- /dev/null
+++ b/src/lighteval/tasks/tasks/sciq.py
@@ -0,0 +1,48 @@
+"""
+name:
+Sciq
+
+dataset:
+allenai/sciq
+
+abstract:
+The SciQ dataset contains 13,679 crowdsourced science exam questions about
+Physics, Chemistry and Biology, among others. The questions are in
+multiple-choice format with 4 answer options each. For the majority of the
+questions, an additional paragraph with supporting evidence for the correct
+answer is provided.
+
+languages:
+english
+
+tags:
+physics, chemistry, biology, reasoning, multiple-choice, qa
+
+paper:
+https://arxiv.org/abs/1707.06209
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+sciq = LightevalTaskConfig(
+ name="sciq",
+ suite=["lighteval"],
+ prompt_function=prompt.sciq,
+ hf_repo="allenai/sciq",
+ hf_subset="default",
+ hf_avail_splits=["train", "validation", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ sciq,
+]
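+
+
+# A minimal sketch of what the prompt function used above does (the real
+# implementation is lighteval.tasks.default_prompts.sciq; the field names
+# follow the allenai/sciq dataset card, and the exact query wording here is an
+# assumption, shown for illustration only):
+def _sciq_prompt_sketch(line, task_name: str = None):
+    from lighteval.tasks.requests import Doc
+
+    choices = [line["distractor1"], line["distractor2"], line["distractor3"], line["correct_answer"]]
+    return Doc(
+        task_name=task_name,
+        query=f"{line['support']}\nQuestion: {line['question']}\nAnswer:",
+        choices=[f" {c}" for c in choices],
+        gold_index=3,  # correct_answer is the last entry in `choices`
+    )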
diff --git a/src/lighteval/tasks/tasks/simpleqa.py b/src/lighteval/tasks/tasks/simpleqa.py
new file mode 100644
index 000000000..31ab0e369
--- /dev/null
+++ b/src/lighteval/tasks/tasks/simpleqa.py
@@ -0,0 +1,45 @@
+"""
+name:
+Simpleqa
+
+dataset:
+lighteval/SimpleQA
+
+abstract:
+A factuality benchmark called SimpleQA that measures the ability of language
+models to answer short, fact-seeking questions.
+
+languages:
+english
+
+tags:
+factuality, general-knowledge, qa
+
+paper:
+https://openai.com/index/introducing-simpleqa/
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+simpleqa = LightevalTaskConfig(
+ name="simpleqa",
+ suite=["lighteval"],
+ prompt_function=prompt.simpleqa,
+ hf_repo="lighteval/SimpleQA",
+ hf_subset="default",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split="few_shot",
+ few_shots_select=None,
+ generation_size=2048,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ simpleqa,
+]
diff --git a/src/lighteval/tasks/tasks/siqa.py b/src/lighteval/tasks/tasks/siqa.py
new file mode 100644
index 000000000..e8e049bbf
--- /dev/null
+++ b/src/lighteval/tasks/tasks/siqa.py
@@ -0,0 +1,54 @@
+"""
+name:
+Siqa
+
+dataset:
+allenai/social_i_qa
+
+abstract:
+We introduce Social IQa: Social Interaction QA, a new question-answering
+benchmark for testing social commonsense intelligence. Contrary to many prior
+benchmarks that focus on physical or taxonomic knowledge, Social IQa focuses on
+reasoning about people's actions and their social implications. For example,
+given an action like "Jesse saw a concert" and a question like "Why did Jesse do
+this?", humans can easily infer that Jesse wanted "to see their favorite
+performer" or "to enjoy the music", and not "to see what's happening inside" or
+"to see if it works". The actions in Social IQa span a wide variety of social
+situations, and answer candidates contain both human-curated answers and
+adversarially-filtered machine-generated candidates. Social IQa contains over
+37,000 QA pairs for evaluating models' abilities to reason about the social
+implications of everyday events and situations.
+
+languages:
+english
+
+tags:
+commonsense, multiple-choice, qa
+
+paper:
+https://arxiv.org/abs/1904.09728
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+siqa = LightevalTaskConfig(
+ name="siqa",
+ suite=["lighteval"],
+ prompt_function=prompt.siqa,
+ hf_repo="allenai/social_i_qa",
+ hf_subset="default",
+ hf_avail_splits=["train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ siqa,
+]
diff --git a/community_tasks/slr_bench_evals.py b/src/lighteval/tasks/tasks/slr_bench.py
similarity index 55%
rename from community_tasks/slr_bench_evals.py
rename to src/lighteval/tasks/tasks/slr_bench.py
index b6d60ff43..bad487b57 100644
--- a/community_tasks/slr_bench_evals.py
+++ b/src/lighteval/tasks/tasks/slr_bench.py
@@ -1,68 +1,63 @@
-# MIT License
+"""
+name:
+SLR-Bench
-# Copyright (c) 2025 Lukas Helff
+dataset:
+AIML-TUDA/SLR-Bench
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
+abstract:
+SLR-Bench is a large-scale benchmark for scalable logical reasoning with
+language models, comprising 19,000 prompts organized into 20 curriculum levels.
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+languages:
+english
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
+tags:
+reasoning, symbolic
-"""
-SLR-Bench is a large-scale benchmark for scalable logical reasoning with language models, comprising 19,000 prompts organized into 20 curriculum levels.
-The tasks progressively increase in relational, arithmetic, and recursive complexity, requiring models to synthesize Prolog rules that classify train compositions.
-For more details see: https://huggingface.co/datasets/AIML-TUDA/SLR-Bench
-The paper can be found here: https://arxiv.org/abs/2506.15787
-Before using this task, please ensure that SWI-Prolog and evaluate are installed on your system, as they are required for symbolic verification of the generated Prolog programs.
+paper:
+https://arxiv.org/abs/2506.15787
"""
import logging
-import shutil
import numpy as np
-from evaluate import load
from lighteval.metrics.utils.metric_utils import SampleLevelComputation, SampleLevelMetric
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc, SamplingMethod
+from lighteval.utils.imports import is_package_available, requires
-logger = logging.getLogger(__name__)
-
-
-# Check for SWI-Prolog installation
-if shutil.which("swipl") is None:
- raise ImportError(
- "SWI-Prolog (swipl) is not installed or not in PATH. "
- "Please install SWI-Prolog to use this task. "
- "You can install required dependencies with: pip install -r community_tasks/slr_bench_requirements.txt"
- )
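+# "evaluate" is an optional dependency: import it lazily so this module can be
+# imported without it, and gate the task with @requires("evaluate") below.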
+if is_package_available("evaluate"):
+ from evaluate import load
+else:
+ load = None
-# Load the symbolic judge for evaluating Prolog programs
-symbolic_judge = load("AIML-TUDA/VerifiableRewardsForScalableLogicalReasoning")
+logger = logging.getLogger(__name__)
+@requires("evaluate")
def prompt_fn(line: dict, task_name: str):
"""Defines how to go from a dataset line to a doc object."""
+ # Check for SWI-Prolog installation
+ import shutil
+
+ if shutil.which("swipl") is None:
+ raise ImportError(
+ "SWI-Prolog (swipl) is not installed or not in PATH. Please install SWI-Prolog to use this task. "
+ )
+
return Doc(
task_name=task_name, query=line["prompt"], choices=[str(line.get("validation program", ""))], gold_index=0
)
class VerifiableRewardMetric(SampleLevelComputation):
+    # The symbolic judge is loaded lazily inside compute() so that importing
+    # this module does not require the optional "evaluate" package.
+
def compute(self, doc, model_response, **kwargs):
+ symbolic_judge = load("AIML-TUDA/VerifiableRewardsForScalableLogicalReasoning")
try:
prediction = model_response.final_text[0]
validation_program = doc.choices[0] if doc.choices else ""
diff --git a/src/lighteval/tasks/tasks/squad_v2.py b/src/lighteval/tasks/tasks/squad_v2.py
new file mode 100644
index 000000000..a05df9332
--- /dev/null
+++ b/src/lighteval/tasks/tasks/squad_v2.py
@@ -0,0 +1,59 @@
+"""
+name:
+Squad V2
+
+dataset:
+rajpurkar/squad_v2
+
+abstract:
+Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
+consisting of questions posed by crowdworkers on a set of Wikipedia articles,
+where the answer to every question is a segment of text, or span, from the
+corresponding reading passage, or the question might be unanswerable.
+SQuAD 2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000
+unanswerable questions written adversarially by crowdworkers to look similar to
+answerable ones. To do well on SQuAD2.0, systems must not only answer questions
+when possible, but also determine when no answer is supported by the paragraph
+and abstain from answering.
+
+languages:
+english
+
+tags:
+qa
+
+paper:
+https://arxiv.org/abs/1806.03822
+"""
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.templates.qa import get_qa_prompt_function
+from lighteval.utils.language import Language
+
+
+squad_v2 = LightevalTaskConfig(
+ name="squad_v2",
+ prompt_function=get_qa_prompt_function(
+ Language.ENGLISH,
+ lambda line: {
+ "question": line["question"],
+ "context": line["context"],
+ "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
+ },
+ ),
+ suite=("lighteval",),
+ hf_repo="rajpurkar/squad_v2",
+ hf_subset="squad_v2",
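+    # SQuAD v2 marks unanswerable questions with an empty answers["text"] list;
+    # the filter below keeps only answerable questions so every doc has a gold span.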
+ hf_filter=lambda line: any(ans for ans in line["answers"]["text"] if len(ans) > 0),
+ evaluation_splits=("validation",),
+ few_shots_split="train",
+ stop_sequence=["\n", "Question:", "question:"],
+ generation_size=200,
+ metrics=[Metrics.exact_match],
+ version=1,
+)
+
+TASKS_TABLE = [
+ squad_v2,
+]
diff --git a/src/lighteval/tasks/tasks/storycloze.py b/src/lighteval/tasks/tasks/storycloze.py
new file mode 100644
index 000000000..5fdd34c9c
--- /dev/null
+++ b/src/lighteval/tasks/tasks/storycloze.py
@@ -0,0 +1,63 @@
+"""
+name:
+Storycloze
+
+dataset:
+MoE-UNC/story_cloze
+
+abstract:
+The Story Cloze Test, from "A Corpus and Cloze Evaluation for Deeper
+Understanding of Commonsense Stories": given a four-sentence everyday story,
+the model must pick the correct story ending from two candidate fifth
+sentences.
+
+languages:
+english
+
+tags:
+narrative, reasoning
+
+paper:
+https://arxiv.org/abs/1604.01696
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+storycloze_2016 = LightevalTaskConfig(
+ name="storycloze:2016",
+ suite=["lighteval"],
+ prompt_function=prompt.storycloze,
+ hf_repo="MoE-UNC/story_cloze",
+ hf_subset="2016",
+ hf_avail_splits=["validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+storycloze_2018 = LightevalTaskConfig(
+ name="storycloze:2018",
+ suite=["lighteval"],
+ prompt_function=prompt.storycloze,
+ hf_repo="MoE-UNC/story_cloze",
+ hf_subset="2018",
+ hf_avail_splits=["validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ storycloze_2016,
+ storycloze_2018,
+]
diff --git a/src/lighteval/tasks/tasks/summarization.py b/src/lighteval/tasks/tasks/summarization.py
new file mode 100644
index 000000000..84deb9f01
--- /dev/null
+++ b/src/lighteval/tasks/tasks/summarization.py
@@ -0,0 +1,104 @@
+"""
+name:
+Summarization
+
+dataset:
+lighteval/summarization
+
+abstract:
+Summarization benchmarks over the CNN/DailyMail and XSum datasets.
+CNN/DailyMail was introduced in "Abstractive Text Summarization using
+Sequence-to-sequence RNNs and Beyond" and XSum in "Don't Give Me the Details,
+Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme
+Summarization".
+
+languages:
+english
+
+tags:
+summarization
+
+paper:
+https://aclanthology.org/D18-1206/
+https://aclanthology.org/K16-1028/
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+summarization_cnn_dm = LightevalTaskConfig(
+ name="summarization:cnn-dm",
+ suite=["lighteval"],
+ prompt_function=prompt.cnn_dm,
+ hf_repo="lighteval/summarization",
+ hf_subset="cnn-dm",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=128,
+ metrics=[
+ Metrics.rouge1,
+ Metrics.rouge2,
+ Metrics.rougeL,
+ Metrics.faithfulness,
+ Metrics.extractiveness,
+ Metrics.bert_score,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+summarization_xsum = LightevalTaskConfig(
+ name="summarization:xsum",
+ suite=["lighteval"],
+ prompt_function=prompt.xsum,
+ hf_repo="lighteval/summarization",
+ hf_subset="xsum",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=64,
+ metrics=[
+ Metrics.rouge1,
+ Metrics.rouge2,
+ Metrics.rougeL,
+ Metrics.faithfulness,
+ Metrics.extractiveness,
+ Metrics.bert_score,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+summarization_xsum_sampled = LightevalTaskConfig(
+ name="summarization:xsum-sampled",
+ suite=["lighteval"],
+ prompt_function=prompt.xsum,
+ hf_repo="lighteval/summarization",
+ hf_subset="xsum-sampled",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=64,
+ metrics=[
+ Metrics.rouge1,
+ Metrics.rouge2,
+ Metrics.rougeL,
+ Metrics.faithfulness,
+ Metrics.extractiveness,
+ Metrics.bert_score,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ summarization_cnn_dm,
+ summarization_xsum,
+ summarization_xsum_sampled,
+]
diff --git a/src/lighteval/tasks/tasks/swag.py b/src/lighteval/tasks/tasks/swag.py
new file mode 100644
index 000000000..7743a1c47
--- /dev/null
+++ b/src/lighteval/tasks/tasks/swag.py
@@ -0,0 +1,51 @@
+"""
+name:
+Swag
+
+dataset:
+allenai/swag
+
+abstract:
+The dataset consists of 113k multiple choice questions about grounded situations
+(73k training, 20k validation, 20k test). Each question is a video caption from
+LSMDC or ActivityNet Captions, with four answer choices about what might happen
+next in the scene. The correct answer is the (real) video caption for the next
+event in the video; the three incorrect answers are adversarially generated and
+human verified, so as to fool machines but not humans. SWAG aims to be a
+benchmark for evaluating grounded commonsense NLI and for learning
+representations.
+
+languages:
+english
+
+tags:
+narrative, reasoning
+
+paper:
+https://arxiv.org/abs/1808.05326
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+swag = LightevalTaskConfig(
+ name="swag",
+ suite=["lighteval"],
+ prompt_function=prompt.swag,
+ hf_repo="allenai/swag",
+ hf_subset="regular",
+ hf_avail_splits=["train", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ swag,
+]
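SWAG is scored with `loglikelihood_acc`: nothing is generated; each candidate ending is scored by its log-probability given the context, and the prediction is the highest-scoring choice. A minimal, framework-agnostic sketch of that decision rule (the `score_continuation` callable is a hypothetical stand-in for a model call, not a Lighteval API):

```python
from typing import Callable, Sequence


def loglikelihood_accuracy(
    score_continuation: Callable[[str, str], float],
    context: str,
    choices: Sequence[str],
    gold_index: int,
) -> int:
    """Return 1 if the highest-scoring choice is the gold one, else 0."""
    scores = [score_continuation(context, choice) for choice in choices]
    predicted = max(range(len(scores)), key=scores.__getitem__)
    return int(predicted == gold_index)
```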
diff --git a/src/lighteval/tasks/tasks/synthetic_reasoning.py b/src/lighteval/tasks/tasks/synthetic_reasoning.py
new file mode 100644
index 000000000..815e0e91a
--- /dev/null
+++ b/src/lighteval/tasks/tasks/synthetic_reasoning.py
@@ -0,0 +1,122 @@
+"""
+name:
+Synthetic Reasoning
+
+dataset:
+lighteval/synthetic_reasoning, lighteval/synthetic_reasoning_natural
+
+abstract:
+LIME: Learning Inductive Bias for Primitives of Mathematical Reasoning
+
+languages:
+english
+
+tags:
+reasoning
+
+paper:
+https://arxiv.org/abs/2206.03855
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+synthetic_reasoning_induction = LightevalTaskConfig(
+ name="synthetic_reasoning:induction",
+ suite=["lighteval"],
+ prompt_function=prompt.synthetic_reasoning,
+ hf_repo="lighteval/synthetic_reasoning",
+ hf_subset="induction",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=50,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+synthetic_reasoning_natural_easy = LightevalTaskConfig(
+ name="synthetic_reasoning:natural_easy",
+ suite=["lighteval"],
+ prompt_function=prompt.synthetic_reasoning_natural,
+ hf_repo="lighteval/synthetic_reasoning_natural",
+ hf_subset="easy",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+synthetic_reasoning_natural_hard = LightevalTaskConfig(
+ name="synthetic_reasoning:natural_hard",
+ suite=["lighteval"],
+ prompt_function=prompt.synthetic_reasoning_natural,
+ hf_repo="lighteval/synthetic_reasoning_natural",
+ hf_subset="hard",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+synthetic_reasoning_pattern_match = LightevalTaskConfig(
+ name="synthetic_reasoning:pattern_match",
+ suite=["lighteval"],
+ prompt_function=prompt.synthetic_reasoning,
+ hf_repo="lighteval/synthetic_reasoning",
+ hf_subset="pattern_match",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=50,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+synthetic_reasoning_variable_substitution = LightevalTaskConfig(
+ name="synthetic_reasoning:variable_substitution",
+ suite=["lighteval"],
+ prompt_function=prompt.synthetic_reasoning,
+ hf_repo="lighteval/synthetic_reasoning",
+ hf_subset="variable_substitution",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation", "test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=50,
+ metrics=[
+ Metrics.exact_match,
+ ],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ synthetic_reasoning_induction,
+ synthetic_reasoning_natural_easy,
+ synthetic_reasoning_natural_hard,
+ synthetic_reasoning_pattern_match,
+ synthetic_reasoning_variable_substitution,
+]
diff --git a/src/lighteval/tasks/tasks/the_pile.py b/src/lighteval/tasks/tasks/the_pile.py
new file mode 100644
index 000000000..3ed26d94e
--- /dev/null
+++ b/src/lighteval/tasks/tasks/the_pile.py
@@ -0,0 +1,351 @@
+"""
+name:
+The Pile
+
+dataset:
+lighteval/pile_helm
+
+abstract:
+The Pile corpus, used to measure language model performance across various domains.
+
+languages:
+english
+
+tags:
+language-modeling
+
+paper:
+https://arxiv.org/abs/2101.00027
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+the_pile_arxiv_helm = LightevalTaskConfig(
+ name="the_pile:arxiv",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="arxiv",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+the_pile_bibliotik_helm = LightevalTaskConfig(
+ name="the_pile:bibliotik",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="bibliotik",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+the_pile_commoncrawl_helm = LightevalTaskConfig(
+ name="the_pile:commoncrawl",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="commoncrawl",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+the_pile_dm_mathematics_helm = LightevalTaskConfig(
+ name="the_pile:dm-mathematics",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="dm-mathematics",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+the_pile_enron_helm = LightevalTaskConfig(
+ name="the_pile:enron",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="enron",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+the_pile_europarl_helm = LightevalTaskConfig(
+ name="the_pile:europarl",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="europarl",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+the_pile_freelaw_helm = LightevalTaskConfig(
+ name="the_pile:freelaw",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="freelaw",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+the_pile_github_helm = LightevalTaskConfig(
+ name="the_pile:github",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="github",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+the_pile_gutenberg_helm = LightevalTaskConfig(
+ name="the_pile:gutenberg",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="gutenberg",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+the_pile_hackernews_helm = LightevalTaskConfig(
+ name="the_pile:hackernews",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="hackernews",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+the_pile_nih_exporter_helm = LightevalTaskConfig(
+ name="the_pile:nih-exporter",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="nih-exporter",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+the_pile_opensubtitles_helm = LightevalTaskConfig(
+ name="the_pile:opensubtitles",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="opensubtitles",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+the_pile_openwebtext2_helm = LightevalTaskConfig(
+ name="the_pile:openwebtext2",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="openwebtext2",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+the_pile_pubmed_abstracts_helm = LightevalTaskConfig(
+ name="the_pile:pubmed-abstracts",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="pubmed-abstracts",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+the_pile_pubmed_central_helm = LightevalTaskConfig(
+ name="the_pile:pubmed-central",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="pubmed-central",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+the_pile_stackexchange_helm = LightevalTaskConfig(
+ name="the_pile:stackexchange",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="stackexchange",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+the_pile_uspto_helm = LightevalTaskConfig(
+    name="the_pile:uspto",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="uspto",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+the_pile_wikipedia_helm = LightevalTaskConfig(
+ name="the_pile:wikipedia",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="wikipedia",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+the_pile_youtubesubtitles_helm = LightevalTaskConfig(
+ name="the_pile:youtubesubtitles",
+ suite=["lighteval"],
+ prompt_function=prompt.the_pile,
+ hf_repo="lighteval/pile_helm",
+ hf_subset="youtubesubtitles",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ the_pile_arxiv_helm,
+ the_pile_bibliotik_helm,
+ the_pile_commoncrawl_helm,
+ the_pile_dm_mathematics_helm,
+ the_pile_enron_helm,
+ the_pile_europarl_helm,
+ the_pile_freelaw_helm,
+ the_pile_github_helm,
+ the_pile_gutenberg_helm,
+ the_pile_hackernews_helm,
+ the_pile_nih_exporter_helm,
+ the_pile_opensubtitles_helm,
+ the_pile_openwebtext2_helm,
+ the_pile_pubmed_abstracts_helm,
+ the_pile_pubmed_central_helm,
+ the_pile_stackexchange_helm,
+    the_pile_uspto_helm,
+ the_pile_wikipedia_helm,
+ the_pile_youtubesubtitles_helm,
+]
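All Pile subsets share the same three corpus-level metrics, which are different normalizations of one summed log-likelihood: word perplexity divides by the word count, byte perplexity by the byte count, and bits-per-byte is the byte-level cross entropy expressed in bits (i.e. log2 of the byte perplexity). A sketch of those standard definitions, assuming natural-log likelihoods — not a copy of Lighteval's metric code:

```python
import math


def perplexity_metrics(total_loglikelihood: float, num_words: int, num_bytes: int) -> dict:
    """Corpus-level perplexity quantities from a summed log-likelihood in nats."""
    word_perplexity = math.exp(-total_loglikelihood / num_words)
    byte_perplexity = math.exp(-total_loglikelihood / num_bytes)
    # Byte-level cross entropy converted from nats to bits: log2(byte_perplexity).
    bits_per_byte = -total_loglikelihood / (num_bytes * math.log(2))
    return {
        "word_perplexity": word_perplexity,
        "byte_perplexity": byte_perplexity,
        "bits_per_byte": bits_per_byte,
    }
```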
diff --git a/src/lighteval/tasks/extended/tiny_benchmarks/main.py b/src/lighteval/tasks/tasks/tiny_benchmarks/main.py
similarity index 86%
rename from src/lighteval/tasks/extended/tiny_benchmarks/main.py
rename to src/lighteval/tasks/tasks/tiny_benchmarks/main.py
index 44e05d0cc..bb8d0c2d1 100644
--- a/src/lighteval/tasks/extended/tiny_benchmarks/main.py
+++ b/src/lighteval/tasks/tasks/tiny_benchmarks/main.py
@@ -1,29 +1,24 @@
-# MIT License
-
-# Copyright (c) 2024 The HuggingFace Team & Felipe Maia Polo
+"""
+name:
+Tiny Benchmarks
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
+dataset:
+tinyBenchmarks/tinyWinogrande, tinyBenchmarks/tinyAI2_arc,
+tinyBenchmarks/tinyHellaswag, tinyBenchmarks/tinyMMLU,
+tinyBenchmarks/tinyTruthfulQA, tinyBenchmarks/tinyGSM8k
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+abstract:
+TinyBenchmarks provides small curated subsets of popular benchmarks (Winogrande,
+ARC, HellaSwag, MMLU, TruthfulQA, GSM8k) designed to estimate a model's score on
+the full benchmarks from far fewer examples.
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
+languages:
+english
-# ruff: noqa: F405, F403, F401
-"""See https://github.com/felipemaiapolo/tinyBenchmarks/ for the original code.
+tags:
+general-knowledge, reasoning, qa
-Test with `python run_evals_accelerate.py --model_args "pretrained=EleutherAI/pythia-70m" --tasks "extended|tiny:winogrande|0,extended|tiny:gsm8k|0,extended|tiny:hellaswag|0,extended|tiny:arc|0,extended|tiny:truthfulqa|0" --extended_tasks extended_tasks --output_dir "./evals"`
+paper:
+https://arxiv.org/abs/2402.14992
"""
import os
@@ -249,7 +244,7 @@ def compute_corpus(self, y_input):
task = LightevalTaskConfig(
name=f"tiny:{name}",
prompt_function=task["prompt"],
- suite=["extended"],
+ suite=["lighteval"],
hf_repo=task["dataset"],
hf_subset=task["subset"],
hf_avail_splits=task["splits"],
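Since this hunk moves the tiny benchmarks from the `extended` suite to the `lighteval` suite, the task specifiers from the removed docstring's example command change prefix accordingly. Assuming the usual `suite|task|num_fewshot` spelling — only the suite rename itself is taken from the diff; the exact CLI invocation may differ:

```python
# Hypothetical task specifiers after the suite rename; the removed docstring
# listed the same tasks with an "extended|" prefix.
TINY_TASK_SPECS = [
    "lighteval|tiny:winogrande|0",
    "lighteval|tiny:gsm8k|0",
    "lighteval|tiny:hellaswag|0",
    "lighteval|tiny:arc|0",
    "lighteval|tiny:truthfulqa|0",
]
```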
diff --git a/src/lighteval/tasks/tasks/toxigen.py b/src/lighteval/tasks/tasks/toxigen.py
new file mode 100644
index 000000000..c5e724a9d
--- /dev/null
+++ b/src/lighteval/tasks/tasks/toxigen.py
@@ -0,0 +1,45 @@
+"""
+name:
+Toxigen
+
+dataset:
+skg/toxigen-data
+
+abstract:
+ToxiGen is a dataset for implicit hate speech detection. All instances were
+generated with GPT-3 using the prompting methods described in the ToxiGen paper.
+
+languages:
+english
+
+tags:
+generation, safety
+
+paper:
+https://arxiv.org/abs/2203.09509
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+toxigen = LightevalTaskConfig(
+ name="toxigen",
+ suite=["lighteval"],
+ prompt_function=prompt.toxigen,
+ hf_repo="skg/toxigen-data",
+ hf_subset="annotated",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ toxigen,
+]
diff --git a/src/lighteval/tasks/tasks/triviaqa.py b/src/lighteval/tasks/tasks/triviaqa.py
new file mode 100644
index 000000000..b3e13d553
--- /dev/null
+++ b/src/lighteval/tasks/tasks/triviaqa.py
@@ -0,0 +1,48 @@
+"""
+name:
+Triviaqa
+
+dataset:
+mandarjoshi/trivia_qa
+
+abstract:
+TriviaQA is a reading comprehension dataset containing over 650K
+question-answer-evidence triples. TriviaQA includes 95K question-answer pairs
+authored by trivia enthusiasts and independently gathered evidence documents,
+six per question on average, that provide high quality distant supervision for
+answering the questions.
+
+languages:
+english
+
+tags:
+qa
+
+paper:
+https://arxiv.org/abs/1705.03551
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+triviaqa = LightevalTaskConfig(
+ name="triviaqa",
+ suite=["lighteval"],
+ prompt_function=prompt.triviaqa,
+ hf_repo="mandarjoshi/trivia_qa",
+ hf_subset="rc.nocontext",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=20,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n", ".", ","],
+ version=0,
+)
+
+TASKS_TABLE = [
+ triviaqa,
+]
diff --git a/src/lighteval/tasks/tasks/truthfulqa.py b/src/lighteval/tasks/tasks/truthfulqa.py
new file mode 100644
index 000000000..84db92ed6
--- /dev/null
+++ b/src/lighteval/tasks/tasks/truthfulqa.py
@@ -0,0 +1,61 @@
+"""
+name:
+Truthfulqa
+
+dataset:
+EleutherAI/truthful_qa_mc
+
+abstract:
+TruthfulQA: Measuring How Models Mimic Human Falsehoods
+
+languages:
+english
+
+tags:
+factuality, qa
+
+paper:
+https://arxiv.org/abs/2109.07958
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+truthfulqa_gen = LightevalTaskConfig(
+ name="truthfulqa:gen",
+ suite=["lighteval"],
+ prompt_function=prompt.truthful_qa_generative,
+ hf_repo="truthfulqa/truthful_qa",
+ hf_subset="generation",
+ hf_avail_splits=["validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=200,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+truthfulqa_mc = LightevalTaskConfig(
+ name="truthfulqa:mc",
+ suite=["lighteval"],
+ prompt_function=prompt.truthful_qa_multiple_choice,
+ hf_repo="truthfulqa/truthful_qa",
+ hf_subset="multiple_choice",
+ hf_avail_splits=["validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.truthfulqa_mc_metrics],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ truthfulqa_gen,
+ truthfulqa_mc,
+]
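`truthfulqa:mc` relies on `truthfulqa_mc_metrics`, conventionally reported as MC1 (is the single best answer the top-scoring choice?) and MC2 (normalized probability mass assigned to the set of true answers). A hedged sketch of those common definitions for one item, given per-choice log-likelihoods — not Lighteval's implementation:

```python
import math
from typing import Sequence


def truthfulqa_mc1(logliks: Sequence[float], gold_indices: Sequence[int]) -> float:
    """1.0 if the best-scoring choice is one of the correct answers, else 0.0."""
    best = max(range(len(logliks)), key=logliks.__getitem__)
    return float(best in gold_indices)


def truthfulqa_mc2(logliks: Sequence[float], gold_indices: Sequence[int]) -> float:
    """Probability mass on the true answers after normalizing over all choices."""
    probs = [math.exp(ll) for ll in logliks]
    total = sum(probs)
    return sum(probs[i] for i in gold_indices) / total
```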
diff --git a/src/lighteval/tasks/tasks/twitterAAE.py b/src/lighteval/tasks/tasks/twitterAAE.py
new file mode 100644
index 000000000..dd9861f91
--- /dev/null
+++ b/src/lighteval/tasks/tasks/twitterAAE.py
@@ -0,0 +1,62 @@
+"""
+name:
+Twitteraae
+
+dataset:
+lighteval/twitterAAE
+
+abstract:
+Demographic Dialectal Variation in Social Media: A Case Study of African-American English
+
+languages:
+english
+
+tags:
+language-modeling
+
+paper:
+https://aclanthology.org/D16-1120/
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+twitterAAE_aa = LightevalTaskConfig(
+ name="twitterAAE:aa",
+ suite=["lighteval"],
+ prompt_function=prompt.twitter_aae,
+ hf_repo="lighteval/twitterAAE",
+ hf_subset="aa",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+
+twitterAAE_white = LightevalTaskConfig(
+ name="twitterAAE:white",
+ suite=["lighteval"],
+ prompt_function=prompt.twitter_aae,
+ hf_repo="lighteval/twitterAAE",
+ hf_subset="white",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ twitterAAE_aa,
+ twitterAAE_white,
+]
diff --git a/src/lighteval/tasks/tasks/unscramble.py b/src/lighteval/tasks/tasks/unscramble.py
new file mode 100644
index 000000000..eb8335026
--- /dev/null
+++ b/src/lighteval/tasks/tasks/unscramble.py
@@ -0,0 +1,113 @@
+"""
+name:
+Unscramble
+
+dataset:
+lighteval/GPT3_unscramble
+
+abstract:
+Benchmark in which the model must recover the original word from a scrambled
+form: anagrams, cycled letters, random insertions, and reversed words.
+
+languages:
+english
+
+tags:
+language-modeling, reasoning
+
+paper:
+https://huggingface.co/datasets/lighteval/GPT3_unscramble
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+unscramble_anagrams1 = LightevalTaskConfig(
+ name="unscramble:anagrams1",
+ suite=["lighteval"],
+ prompt_function=prompt.unscramble,
+ hf_repo="lighteval/GPT3_unscramble",
+ hf_subset="default",
+ hf_avail_splits=["mid_word_1_anagrams"],
+ evaluation_splits=["mid_word_1_anagrams"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+unscramble_anagrams2 = LightevalTaskConfig(
+ name="unscramble:anagrams2",
+ suite=["lighteval"],
+ prompt_function=prompt.unscramble,
+ hf_repo="lighteval/GPT3_unscramble",
+ hf_subset="default",
+ hf_avail_splits=["mid_word_2_anagrams"],
+ evaluation_splits=["mid_word_2_anagrams"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+unscramble_cycle_letters = LightevalTaskConfig(
+ name="unscramble:cycle_letters",
+ suite=["lighteval"],
+ prompt_function=prompt.unscramble,
+ hf_repo="lighteval/GPT3_unscramble",
+ hf_subset="default",
+ hf_avail_splits=["cycle_letters_in_word"],
+ evaluation_splits=["cycle_letters_in_word"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+unscramble_random_insertion = LightevalTaskConfig(
+ name="unscramble:random_insertion",
+ suite=["lighteval"],
+ prompt_function=prompt.unscramble,
+ hf_repo="lighteval/GPT3_unscramble",
+ hf_subset="default",
+ hf_avail_splits=["random_insertion_in_word"],
+ evaluation_splits=["random_insertion_in_word"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+unscramble_reversed_words = LightevalTaskConfig(
+ name="unscramble:reversed_words",
+ suite=["lighteval"],
+ prompt_function=prompt.unscramble,
+ hf_repo="lighteval/GPT3_unscramble",
+ hf_subset="default",
+ hf_avail_splits=["reversed_words"],
+ evaluation_splits=["reversed_words"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=5,
+ metrics=[Metrics.exact_match(sample_params={"strip_strings": False})],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ unscramble_anagrams1,
+ unscramble_anagrams2,
+ unscramble_cycle_letters,
+ unscramble_random_insertion,
+ unscramble_reversed_words,
+]
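Every unscramble subset scores with `exact_match` configured with `strip_strings=False`, so the generated word must reproduce the target verbatim, whitespace included, once generation stops at the newline stop sequence. A minimal sketch of that comparison (illustrative only, not the library's metric code):

```python
def exact_match(prediction: str, gold: str, strip_strings: bool = False) -> int:
    """Score 1 only when prediction and gold are identical strings."""
    if strip_strings:
        prediction, gold = prediction.strip(), gold.strip()
    return int(prediction == gold)


# With strip_strings=False, as configured above, trailing whitespace matters.
assert exact_match("cabbage", "cabbage") == 1
assert exact_match("cabbage ", "cabbage") == 0
assert exact_match("cabbage ", "cabbage", strip_strings=True) == 1
```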
diff --git a/src/lighteval/tasks/tasks/webqs.py b/src/lighteval/tasks/tasks/webqs.py
new file mode 100644
index 000000000..493b83f75
--- /dev/null
+++ b/src/lighteval/tasks/tasks/webqs.py
@@ -0,0 +1,47 @@
+"""
+name:
+Webqs
+
+dataset:
+stanfordnlp/web_questions
+
+abstract:
+This dataset consists of 6,642 question/answer pairs. The questions are popular
+ones asked on the web, are mostly centered around a single named entity, and are
+meant to be answerable using Freebase, a large knowledge graph.
+
+languages:
+english
+
+tags:
+qa
+
+paper:
+https://aclanthology.org/D13-1160.pdf
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+webqs = LightevalTaskConfig(
+ name="webqs",
+ suite=["lighteval"],
+ prompt_function=prompt.webqs,
+ hf_repo="stanfordnlp/web_questions",
+ hf_subset="default",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ webqs,
+]
diff --git a/src/lighteval/tasks/tasks/wikifact.py b/src/lighteval/tasks/tasks/wikifact.py
new file mode 100644
index 000000000..592491379
--- /dev/null
+++ b/src/lighteval/tasks/tasks/wikifact.py
@@ -0,0 +1,1453 @@
+"""
+name:
+Wikifact
+
+dataset:
+lighteval/wikifact
+
+abstract:
+Extensively tests factual knowledge by asking the model to complete short factual
+statements built from Wikidata relations (author, capital, place_of_birth, ...).
+
+languages:
+english
+
+tags:
+factuality, knowledge
+
+paper:
+https://aclanthology.org/D19-1250/
+"""
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks import default_prompts as prompt
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+wikifact_applies_to_jurisdiction = LightevalTaskConfig(
+ name="wikifact:applies_to_jurisdiction",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="applies_to_jurisdiction",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_atomic_number = LightevalTaskConfig(
+ name="wikifact:atomic_number",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="atomic_number",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_author = LightevalTaskConfig(
+ name="wikifact:author",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="author",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_award_received = LightevalTaskConfig(
+ name="wikifact:award_received",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="award_received",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_basic_form_of_government = LightevalTaskConfig(
+ name="wikifact:basic_form_of_government",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="basic_form_of_government",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_capital = LightevalTaskConfig(
+ name="wikifact:capital",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="capital",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_capital_of = LightevalTaskConfig(
+ name="wikifact:capital_of",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="capital_of",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_central_bank = LightevalTaskConfig(
+ name="wikifact:central_bank",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="central_bank",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_composer = LightevalTaskConfig(
+ name="wikifact:composer",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="composer",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_continent = LightevalTaskConfig(
+ name="wikifact:continent",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="continent",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_country = LightevalTaskConfig(
+ name="wikifact:country",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="country",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_country_of_citizenship = LightevalTaskConfig(
+ name="wikifact:country_of_citizenship",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="country_of_citizenship",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_country_of_origin = LightevalTaskConfig(
+ name="wikifact:country_of_origin",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="country_of_origin",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_creator = LightevalTaskConfig(
+ name="wikifact:creator",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="creator",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_currency = LightevalTaskConfig(
+ name="wikifact:currency",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="currency",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_defendant = LightevalTaskConfig(
+ name="wikifact:defendant",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="defendant",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_developer = LightevalTaskConfig(
+ name="wikifact:developer",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="developer",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_diplomatic_relation = LightevalTaskConfig(
+ name="wikifact:diplomatic_relation",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="diplomatic_relation",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_director = LightevalTaskConfig(
+ name="wikifact:director",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="director",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_discoverer_or_inventor = LightevalTaskConfig(
+ name="wikifact:discoverer_or_inventor",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="discoverer_or_inventor",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_drug_or_therapy_used_for_treatment = LightevalTaskConfig(
+ name="wikifact:drug_or_therapy_used_for_treatment",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="drug_or_therapy_used_for_treatment",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_educated_at = LightevalTaskConfig(
+ name="wikifact:educated_at",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="educated_at",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_electron_configuration = LightevalTaskConfig(
+ name="wikifact:electron_configuration",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="electron_configuration",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_employer = LightevalTaskConfig(
+ name="wikifact:employer",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="employer",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_field_of_work = LightevalTaskConfig(
+ name="wikifact:field_of_work",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="field_of_work",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_file_extension = LightevalTaskConfig(
+ name="wikifact:file_extension",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="file_extension",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_genetic_association = LightevalTaskConfig(
+ name="wikifact:genetic_association",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="genetic_association",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_genre = LightevalTaskConfig(
+ name="wikifact:genre",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="genre",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_has_part = LightevalTaskConfig(
+ name="wikifact:has_part",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="has_part",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_head_of_government = LightevalTaskConfig(
+ name="wikifact:head_of_government",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="head_of_government",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_head_of_state = LightevalTaskConfig(
+ name="wikifact:head_of_state",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="head_of_state",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_headquarters_location = LightevalTaskConfig(
+ name="wikifact:headquarters_location",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="headquarters_location",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_industry = LightevalTaskConfig(
+ name="wikifact:industry",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="industry",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_influenced_by = LightevalTaskConfig(
+ name="wikifact:influenced_by",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="influenced_by",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_instance_of = LightevalTaskConfig(
+ name="wikifact:instance_of",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="instance_of",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_instrument = LightevalTaskConfig(
+ name="wikifact:instrument",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="instrument",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_language_of_work_or_name = LightevalTaskConfig(
+ name="wikifact:language_of_work_or_name",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="language_of_work_or_name",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_languages_spoken_written_or_signed = LightevalTaskConfig(
+ name="wikifact:languages_spoken_written_or_signed",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="languages_spoken_written_or_signed",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_laws_applied = LightevalTaskConfig(
+ name="wikifact:laws_applied",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="laws_applied",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_located_in_the_administrative_territorial_entity = LightevalTaskConfig(
+ name="wikifact:located_in_the_administrative_territorial_entity",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="located_in_the_administrative_territorial_entity",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_location = LightevalTaskConfig(
+ name="wikifact:location",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="location",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_location_of_discovery = LightevalTaskConfig(
+ name="wikifact:location_of_discovery",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="location_of_discovery",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_location_of_formation = LightevalTaskConfig(
+ name="wikifact:location_of_formation",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="location_of_formation",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_majority_opinion_by = LightevalTaskConfig(
+ name="wikifact:majority_opinion_by",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="majority_opinion_by",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_manufacturer = LightevalTaskConfig(
+ name="wikifact:manufacturer",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="manufacturer",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_measured_physical_quantity = LightevalTaskConfig(
+ name="wikifact:measured_physical_quantity",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="measured_physical_quantity",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_medical_condition_treated = LightevalTaskConfig(
+ name="wikifact:medical_condition_treated",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="medical_condition_treated",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_member_of = LightevalTaskConfig(
+ name="wikifact:member_of",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="member_of",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_member_of_political_party = LightevalTaskConfig(
+ name="wikifact:member_of_political_party",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="member_of_political_party",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_member_of_sports_team = LightevalTaskConfig(
+ name="wikifact:member_of_sports_team",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="member_of_sports_team",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_movement = LightevalTaskConfig(
+ name="wikifact:movement",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="movement",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_named_after = LightevalTaskConfig(
+ name="wikifact:named_after",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="named_after",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_native_language = LightevalTaskConfig(
+ name="wikifact:native_language",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="native_language",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_number_of_processor_cores = LightevalTaskConfig(
+ name="wikifact:number_of_processor_cores",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="number_of_processor_cores",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_occupation = LightevalTaskConfig(
+ name="wikifact:occupation",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="occupation",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_office_held_by_head_of_government = LightevalTaskConfig(
+ name="wikifact:office_held_by_head_of_government",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="office_held_by_head_of_government",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_office_held_by_head_of_state = LightevalTaskConfig(
+ name="wikifact:office_held_by_head_of_state",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="office_held_by_head_of_state",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_official_language = LightevalTaskConfig(
+ name="wikifact:official_language",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="official_language",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_operating_system = LightevalTaskConfig(
+ name="wikifact:operating_system",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="operating_system",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_original_language_of_film_or_TV_show = LightevalTaskConfig(
+ name="wikifact:original_language_of_film_or_TV_show",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="original_language_of_film_or_TV_show",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_original_network = LightevalTaskConfig(
+ name="wikifact:original_network",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="original_network",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_overrules = LightevalTaskConfig(
+ name="wikifact:overrules",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="overrules",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_owned_by = LightevalTaskConfig(
+ name="wikifact:owned_by",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="owned_by",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_part_of = LightevalTaskConfig(
+ name="wikifact:part_of",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="part_of",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_participating_team = LightevalTaskConfig(
+ name="wikifact:participating_team",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="participating_team",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_place_of_birth = LightevalTaskConfig(
+ name="wikifact:place_of_birth",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="place_of_birth",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_place_of_death = LightevalTaskConfig(
+ name="wikifact:place_of_death",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="place_of_death",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_plaintiff = LightevalTaskConfig(
+ name="wikifact:plaintiff",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="plaintiff",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_position_held = LightevalTaskConfig(
+ name="wikifact:position_held",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="position_held",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_position_played_on_team = LightevalTaskConfig(
+ name="wikifact:position_played_on_team",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="position_played_on_team",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_programming_language = LightevalTaskConfig(
+ name="wikifact:programming_language",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="programming_language",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_recommended_unit_of_measurement = LightevalTaskConfig(
+ name="wikifact:recommended_unit_of_measurement",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="recommended_unit_of_measurement",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_record_label = LightevalTaskConfig(
+ name="wikifact:record_label",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="record_label",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_religion = LightevalTaskConfig(
+ name="wikifact:religion",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="religion",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_repealed_by = LightevalTaskConfig(
+ name="wikifact:repealed_by",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="repealed_by",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_shares_border_with = LightevalTaskConfig(
+ name="wikifact:shares_border_with",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="shares_border_with",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_solved_by = LightevalTaskConfig(
+ name="wikifact:solved_by",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="solved_by",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_statement_describes = LightevalTaskConfig(
+ name="wikifact:statement_describes",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="statement_describes",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_stock_exchange = LightevalTaskConfig(
+ name="wikifact:stock_exchange",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="stock_exchange",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_subclass_of = LightevalTaskConfig(
+ name="wikifact:subclass_of",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="subclass_of",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_subsidiary = LightevalTaskConfig(
+ name="wikifact:subsidiary",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="subsidiary",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_symptoms_and_signs = LightevalTaskConfig(
+ name="wikifact:symptoms_and_signs",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="symptoms_and_signs",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_therapeutic_area = LightevalTaskConfig(
+ name="wikifact:therapeutic_area",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="therapeutic_area",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_time_of_discovery_or_invention = LightevalTaskConfig(
+ name="wikifact:time_of_discovery_or_invention",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="time_of_discovery_or_invention",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_twinned_administrative_body = LightevalTaskConfig(
+ name="wikifact:twinned_administrative_body",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="twinned_administrative_body",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+wikifact_work_location = LightevalTaskConfig(
+ name="wikifact:work_location",
+ suite=["lighteval"],
+ prompt_function=prompt.wikifact,
+ hf_repo="lighteval/wikifact",
+ hf_subset="work_location",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=8,
+ metrics=[Metrics.exact_match],
+ stop_sequence=["\n"],
+ version=0,
+)
+
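+# Each wikifact relation above is its own HF subset, evaluated with a short
+# completion (generation_size=8, stopping at the first newline) scored by exact
+# match against the gold answer. TASKS_TABLE is the module-level list imported by
+# lighteval to register tasks: a config defined above but omitted here is never exposed.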
+TASKS_TABLE = [
+ wikifact_applies_to_jurisdiction,
+ wikifact_atomic_number,
+ wikifact_author,
+ wikifact_employer,
+ wikifact_field_of_work,
+ wikifact_file_extension,
+ wikifact_genetic_association,
+ wikifact_instrument,
+ wikifact_language_of_work_or_name,
+ wikifact_languages_spoken_written_or_signed,
+ wikifact_laws_applied,
+ wikifact_located_in_the_administrative_territorial_entity,
+ wikifact_location,
+ wikifact_location_of_discovery,
+ wikifact_location_of_formation,
+ wikifact_member_of,
+ wikifact_member_of_political_party,
+ wikifact_member_of_sports_team,
+ wikifact_movement,
+ wikifact_headquarters_location,
+ wikifact_industry,
+ wikifact_named_after,
+ wikifact_native_language,
+ wikifact_number_of_processor_cores,
+ wikifact_occupation,
+ wikifact_original_language_of_film_or_TV_show,
+ wikifact_original_network,
+ wikifact_overrules,
+ wikifact_owned_by,
+ wikifact_part_of,
+ wikifact_participating_team,
+ wikifact_place_of_birth,
+ wikifact_place_of_death,
+    wikifact_plaintiff,
+    wikifact_position_held,
+    wikifact_position_played_on_team,
+ wikifact_programming_language,
+ wikifact_recommended_unit_of_measurement,
+ wikifact_record_label,
+ wikifact_religion,
+ wikifact_repealed_by,
+ wikifact_shares_border_with,
+ wikifact_solved_by,
+ wikifact_statement_describes,
+ wikifact_stock_exchange,
+ wikifact_subclass_of,
+ wikifact_subsidiary,
+ wikifact_symptoms_and_signs,
+ wikifact_therapeutic_area,
+ wikifact_time_of_discovery_or_invention,
+ wikifact_twinned_administrative_body,
+ wikifact_work_location,
+]
diff --git a/src/lighteval/tasks/tasks/wikitext.py b/src/lighteval/tasks/tasks/wikitext.py
new file mode 100644
index 000000000..a6f62e90b
--- /dev/null
+++ b/src/lighteval/tasks/tasks/wikitext.py
@@ -0,0 +1,50 @@
+"""
+name:
+Wikitext
+
+dataset:
+EleutherAI/wikitext_document_level
+
+abstract:
+The WikiText language modeling dataset is a collection of over 100 million
+tokens extracted from the set of verified Good and Featured articles on
+Wikipedia. The dataset is available under the Creative Commons
+Attribution-ShareAlike License.
+
+languages:
+english
+
+tags:
+language-modeling
+
+paper:
+https://arxiv.org/abs/1609.07843
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
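+# This is a pure language-modeling task: generation_size=-1 means no text is
+# generated, and the perplexity metrics below (word/byte perplexity, bits per byte)
+# are computed from the model's log-likelihood of the reference documents.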
+wikitext_103_document_level = LightevalTaskConfig(
+ name="wikitext:103:document_level",
+ suite=["lighteval"],
+ prompt_function=prompt.wikitext_helm,
+ hf_repo="EleutherAI/wikitext_document_level",
+ hf_subset="wikitext-103-raw-v1",
+ hf_avail_splits=["train", "test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ wikitext_103_document_level,
+]
diff --git a/src/lighteval/tasks/tasks/winogrande.py b/src/lighteval/tasks/tasks/winogrande.py
new file mode 100644
index 000000000..bcc49899b
--- /dev/null
+++ b/src/lighteval/tasks/tasks/winogrande.py
@@ -0,0 +1,52 @@
+"""
+name:
+Winogrande
+
+dataset:
+allenai/winogrande
+
+abstract:
+WinoGrande is a collection of 44k problems, inspired by the Winograd Schema
+Challenge (Levesque, Davis, and Morgenstern 2011), but adjusted to improve both
+scale and robustness against dataset-specific bias. Formulated as a
+fill-in-the-blank task with binary options, the goal is to choose the right
+option for a given sentence that requires commonsense reasoning.
+
+languages:
+english
+
+tags:
+commonsense, multiple-choice
+
+paper:
+https://arxiv.org/abs/1907.10641
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
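+# Scored via log-likelihood rather than generation (hence generation_size=-1):
+# loglikelihood_acc checks whether the gold completion of the two options gets
+# the higher probability. few_shots_select="random_sampling" controls how
+# in-context examples are drawn when few-shot evaluation is requested.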
+winogrande = LightevalTaskConfig(
+ name="winogrande",
+ suite=["lighteval"],
+ prompt_function=prompt.winogrande,
+ hf_repo="allenai/winogrande",
+ hf_subset="winogrande_xl",
+ hf_avail_splits=["train", "test", "validation"],
+ evaluation_splits=["validation"],
+ few_shots_split=None,
+ few_shots_select="random_sampling",
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ winogrande,
+]
diff --git a/src/lighteval/tasks/tasks/xcopa.py b/src/lighteval/tasks/tasks/xcopa.py
new file mode 100644
index 000000000..6b51be639
--- /dev/null
+++ b/src/lighteval/tasks/tasks/xcopa.py
@@ -0,0 +1,233 @@
+"""
+name:
+Xcopa
+
+dataset:
+cambridgeltl/xcopa
+
+abstract:
+XCOPA: A Multilingual Dataset for Causal Commonsense Reasoning. The Cross-lingual
+Choice of Plausible Alternatives dataset is a benchmark to evaluate the ability
+of machine learning models to transfer commonsense reasoning across languages.
+
+languages:
+english, estonian, haitian creole, indonesian, italian, quechua, swahili, chinese, tamil, thai, turkish, vietnamese
+
+tags:
+commonsense, multilingual, multiple-choice, reasoning
+
+paper:
+https://arxiv.org/abs/2005.00333
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+xcopa_en = LightevalTaskConfig(
+ name="xcopa:en",
+ suite=["lighteval"],
+ prompt_function=prompt.xcopa_en,
+ hf_repo="cambridgeltl/xcopa",
+ hf_subset="default",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xcopa_et = LightevalTaskConfig(
+ name="xcopa:et",
+ suite=["lighteval"],
+ prompt_function=prompt.xcopa_et,
+ hf_repo="cambridgeltl/xcopa",
+ hf_subset="et",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xcopa_ht = LightevalTaskConfig(
+ name="xcopa:ht",
+ suite=["lighteval"],
+ prompt_function=prompt.xcopa_ht,
+ hf_repo="cambridgeltl/xcopa",
+ hf_subset="ht",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xcopa_it = LightevalTaskConfig(
+ name="xcopa:it",
+ suite=["lighteval"],
+ prompt_function=prompt.xcopa_it,
+ hf_repo="cambridgeltl/xcopa",
+ hf_subset="it",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xcopa_id = LightevalTaskConfig(
+ name="xcopa:id",
+ suite=["lighteval"],
+ prompt_function=prompt.xcopa_id,
+ hf_repo="cambridgeltl/xcopa",
+ hf_subset="id",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xcopa_qu = LightevalTaskConfig(
+ name="xcopa:qu",
+ suite=["lighteval"],
+ prompt_function=prompt.xcopa_qu,
+ hf_repo="cambridgeltl/xcopa",
+ hf_subset="qu",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xcopa_sw = LightevalTaskConfig(
+ name="xcopa:sw",
+ suite=["lighteval"],
+ prompt_function=prompt.xcopa_sw,
+ hf_repo="cambridgeltl/xcopa",
+ hf_subset="sw",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xcopa_zh = LightevalTaskConfig(
+ name="xcopa:zh",
+ suite=["lighteval"],
+ prompt_function=prompt.xcopa_zh,
+ hf_repo="cambridgeltl/xcopa",
+ hf_subset="zh",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xcopa_ta = LightevalTaskConfig(
+ name="xcopa:ta",
+ suite=["lighteval"],
+ prompt_function=prompt.xcopa_ta,
+ hf_repo="cambridgeltl/xcopa",
+ hf_subset="ta",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xcopa_th = LightevalTaskConfig(
+ name="xcopa:th",
+ suite=["lighteval"],
+ prompt_function=prompt.xcopa_th,
+ hf_repo="cambridgeltl/xcopa",
+ hf_subset="th",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xcopa_tr = LightevalTaskConfig(
+ name="xcopa:tr",
+ suite=["lighteval"],
+ prompt_function=prompt.xcopa_tr,
+ hf_repo="cambridgeltl/xcopa",
+ hf_subset="tr",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xcopa_vi = LightevalTaskConfig(
+ name="xcopa:vi",
+ suite=["lighteval"],
+ prompt_function=prompt.xcopa_vi,
+ hf_repo="cambridgeltl/xcopa",
+ hf_subset="vi",
+ hf_avail_splits=["test", "train", "validation"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ xcopa_en,
+ xcopa_et,
+ xcopa_ht,
+ xcopa_it,
+ xcopa_id,
+ xcopa_qu,
+ xcopa_sw,
+ xcopa_zh,
+ xcopa_ta,
+ xcopa_th,
+ xcopa_tr,
+ xcopa_vi,
+]
diff --git a/src/lighteval/tasks/tasks/xstory_cloze.py b/src/lighteval/tasks/tasks/xstory_cloze.py
new file mode 100644
index 000000000..96caef9b5
--- /dev/null
+++ b/src/lighteval/tasks/tasks/xstory_cloze.py
@@ -0,0 +1,215 @@
+"""
+name:
+Xstory Cloze
+
+dataset:
+juletxara/xstory_cloze
+
+abstract:
+XStoryCloze consists of professional translations of the English StoryCloze
+dataset (Spring 2016 version) into 10 non-English languages. This dataset was
+released by Meta AI.
+
+languages:
+english, russian, chinese, spanish, arabic, hindi, indonesian, telugu, swahili, basque, burmese
+
+tags:
+multilingual, narrative, reasoning
+
+paper:
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+xstory_cloze_en = LightevalTaskConfig(
+ name="xstory_cloze:en",
+ suite=["lighteval"],
+ prompt_function=prompt.storycloze,
+ hf_repo="juletxara/xstory_cloze",
+ hf_subset="en",
+ hf_avail_splits=["training", "eval"],
+ evaluation_splits=["eval"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xstory_cloze_ru = LightevalTaskConfig(
+ name="xstory_cloze:ru",
+ suite=["lighteval"],
+ prompt_function=prompt.storycloze,
+ hf_repo="juletxara/xstory_cloze",
+ hf_subset="ru",
+ hf_avail_splits=["training", "eval"],
+ evaluation_splits=["eval"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xstory_cloze_zh = LightevalTaskConfig(
+ name="xstory_cloze:zh",
+ suite=["lighteval"],
+ prompt_function=prompt.storycloze,
+ hf_repo="juletxara/xstory_cloze",
+ hf_subset="zh",
+ hf_avail_splits=["training", "eval"],
+ evaluation_splits=["eval"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xstory_cloze_es = LightevalTaskConfig(
+ name="xstory_cloze:es",
+ suite=["lighteval"],
+ prompt_function=prompt.storycloze,
+ hf_repo="juletxara/xstory_cloze",
+ hf_subset="es",
+ hf_avail_splits=["training", "eval"],
+ evaluation_splits=["eval"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xstory_cloze_ar = LightevalTaskConfig(
+ name="xstory_cloze:ar",
+ suite=["lighteval"],
+ prompt_function=prompt.storycloze,
+ hf_repo="juletxara/xstory_cloze",
+ hf_subset="ar",
+ hf_avail_splits=["training", "eval"],
+ evaluation_splits=["eval"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xstory_cloze_hi = LightevalTaskConfig(
+ name="xstory_cloze:hi",
+ suite=["lighteval"],
+ prompt_function=prompt.storycloze,
+ hf_repo="juletxara/xstory_cloze",
+ hf_subset="hi",
+ hf_avail_splits=["training", "eval"],
+ evaluation_splits=["eval"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xstory_cloze_id = LightevalTaskConfig(
+ name="xstory_cloze:id",
+ suite=["lighteval"],
+ prompt_function=prompt.storycloze,
+ hf_repo="juletxara/xstory_cloze",
+ hf_subset="id",
+ hf_avail_splits=["training", "eval"],
+ evaluation_splits=["eval"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xstory_cloze_te = LightevalTaskConfig(
+ name="xstory_cloze:te",
+ suite=["lighteval"],
+ prompt_function=prompt.storycloze,
+ hf_repo="juletxara/xstory_cloze",
+ hf_subset="te",
+ hf_avail_splits=["training", "eval"],
+ evaluation_splits=["eval"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xstory_cloze_sw = LightevalTaskConfig(
+ name="xstory_cloze:sw",
+ suite=["lighteval"],
+ prompt_function=prompt.storycloze,
+ hf_repo="juletxara/xstory_cloze",
+ hf_subset="sw",
+ hf_avail_splits=["training", "eval"],
+ evaluation_splits=["eval"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xstory_cloze_eu = LightevalTaskConfig(
+ name="xstory_cloze:eu",
+ suite=["lighteval"],
+ prompt_function=prompt.storycloze,
+ hf_repo="juletxara/xstory_cloze",
+ hf_subset="eu",
+ hf_avail_splits=["training", "eval"],
+ evaluation_splits=["eval"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xstory_cloze_my = LightevalTaskConfig(
+ name="xstory_cloze:my",
+ suite=["lighteval"],
+ prompt_function=prompt.storycloze,
+ hf_repo="juletxara/xstory_cloze",
+ hf_subset="my",
+ hf_avail_splits=["training", "eval"],
+ evaluation_splits=["eval"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ xstory_cloze_en,
+ xstory_cloze_ru,
+ xstory_cloze_zh,
+ xstory_cloze_es,
+ xstory_cloze_ar,
+ xstory_cloze_hi,
+ xstory_cloze_id,
+ xstory_cloze_te,
+ xstory_cloze_sw,
+ xstory_cloze_eu,
+ xstory_cloze_my,
+]
diff --git a/src/lighteval/tasks/tasks/xwinograd.py b/src/lighteval/tasks/tasks/xwinograd.py
new file mode 100644
index 000000000..c692c5803
--- /dev/null
+++ b/src/lighteval/tasks/tasks/xwinograd.py
@@ -0,0 +1,129 @@
+"""
+name:
+Xwinograd
+
+dataset:
+Muennighoff/xwinograd
+
+abstract:
+Multilingual Winograd Schema Challenge, as used in "Crosslingual Generalization through Multitask Finetuning".
+
+languages:
+english, french, japanese, portuguese, russian, chinese
+
+tags:
+commonsense, multilingual, reasoning
+
+paper:
+https://arxiv.org/abs/2211.01786
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+xwinograd_en = LightevalTaskConfig(
+ name="xwinograd:en",
+ suite=["lighteval"],
+ prompt_function=prompt.winogrande,
+ hf_repo="Muennighoff/xwinograd",
+ hf_subset="en",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xwinograd_fr = LightevalTaskConfig(
+ name="xwinograd:fr",
+ suite=["lighteval"],
+ prompt_function=prompt.winogrande,
+ hf_repo="Muennighoff/xwinograd",
+ hf_subset="fr",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xwinograd_jp = LightevalTaskConfig(
+ name="xwinograd:jp",
+ suite=["lighteval"],
+ prompt_function=prompt.winogrande,
+ hf_repo="Muennighoff/xwinograd",
+ hf_subset="jp",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xwinograd_pt = LightevalTaskConfig(
+ name="xwinograd:pt",
+ suite=["lighteval"],
+ prompt_function=prompt.winogrande,
+ hf_repo="Muennighoff/xwinograd",
+ hf_subset="pt",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xwinograd_ru = LightevalTaskConfig(
+ name="xwinograd:ru",
+ suite=["lighteval"],
+ prompt_function=prompt.winogrande,
+ hf_repo="Muennighoff/xwinograd",
+ hf_subset="ru",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+xwinograd_zh = LightevalTaskConfig(
+ name="xwinograd:zh",
+ suite=["lighteval"],
+ prompt_function=prompt.winogrande,
+ hf_repo="Muennighoff/xwinograd",
+ hf_subset="zh",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metrics=[Metrics.loglikelihood_acc],
+ stop_sequence=["\n"],
+ version=0,
+)
+
+TASKS_TABLE = [
+ xwinograd_en,
+ xwinograd_fr,
+ xwinograd_jp,
+ xwinograd_pt,
+ xwinograd_ru,
+ xwinograd_zh,
+]
diff --git a/src/lighteval/utils/cache_management.py b/src/lighteval/utils/cache_management.py
index 3e8c0a08a..e5764a04b 100644
--- a/src/lighteval/utils/cache_management.py
+++ b/src/lighteval/utils/cache_management.py
@@ -79,7 +79,6 @@ def __init__(self, model_config: ModelConfig):
Args:
model_config: Configuration for the model being cached
- cache_dir: Directory to store cache files
"""
self.model_config = model_config
self.model_hash = self.get_model_hash(model_config)
@@ -213,7 +212,6 @@ def _load_sample(self, sample: pd.core.series.Series | dict) -> Union[dict, Mode
Args:
sample: Raw sample data from cache, arrives as a dataframe row
- sample_type: Type of sample being loaded
Returns:
Union[dict, ModelResponse]: Loaded sample in appropriate format for processing
@@ -360,7 +358,7 @@ def cached(sampling_method: SamplingMethod = None): # noqa C901
Decorator to cache method results based on Doc inputs.
Args:
- cache_type_name: Type of cache ("tokenization" or "predictions")
+ sampling_method: Sampling method to cache
Usage:
@cached(SamplingMethod.GENERATIVE)
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|arc:challenge|25_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|arc:challenge|25_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index df81532e4..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|arc:challenge|25_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d2dce4416d022cb704a77d63dcbacc99e148cb598186f88f33e7b1c5c019335e
-size 87199
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|hellaswag|10_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|hellaswag|10_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 9f9639216..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|hellaswag|10_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8ac904dbbbd26b93de90df7400242713a359207985d5f4c4f75d31ee9bb3325f
-size 106015
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|mmlu:college_chemistry|5_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|mmlu:college_chemistry|5_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 86eb5a1ce..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|mmlu:college_chemistry|5_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e52b3dd01e79fa7028396bad84f6fba4d653fe6ede17a74cf1829115f809fdbe
-size 36114
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|mmlu:us_foreign_policy|5_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|mmlu:us_foreign_policy|5_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index f51f7ad89..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|mmlu:us_foreign_policy|5_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:73de608e18e75e21cd832c09aecd13f6e7a0dbb91f113cb4cb6f8984be474d77
-size 36635
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|truthfulqa:mc|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|truthfulqa:mc|0_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 50cc5802f..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|truthfulqa:mc|0_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:dc795a85bcb77084b1275bfadfe2c613a3b44543a6184e3ffd32bc4588d8d64f
-size 25269
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:aqua-rat|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:aqua-rat|0_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 2ca8fcfc0..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:aqua-rat|0_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2e75e6460dd0c3ba833b74c19b4943b1baa0f266e5207895454a54019dc9cbf6
-size 21944
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:aqua-rat|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:aqua-rat|0_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..29fcc86f2
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:aqua-rat|0_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1bb8f6798f1556468a715ef990a090a74149242ca44be87c4908966e7c18f684
+size 21839
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:logiqa-en|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:logiqa-en|0_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 675c2125e..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:logiqa-en|0_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6c96e81a70ef68946e7e83e30a9ef5dd5c04a4e8de215a021de33d4e841ec502
-size 34133
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:logiqa-en|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:logiqa-en|0_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..222e73463
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:logiqa-en|0_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e820d31ec994386562144504b28116960c48ee649fefa887c11cc10a6dc12373
+size 34072
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-ar|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-ar|0_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index b5d4632ed..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-ar|0_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ebf20030a92a27e15144e4f2071c419edafd1ae9d0e8fe7b9bc38a3edf7a181e
-size 30775
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-ar|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-ar|0_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..7cd541d5d
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-ar|0_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f42202e916ecc484879e824801e85d4965cf83b466199241734dfacd7f5f07d
+size 30714
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-lr|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-lr|0_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 811989b76..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-lr|0_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:01db21e17415bb49be149cf25da813faadfb6bac3b127ba246ae3dbcf96685d7
-size 39431
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-lr|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-lr|0_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..41aa908a7
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-lr|0_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f79f38ee2bf762a43bf75326f02fbf373a8b54f004764c51de05805da48378b2
+size 39384
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-rc|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-rc|0_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 670c7475b..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-rc|0_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5ff511fe233f3fa5d057ca06671779dd8acd990c195ac3132636d1612cb17dcd
-size 74222
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-rc|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-rc|0_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..45062f426
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-rc|0_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71090b25c032493e4ec26cada301343397043222143d55525d4049d0cfe2fea2
+size 74176
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en-without-passage|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en-without-passage|0_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index af81308bc..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en-without-passage|0_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c2770719dd0e256dc0634fb9a3b374b085080f76dbaf9b96326dcf2e070d3701
-size 25968
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en-without-passage|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en-without-passage|0_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..33b5e59c5
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en-without-passage|0_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca54ee0507b761db283874619584d9eefde9412cd38f1e158aa2557c2c69e95f
+size 25907
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en|0_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 2c88d4075..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en|0_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b1bf41a41845a4d41b8a5ba28c0117746689fa96143489fe798651bf2af98e5f
-size 72560
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en|0_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..695396792
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en|0_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03a313bf91b1642fc24bb23ef034a851a17d33610bfb3f83de4cc1c33d5d23dd
+size 72493
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|arc:challenge|25_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|arc:challenge|25_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..4ccd4261f
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|arc:challenge|25_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fbb426e5d8f5b54a1d8527a9b6bc7b62e4d4fad5d6b75af1a3af47de816229dd
+size 87676
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:causal_judgment|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:causal_judgment|3_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 712c604c9..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:causal_judgment|3_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:afb32f7ffe8f53a1b892123e8c8f0325830c1703154b1e8ba07786aa32fcf163
-size 46253
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:date_understanding|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:date_understanding|3_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index e9904becd..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:date_understanding|3_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d741c8c198a8ad188da86f6ee5c8795abb1c89665580cec627216b4204e18a17
-size 28804
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:disambiguation_qa|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:disambiguation_qa|3_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index e6d0732ca..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:disambiguation_qa|3_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:209b8b1be20f217a687c9a2ea50e15176bd8df3a62d8e24f20afa371cdaac2da
-size 29675
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:geometric_shapes|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:geometric_shapes|3_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 2b4666c55..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:geometric_shapes|3_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:64228e6c0460d5dbf75dbff6a210db107611314f84df9105f91a17340703386c
-size 31219
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:logical_deduction_five_objects|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:logical_deduction_five_objects|3_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 3f5964fac..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:logical_deduction_five_objects|3_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:417d41730a5dd77c1729df05d1888e6d91f29d641c802bc45bd94c7cccf7581d
-size 33393
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:logical_deduction_seven_objects|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:logical_deduction_seven_objects|3_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 38984c530..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:logical_deduction_seven_objects|3_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b486108ab93f2b274b80cb45ce87da4e09bcab49b02c82f94838246cb1243cb6
-size 36893
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:movie_recommendation|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:movie_recommendation|3_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 868565ed9..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:movie_recommendation|3_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:511eda270bab7771b2697adaaa95aa5eb1a41da1926b51a73272a1104b3025bb
-size 28017
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:navigate|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:navigate|3_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 2158582ff..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:navigate|3_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f7f72df2e5a180fdda15ee2d4a2f23e63d6b5695d4a086fbe7baf55fa5854a74
-size 27629
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:ruin_names|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:ruin_names|3_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 7813c3884..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:ruin_names|3_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:789f8818d20a28f3ae6854a1b472ef6020875b99e217b067f71133ede511599b
-size 26814
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:salient_translation_error_detection|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:salient_translation_error_detection|3_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 6760674a8..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:salient_translation_error_detection|3_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:eba32e4dc54bdc313dd6c5cc9b24250418d9186cebca96e845d2b801750ec84a
-size 48058
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:snarks|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:snarks|3_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 596aa76e3..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:snarks|3_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f4ae6c4b877baa4a127d1e540c3522fe7d016d15e5827be9db5eb1ade50d2a4a
-size 27979
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:temporal_sequences|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:temporal_sequences|3_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 71a4ca996..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:temporal_sequences|3_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4ed5bda45b8bdb868e42361827501fb108304512e5b7a853d8fa3e314162e620
-size 33161
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:tracking_shuffled_objects_five_objects|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:tracking_shuffled_objects_five_objects|3_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index fe0896288..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:tracking_shuffled_objects_five_objects|3_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c65cf6bf80bd1d20420ca0925f120317ddaee59a5f283f1c544acb6b9bcf550f
-size 33631
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:tracking_shuffled_objects_seven_objects|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:tracking_shuffled_objects_seven_objects|3_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 74a321d63..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:tracking_shuffled_objects_seven_objects|3_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5d34487632eb79e9c5a59aa354434b681218e6406b3eb885caf81a735936fae2
-size 36162
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:causal_judgment|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:causal_judgment|3_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..a27f12606
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:causal_judgment|3_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:408adb2cc6ebfd6227c29ae7b36ebaec628d133b7a55fcd62996da1a81b683be
+size 47608
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:date_understanding|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:date_understanding|3_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..5e7551aa2
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:date_understanding|3_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26902afcf065eb91840fbfbe50bef53284141d0c1772c5dce0bb45acfac7dfbf
+size 30056
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:disambiguation_qa|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:disambiguation_qa|3_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..606551571
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:disambiguation_qa|3_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76dfb895bd369d3092b3faf32e52e070a7ac2797e918e6d78f10fe6521fcec73
+size 30982
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:geometric_shapes|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:geometric_shapes|3_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..7719095bb
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:geometric_shapes|3_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:601162ba27b672f1763513b2360846104e673bae46937e1990b0b146187c9e74
+size 32514
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:logical_deduction_five_objects|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:logical_deduction_five_objects|3_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..29cefcae6
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:logical_deduction_five_objects|3_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d97f9b7b0d06000abc67c45eed722c63237057358c603d67bfb9ce7855bffad9
+size 34703
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:logical_deduction_seven_objects|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:logical_deduction_seven_objects|3_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..730f0f472
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:logical_deduction_seven_objects|3_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e5acdff1e58361591fde26d1b3fd422b0be9adad4dfbee98dc211f75cfbb568
+size 38228
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:movie_recommendation|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:movie_recommendation|3_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..46404f494
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:movie_recommendation|3_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b79a11c9981b37b71e306fb7a0e049c1845adc6752f4394f6e7406db27a9c16
+size 29272
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:navigate|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:navigate|3_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..288d2c0e6
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:navigate|3_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a150f48c0928da6642309188a0ab5a89a9bed5eb66c9a9f7b3897f02af239809
+size 28884
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:ruin_names|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:ruin_names|3_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..83d132e37
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:ruin_names|3_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa768f89fd06423d3dad3bf7fd229442eb0d813e8f4c1be94b62a4ee91ce1c0e
+size 28021
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:salient_translation_error_detection|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:salient_translation_error_detection|3_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..d01582b4e
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:salient_translation_error_detection|3_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e6ee64f0595ef3db00de7c43d9e4411d8fe32ae4c1c5b576b713a09448b5038
+size 49390
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:snarks|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:snarks|3_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..84f17cfae
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:snarks|3_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc32011b7f35b96edb89efc0dfa2f2aa56de5b19566ec424427193f72d80424b
+size 29202
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:temporal_sequences|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:temporal_sequences|3_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..6376f53fd
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:temporal_sequences|3_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1f0fecbea584b4617f5c14e577acc2c516ce86a8e45e493be0e47f76c99a3d5
+size 34443
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..ce267d004
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ab2cd33ce068a2f6ec0a3613eb0b26790596e8be0da0491d31e0d0f293f35eb
+size 34896
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..085a59a9e
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e511aedc2c86800f5456a315c8ead57a216a0abab650f58e1282b3f9e96a60c7
+size 37440
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|gsm8k_test|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|gsm8k_test|0_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..5545aa11c
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|gsm8k_test|0_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:215753259adbd35ec5cf0fd30471064017e7f160a49f4b1542d22ccedbbb6f19
+size 35747
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|hellaswag|10_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|hellaswag|10_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..def3e823f
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|hellaswag|10_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:686cde8c82ccaea58035dcc0fd5729b67343af90c02cdac4768c260d13cd6ce0
+size 67303
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|mmlu:college_chemistry|5_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|mmlu:college_chemistry|5_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..3ac277a83
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|mmlu:college_chemistry|5_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:370499a7dbda06de110b28dd4803880a62b63d9f31480463848277a8784250aa
+size 37734
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|mmlu:us_foreign_policy|5_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|mmlu:us_foreign_policy|5_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..11ae5fd8f
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|mmlu:us_foreign_policy|5_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d42481a014c8beeeeb5009809418815652437330b2828a6b3b1f3696c269949
+size 38503
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|truthfulqa:mc|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|truthfulqa:mc|0_2025-10-17T14-08-59.659871.parquet
new file mode 100644
index 000000000..0cfc28382
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|truthfulqa:mc|0_2025-10-17T14-08-59.659871.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4e18a142f8af00c5681c49d0a7b4e0580f1c7096c1b72855ddff29e141620e3
+size 26087
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_test|gsm8k|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_test|gsm8k|0_2025-09-19T14-21-59.670987.parquet
deleted file mode 100644
index 160b3defc..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_test|gsm8k|0_2025-09-19T14-21-59.670987.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7e281554c86326b1f2e05f8c27ef7d58048a2b751a2ceed6c4c79d50ecbbdcab
-size 34833
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|arc:challenge|25_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|arc:challenge|25_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index da0f11a41..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|arc:challenge|25_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c7fe08af0c72407c1997534ac38db74cf716d2a4f6e9fcc9a7e138b8b55b1480
-size 144374
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|hellaswag|10_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|hellaswag|10_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index e1a9adf2c..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|hellaswag|10_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:be5cb187977d6f8a6acdf7712477da51c7cd66e353671f86c5cf8f48ce1b9d61
-size 137038
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|mmlu:college_chemistry|5_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|mmlu:college_chemistry|5_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index eab885a8d..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|mmlu:college_chemistry|5_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6ca8136266ee39de5ed61bfcffdb048d0f71b9428a2c3b78de70e9a5f189a818
-size 53139
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|mmlu:us_foreign_policy|5_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|mmlu:us_foreign_policy|5_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 4be39bbc6..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|mmlu:us_foreign_policy|5_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8a11b96fcc1f22ac5349a9acccb6f45203e01071afc50811a1646388a8d06199
-size 54501
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|truthfulqa:mc|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|truthfulqa:mc|0_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 638aab548..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|truthfulqa:mc|0_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b84277d5f3a97613f4e9f491281c64f2f224d017b99beeb7820ed948cf36d019
-size 31570
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:aqua-rat|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:aqua-rat|0_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 18d340905..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:aqua-rat|0_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:32e3aa399ece1fec63937b28f7058a0f92c2274ecbba0f404c6f6d2118faadfb
-size 26577
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:aqua-rat|0_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:aqua-rat|0_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..d690a4f14
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:aqua-rat|0_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55af4b3a8f20480b118b8697b95b766da6d87db04395141a4ffe750b0adf0e20
+size 26534
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:logiqa-en|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:logiqa-en|0_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index fb6a53e32..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:logiqa-en|0_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d62633ded1b67ed70f538c27f8f8756386d4b707bf7f878a2458d087fe8f3360
-size 45781
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:logiqa-en|0_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:logiqa-en|0_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..67146b758
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:logiqa-en|0_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09856647c8e52b0162bead55c03ec464bd36b4c297a8167bd0a2384ca51cc55a
+size 45739
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-ar|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-ar|0_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 1ebc2067e..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-ar|0_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:757b28842addb90c8278938fec7524f87a1b2b635f5a488b49a22197a9d9d885
-size 50807
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-ar|0_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-ar|0_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..7e438e70f
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-ar|0_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d0988269e97ebec6615ac36e7e72c6a46d513e49dc9d8683a74659acd2dd872
+size 50771
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-lr|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-lr|0_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index ad35380db..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-lr|0_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7f2edfe9a5f7501615b442e7026c6d5f16b0e7e03caf00f4a41846acf3e0ed3e
-size 55855
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-lr|0_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-lr|0_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..43c45d6f3
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-lr|0_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4485ca9806ba31f83fe8e4a411ef9ac14dcf2af7c4b440361c4fed5d3b4c2eb5
+size 55826
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-rc|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-rc|0_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 1b9b46481..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-rc|0_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:561fcf29d4ad4ff8d0f333e888b0cef84c133db009be34b989576d0bb3c78a44
-size 148865
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-rc|0_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-rc|0_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..484870f64
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-rc|0_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:acb6163bd8503121ed2962c1080445976ef5e0fe7820a7c66354cd5984834273
+size 148838
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en-without-passage|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en-without-passage|0_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 958038ad0..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en-without-passage|0_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2d811dc576579af492de475703ddaa40d6bb0db3506facd2679f10de50f608db
-size 32795
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en-without-passage|0_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en-without-passage|0_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..19f2d87a2
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en-without-passage|0_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82ec622a7c7699f78e92bcadf6d3121ad114dca0959131b879b5489936ea6da0
+size 32753
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en|0_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 0b680f7af..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en|0_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d4729a89ab8729d83549ec34ec316b68bcf05fab4111bf8530ab2f7f6f16bc56
-size 110056
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en|0_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en|0_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..f38c24d46
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en|0_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9c225a4b4295ddbcde3df6405c89751025ee910a6a5c55633a51cbb9485ed17
+size 109983
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|arc:challenge|25_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|arc:challenge|25_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..b978eba02
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|arc:challenge|25_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78d7529cb2b80be6022a5b41fa46d12f48a4556ae322c46afe1bb4a393eb7a98
+size 144845
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:causal_judgment|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:causal_judgment|3_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index c5cf55616..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:causal_judgment|3_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d920d6b1d9757af95d515a8435972a667375e13020a1709ab27a203484d04704
-size 70718
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:date_understanding|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:date_understanding|3_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index d4666b2fe..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:date_understanding|3_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:38e56b21e15ca43fad2f286b8b75e7d2b3db729004c4cb825d8609118f194af3
-size 38152
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:disambiguation_qa|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:disambiguation_qa|3_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 2e8b80d83..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:disambiguation_qa|3_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:474a092eb73f0734f2a31b13fee8cd3edcc649c96ed13e054961be22e16efbe5
-size 36972
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:geometric_shapes|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:geometric_shapes|3_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 83ff6841a..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:geometric_shapes|3_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b3d8aea15719f8c31847fe5e415cfcad8f4bb24a9f5a7309b9eb5e74e95a513d
-size 48287
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:logical_deduction_five_objects|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:logical_deduction_five_objects|3_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 17ad7da3b..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:logical_deduction_five_objects|3_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4f5f4943c293cb2472f74030dbfd220eabd0c12d612fa20a0f905ef0a0a6846c
-size 46228
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:logical_deduction_seven_objects|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:logical_deduction_seven_objects|3_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 9eb4ad34f..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:logical_deduction_seven_objects|3_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2c05a9d6d976d4529483fcac90163705fabca22ccdba0b3ee33ad1df44b8c234
-size 54843
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:movie_recommendation|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:movie_recommendation|3_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 9e8068912..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:movie_recommendation|3_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d49cf61fba119a019d8047f64206ce860cb41d70c7a4b85a20e92fdb76b9c65a
-size 35234
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:navigate|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:navigate|3_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 2aca5e3bd..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:navigate|3_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:25cfadaf467f2850cee53b89ca1c05b8491f3f9d54612e96d113c9b9e0ca5fae
-size 33264
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:ruin_names|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:ruin_names|3_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 761b290f1..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:ruin_names|3_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:218ef4b465e8f164df7cce40c9ea367596165dfa1f392f56ba2029a36430556d
-size 33280
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:salient_translation_error_detection|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:salient_translation_error_detection|3_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 506566766..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:salient_translation_error_detection|3_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f2066ffecda60170f7d6e65384899fea4d3232011e5803e5f0d72b8159f8dd2e
-size 67823
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:snarks|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:snarks|3_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 3bf51107e..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:snarks|3_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1e8e1e9cefafc6872cee5ab021f5b418d2738b555b1ac7d0caaaa7ddbe1c84df
-size 36628
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:temporal_sequences|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:temporal_sequences|3_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 69e6f60bb..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:temporal_sequences|3_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4e7f092f6994c6e18349bdb3c489c059eee371c90f1a6d250495d9f7255db75e
-size 49007
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:tracking_shuffled_objects_five_objects|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:tracking_shuffled_objects_five_objects|3_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 0e86bb133..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:tracking_shuffled_objects_five_objects|3_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:49b6cab428aa555786fb5d74d6d91699f9246d8a0c7ff2d7dee4bb9621f5b9b2
-size 51220
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:tracking_shuffled_objects_seven_objects|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:tracking_shuffled_objects_seven_objects|3_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index 915319abc..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:tracking_shuffled_objects_seven_objects|3_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0442fff2fb12229444bfeb0fa4ccc8a9d73455b5494aed31b6c4b91950cdadf7
-size 58577
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:causal_judgment|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:causal_judgment|3_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..b5174abd1
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:causal_judgment|3_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8e4e4047e4b3bef68e96d106b404d5da844c254c4021c155159cfd00aebc036
+size 72102
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:date_understanding|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:date_understanding|3_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..968be4faa
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:date_understanding|3_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:732966b04a49242e642d06de47b15ca4a7fce1b52bf103baed843c29cc878d4e
+size 39473
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:disambiguation_qa|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:disambiguation_qa|3_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..9d8554d2d
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:disambiguation_qa|3_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7ce68d0631ee4707f57bd0848e86a544c70bc2268c08fbe24275cb47921d11f
+size 38313
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:geometric_shapes|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:geometric_shapes|3_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..21e80f4c7
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:geometric_shapes|3_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:549b13758170f710449b845b8c0bd3bc2a9eb8fab9c4a91751fb38830082ef8b
+size 49621
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:logical_deduction_five_objects|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:logical_deduction_five_objects|3_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..f051f91a5
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:logical_deduction_five_objects|3_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d33f3c199c57fd2ee607174043b2087ee26da4f27ef68cad8e81c133d85f5dad
+size 47607
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:logical_deduction_seven_objects|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:logical_deduction_seven_objects|3_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..4c8814b87
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:logical_deduction_seven_objects|3_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8af05943bd9fa01e2fe4f1fb082d0919c266c8ec478c8259577d0def03f45103
+size 56216
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:movie_recommendation|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:movie_recommendation|3_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..3c0ea7eaf
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:movie_recommendation|3_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8fbcafe67f79eaf7433c7a87c2bba773340ab6aa7872400ea993da1dff9e531
+size 36552
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:navigate|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:navigate|3_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..abfe874a3
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:navigate|3_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5be905adee2d8cee7e8d66441456c225c901e67746679ac80c6bc7f3763ff167
+size 34588
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:ruin_names|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:ruin_names|3_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..aa6142ea9
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:ruin_names|3_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a332d10b713b0995a11b8da1c8a17644261fccc79a0a19de343580e276842713
+size 34561
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:salient_translation_error_detection|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:salient_translation_error_detection|3_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..0974ffb0c
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:salient_translation_error_detection|3_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a4a9eb97df91f315104a89a71c3e3221ffcd97cd839b15bf7bcc060eaf25e8d
+size 69190
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:snarks|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:snarks|3_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..cdcc1db3d
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:snarks|3_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f41fc4e7250aa9fb05b4122ef062f27403c99d8e4960c3fde4072aede655563d
+size 37908
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:temporal_sequences|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:temporal_sequences|3_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..ffbb3e29d
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:temporal_sequences|3_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2c3985c35dce91ec1aba39f67ab5252af0663cd3f9664326498cbbfd753864a
+size 50327
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..1f1896d8e
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9660e3a3836b705e44922972cf7fe8cc1fd44bd16822892cc6706b3aa07590d
+size 52546
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..ead4ecd0b
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:72287e782eefe8ef33278ce582a6d163e47e6b839dcb2bd4b031c58ff8d0b154
+size 59891
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|gsm8k_test|0_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|gsm8k_test|0_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..10071540f
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|gsm8k_test|0_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b97852148b8779de9185c1dfe506d104d98d1a5f06369614c188a023d5ab6b5e
+size 39107
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|hellaswag|10_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|hellaswag|10_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..127d5518e
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|hellaswag|10_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2aecc616ade5f82ca78d39b65743eb5890c671d83db6c274972d507a8fc997a4
+size 88652
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|mmlu:college_chemistry|5_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|mmlu:college_chemistry|5_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..1dbd0c716
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|mmlu:college_chemistry|5_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d663a38bb9208a98b2839093275ed9a1b0e8312d1308e0eace94a616191b79b1
+size 51027
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|mmlu:us_foreign_policy|5_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|mmlu:us_foreign_policy|5_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..94fa4337c
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|mmlu:us_foreign_policy|5_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:897d4d7063e681f928f709bff3ec8b2ace2566fd70faf812fe74e6cd65582785
+size 52560
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|truthfulqa:mc|0_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|truthfulqa:mc|0_2025-10-17T14-03-07.927732.parquet
new file mode 100644
index 000000000..48e6d2807
--- /dev/null
+++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|truthfulqa:mc|0_2025-10-17T14-03-07.927732.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b879e37019beb40032695a9e0a63d9d60ce571d601eca8f356cec2165c1962a
+size 32420
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_test|gsm8k|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_test|gsm8k|0_2025-09-19T14-18-26.717757.parquet
deleted file mode 100644
index a95529696..000000000
--- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_test|gsm8k|0_2025-09-19T14-18-26.717757.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1122709febbfe4d9b3aefc6914eb43a4571611c67b37a2be79cc91d7b936150c
-size 38168
diff --git a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json
index e3cf75ccc..f35ac4d17 100644
--- a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json
+++ b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:6246068f1967408620b2f128c4b1e994d4afa3165f5ea2f59529073869dde29b
-size 51794
+oid sha256:063f2cbdc1f8f85147534dd590a5139b1f815e580771b353ee76c5b7672ff545
+size 46217
diff --git a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json
index fd40b5b92..26e304bcb 100644
--- a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json
+++ b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:d31bb1623784ef37efd4f90f39d6e662bdb139f6ac53a00d731c98a8b546de1f
-size 51893
+oid sha256:16a8bec22d5ebaf5064c6c9a6ca03e6009d36df6598a0fe3470c84f3914340df
+size 46345
diff --git a/tests/slow_tests/sample_comparison.py b/tests/slow_tests/sample_comparison.py
index 02237a1c1..d525bc948 100644
--- a/tests/slow_tests/sample_comparison.py
+++ b/tests/slow_tests/sample_comparison.py
@@ -37,29 +37,6 @@ def _to_plain_list(value):
return new_value
-def _logprobs_approximately_equal(current_logprobs, reference_logprobs):
- """Check if logprobs are sorted in the same order.
- for example:
- current_logprobs = [1.1, 2.1, 3.1]
- reference_logprobs = [1.0, 2.0, 3.0]
- should return True
- """
- if current_logprobs is None and reference_logprobs is None:
- return True
- if current_logprobs is None or reference_logprobs is None:
- return False
-
- current_logprobs = _to_plain_list(current_logprobs)
- reference_logprobs = _to_plain_list(reference_logprobs)
-
- # Check if both lists have the same ordering
- # Convert to relative ordering: 0 for smallest, 1 for second smallest, etc.
- current_indices = sorted(range(len(current_logprobs)), key=lambda i: current_logprobs[i])
- reference_indices = sorted(range(len(reference_logprobs)), key=lambda i: reference_logprobs[i])
-
- return current_indices == reference_indices
-
-
def load_sample_details(details_dir: str):
"""Load sample-level details from parquet files in the details directory."""
details = {}
@@ -115,12 +92,15 @@ def _compare_metrics(current, reference):
reference_metrics = reference["metric"]
metric_diffs = {}
- for metric_name in set(current_metrics.keys()) | set(reference_metrics.keys()):
+        for metric_name in set(current_metrics.keys()) & set(reference_metrics.keys()):
             current_val = current_metrics.get(metric_name)
             reference_val = reference_metrics.get(metric_name)

             if not math.isclose(current_val, reference_val, abs_tol=0.05):
                 metric_diffs[metric_name] = {"current": current_val, "reference": reference_val}
if metric_diffs:
sample_diff["metric_differences"] = metric_diffs
diff --git a/tests/unit/metrics/test_metric_requests.py b/tests/unit/metrics/test_metric_requests.py
index e7f9ee473..7c383c737 100644
--- a/tests/unit/metrics/test_metric_requests.py
+++ b/tests/unit/metrics/test_metric_requests.py
@@ -25,9 +25,9 @@
from lighteval.metrics.normalizations import LogProbPMINorm
from lighteval.metrics.utils.metric_utils import Metric
from lighteval.models.model_output import ModelResponse
-from lighteval.tasks.default_tasks import xstory_cloze_en_lighteval
from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig
from lighteval.tasks.requests import Doc
+from lighteval.tasks.tasks.xstory_cloze import xstory_cloze_en
from tests.utils import FakeModel, fake_evaluate_task
@@ -48,9 +48,9 @@ def get_pmi_task(metrics: list[Metric]):
metrics=metrics,
suite=["test"],
prompt_function=dummy_prompt_fc,
- hf_repo=xstory_cloze_en_lighteval.hf_repo,
- hf_subset=xstory_cloze_en_lighteval.hf_subset,
- evaluation_splits=xstory_cloze_en_lighteval.evaluation_splits,
+ hf_repo=xstory_cloze_en.hf_repo,
+ hf_subset=xstory_cloze_en.hf_subset,
+ evaluation_splits=xstory_cloze_en.evaluation_splits,
)
# This is manually edited when updating the config and in the post init function
# - we need to get a more homogeneous system for naming...
diff --git a/tests/unit/pipeline/test_reasoning_tags.py b/tests/unit/pipeline/test_reasoning_tags.py
index f772970c4..00fa00d78 100644
--- a/tests/unit/pipeline/test_reasoning_tags.py
+++ b/tests/unit/pipeline/test_reasoning_tags.py
@@ -22,9 +22,7 @@
import tempfile
import unittest
-from pathlib import Path
-from types import ModuleType
-from typing import Optional, Union
+from typing import Optional
from unittest.mock import patch
from lighteval.logging.evaluation_tracker import EvaluationTracker
@@ -96,7 +94,7 @@ def download_dataset_worker(task) -> None:
class FakeRegistry(Registry):
def __init__(
- self, tasks: Optional[str] = None, custom_tasks: Optional[Union[str, Path, ModuleType]] = None
+ self, tasks: Optional[str] = None, load_multilingual: bool = False, custom_tasks: Optional[str] = None
):
self.tasks_list = [input_task_name]
# suite_name, task_name, few_shot = input_task_name.split("|")
diff --git a/tests/unit/tasks/test_registry.py b/tests/unit/tasks/test_registry.py
index 377ea7d6c..bbbd32dc8 100644
--- a/tests/unit/tasks/test_registry.py
+++ b/tests/unit/tasks/test_registry.py
@@ -26,51 +26,6 @@
from lighteval.tasks.registry import Registry
-TASKS_TABLE = [
- LightevalTaskConfig(
- name="test_task_revision",
- # Won't be called, so it can be anything
- prompt_function=lambda x: x, # type: ignore
- hf_repo="test",
- hf_subset="default",
- evaluation_splits=["train"],
- metrics=[],
- )
-]
-
-TASKS_GROUPS = {
- "zero_and_one": "custom|test_task_revision|0,custom|test_task_revision|1",
- "all_mmlu": "original|mmlu|3",
-}
-
-
-def test_custom_task_groups():
- """
- Tests that task info selector correctly handles custom task groups.
- """
- registry = Registry(tasks="zero_and_one", custom_tasks="tests.unit.tasks.test_registry")
-
- assert set(registry.tasks_list) == {"custom|test_task_revision|0", "custom|test_task_revision|1"}
-
- assert set(registry.task_to_configs.keys()) == {"custom|test_task_revision"}
-
- task_info: list[LightevalTaskConfig] = registry.task_to_configs["custom|test_task_revision"]
- assert {task_info[0].num_fewshots, task_info[1].num_fewshots} == {0, 1}
-
-
-def test_custom_tasks():
- """
- Tests that task info selector correctly handles custom tasks.
- """
- registry = Registry(tasks="custom|test_task_revision|0", custom_tasks="tests.unit.tasks.test_registry")
-
- assert registry.tasks_list == ["custom|test_task_revision|0"]
- assert set(registry.task_to_configs.keys()) == {"custom|test_task_revision"}
-
- task_info: list[LightevalTaskConfig] = registry.task_to_configs["custom|test_task_revision"]
- assert task_info[0].num_fewshots == 0
-
-
def test_superset_expansion():
"""
Tests that task info selector correctly handles supersets.
@@ -92,13 +47,13 @@ def test_superset_with_subset_task():
"""
Tests that task info selector correctly handles if both superset and one of subset tasks are provided.
"""
- registry = Registry(tasks="original|mmlu|3,original|mmlu:abstract_algebra|5")
+ registry = Registry(tasks="lighteval|mmlu|3,lighteval|mmlu:abstract_algebra|5")
# We have all mmlu tasks
- assert set(registry.tasks_list) == {"original|mmlu|3", "original|mmlu:abstract_algebra|5"}
+ assert set(registry.tasks_list) == {"lighteval|mmlu|3", "lighteval|mmlu:abstract_algebra|5"}
assert len(registry.task_to_configs.keys()) == 57
- task_info: list[LightevalTaskConfig] = registry.task_to_configs["original|mmlu:abstract_algebra"]
+ task_info: list[LightevalTaskConfig] = registry.task_to_configs["lighteval|mmlu:abstract_algebra"]
assert {task_info[0].num_fewshots, task_info[1].num_fewshots} == {3, 5}
@@ -133,7 +88,7 @@ def test_task_group_expansion_with_subset_expansion():
"""
Tests that task info selector correctly handles a group with task superset is provided.
"""
- registry = Registry(tasks="all_mmlu", custom_tasks="tests.unit.tasks.test_registry")
+ registry = Registry(tasks="lighteval|mmlu|0")
# We have all mmlu tasks
assert len(registry.task_to_configs.keys()) == 57
@@ -151,11 +106,9 @@ def test_task_duplicates():
"""
Tests that task info selector correctly handles if duplicate tasks are provided.
"""
- registry = Registry(
- tasks="custom|test_task_revision|0,custom|test_task_revision|0", custom_tasks="tests.unit.tasks.test_registry"
- )
+ registry = Registry(tasks="lighteval|storycloze:2016|0,lighteval|storycloze:2016|0")
- assert list(registry.tasks_list) == ["custom|test_task_revision|0"]
+ assert list(registry.tasks_list) == ["lighteval|storycloze:2016|0"]
def test_task_creation():
diff --git a/tests/utils.py b/tests/utils.py
index 3b68dd631..b7ba2a042 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -20,9 +20,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
-from pathlib import Path
-from types import ModuleType
-from typing import Optional, Union
+from typing import Optional
from unittest.mock import patch
from transformers import AutoTokenizer
@@ -108,7 +106,7 @@ def fake_evaluate_task(
# Create a mock Registry class
class FakeRegistry(Registry):
- def __init__(self, tasks: Optional[str], custom_tasks: Optional[Union[str, Path, ModuleType]] = None):
+ def __init__(self, tasks: Optional[str], load_multilingual: bool = False, custom_tasks: Optional[str] = None):
self.tasks_list = [task_name_fs]
self.task_to_configs = {task_name_fs: [lighteval_task.config]}