diff --git a/README.md b/README.md index 8fa4dbe7f..ba5f698b8 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,9 @@ Documentation + + Open Benchmark Index +

--- @@ -39,7 +42,10 @@ sample-by-sample results* to debug and see how your models stack-up. ## Available Tasks -Lighteval supports **7,000+ evaluation tasks** across multiple domains and languages. Here's an overview of some *popular benchmarks*: +Lighteval supports **1000+ evaluation tasks** across multiple domains and +languages. Use [this +space](https://huggingface.co/spaces/SaylorTwift/benchmark_finder) to find what +you need, or, here's an overview of some *popular benchmarks*: ### 📚 **Knowledge** @@ -62,7 +68,7 @@ Lighteval supports **7,000+ evaluation tasks** across multiple domains and langu ### 🌍 **Multilingual Evaluation** - **Cross-lingual**: XTREME, Flores200 (200 languages), XCOPA, XQuAD -- **Language-specific**: +- **Language-specific**: - **Arabic**: ArabicMMLU - **Filipino**: FilBench - **French**: IFEval-fr, GPQA-fr, BAC-fr diff --git a/community_tasks/_template.py b/community_tasks/_template.py deleted file mode 100644 index bfc7de505..000000000 --- a/community_tasks/_template.py +++ /dev/null @@ -1,114 +0,0 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -# ruff: noqa: F405, F403, F401 -""" -Custom evaluation tasks for lighteval. Copy this file and complete it with the info for your task. - -This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. - -Author: -""" - -import numpy as np - -from lighteval.metrics.metrics import SampleLevelMetric -from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.requests import Doc, SamplingMethod - - -# DEFINE YOUR PROMPT FUNCTIONS -# Define as many as you need for your different tasks -def prompt_fn(line, task_name: str = None): - """Defines how to go from a dataset line to a doc object. - Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info - about what this function should do in the README. - """ - return Doc( - task_name=task_name, - query="", - choices=[""], - gold_index=0, - instruction="", - ) - - -# EVAL WITH NO SUBSET ## -# This is how you create a simple task (like hellaswag) which has one single subset -# attached to it, and one evaluation possible. 
-task = LightevalTaskConfig( - name="myothertask", - prompt_function=prompt_fn, # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py - suite=["community"], - hf_repo="", - hf_subset="default", - hf_avail_splits=[], - evaluation_splits=[], - few_shots_split="", - few_shots_select="", - metrics=[], # select your metric in Metrics -) - -# EVALS WITH SUBSET -# This is how you create a subset task (like MMLU), which has several subset -# each being its own evaluation task. - -# fmt: off -SAMPLE_SUBSETS = [] # list of all the subsets to use for this eval -# fmt: on - - -class CustomSubsetTask(LightevalTaskConfig): - def __init__( - self, - name, - hf_subset, - ): - super().__init__( - name=name, - hf_subset=hf_subset, - prompt_function=prompt_fn, # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py - hf_repo="", - metrics=[custom_metric], # select your metric in Metrics or use your custom_metric - hf_avail_splits=[], - evaluation_splits=[], - few_shots_split="", - few_shots_select="", - suite=["community"], - generation_size=-1, - stop_sequence=None, - ) - - -# STORE YOUR EVALS -SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS] -TASKS_TABLE = SUBSET_TASKS + [task] - - -# CUSTOM METRIC IF NEEDED -custom_metric = SampleLevelMetric( - metric_name="my_custom_metric_name", - higher_is_better=True, - category=SamplingMethod.GENERATIVE, # or LOGPROBS, PERPLEXITY, etc. - sample_level_fn=lambda x: x, # how to compute score for one sample - corpus_level_fn=np.mean, # aggregation -) diff --git a/community_tasks/aimo_evals.py b/community_tasks/aimo_evals.py deleted file mode 100644 index 7895cabff..000000000 --- a/community_tasks/aimo_evals.py +++ /dev/null @@ -1,61 +0,0 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
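The removed `community_tasks/_template.py` above showed how a custom `SampleLevelMetric` plugs into a task config via `metrics=[custom_metric]`. For reference, that pattern in isolation looks like the sketch below; the metric name and the identity `sample_level_fn` are the template's own placeholders, not a working scorer.

```python
# Sketch of the custom-metric pattern from the removed community_tasks/_template.py.
# The metric name and the identity sample_level_fn are placeholders to fill in.
import numpy as np

from lighteval.metrics.metrics import SampleLevelMetric
from lighteval.tasks.requests import SamplingMethod

custom_metric = SampleLevelMetric(
    metric_name="my_custom_metric_name",
    higher_is_better=True,
    category=SamplingMethod.GENERATIVE,  # or LOGPROBS, PERPLEXITY, etc.
    sample_level_fn=lambda x: x,  # how to score a single sample
    corpus_level_fn=np.mean,  # how to aggregate sample scores into the reported value
)

# A task config would then reference it via `metrics=[custom_metric]`.
```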
- -# ruff: noqa: F405, F403, F401 -""" -Task to evaluate LLMs on the training set of the Kaggle AIMO competition: https://www.kaggle.com/competitions/ai-mathematical-olympiad-prize -""" - -from lighteval.metrics.metrics import Metrics -from lighteval.metrics.normalizations import math_normalizer -from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.requests import Doc - - -def aimo_prompt(line, task_name: str = None): - return Doc( - task_name=task_name, - choices=[str(line["answer"])], - gold_index=0, - query=line["problem"], - ) - - -task = LightevalTaskConfig( - name="aimo_progress_prize_1", - prompt_function=aimo_prompt, - suite=["community"], - hf_subset="", - hf_repo="lighteval/aimo_progress_prize_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split="train", - few_shots_select="sequential", - metrics=[ - Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}) - ], - generation_size=2048, - stop_sequence=None, -) - -# STORE YOUR EVALS -TASKS_TABLE = [task] diff --git a/community_tasks/oz_evals.py b/community_tasks/oz_evals.py deleted file mode 100644 index 61c762bef..000000000 --- a/community_tasks/oz_evals.py +++ /dev/null @@ -1,87 +0,0 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -# ruff: noqa: F405, F403, F401 -""" -Custom evaluation tasks for lighteval. - -This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. - -OZ Eval (sr. Opšte Znanje Evaluacija) dataset was created for the purposes of evaluating General Knowledge of LLM models in Serbian language. -Data consists of 1k+ high-quality questions and answers which were used as part of entry exams at the Faculty of Philosophy and Faculty of Organizational Sciences, University of Belgrade. -The exams test the General Knowledge of students and were used in the enrollment periods from 2003 to 2024. -For more details and results see: https://huggingface.co/datasets/DjMel/oz-eval -""" - -from lighteval.metrics.metrics import Metrics -from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.requests import Doc - - -def prompt_fn_oz_eval_task(line, task_name: str = None): - query_template = """Pitanje: {question}\n - Ponuđeni odgovori: - A. {choice_a} - B. {choice_b} - C. {choice_c} - D. {choice_d} - E. 
{choice_e} - - Krajnji odgovor:""" - - options = line["options"] - - query = query_template.format( - question=line["questions"], - choice_a=options[0], - choice_b=options[1], - choice_c=options[2], - choice_d=options[3], - choice_e=options[4], - ) - - choices = ["A", "B", "C", "D", "E"] - return Doc( - task_name=task_name, - query=query, - choices=choices, - gold_index=choices.index(line["answer"]), - ) - - -oz_eval_task = LightevalTaskConfig( - name="serbian_evals:oz_task", - prompt_function=prompt_fn_oz_eval_task, - suite=["community"], - hf_repo="DjMel/oz-eval", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - metrics=[Metrics.loglikelihood_acc], - version=0, -) - - -# STORE YOUR EVALS -TASKS_TABLE = [oz_eval_task] diff --git a/community_tasks/slr_bench_requirements.txt b/community_tasks/slr_bench_requirements.txt deleted file mode 100644 index 57953d68e..000000000 --- a/community_tasks/slr_bench_requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -evaluate -swipl diff --git a/docs/source/adding-a-custom-task.mdx b/docs/source/adding-a-custom-task.mdx index a97a0fd42..52e6d4aa2 100644 --- a/docs/source/adding-a-custom-task.mdx +++ b/docs/source/adding-a-custom-task.mdx @@ -2,37 +2,17 @@ Lighteval provides a flexible framework for creating custom evaluation tasks. This guide explains how to create and integrate new tasks into the evaluation system. -## Task Categories - -Before creating a custom task, consider which category it belongs to: - -### Core Evaluations -Core evaluations are evaluations that only require standard logic in their -metrics and processing, and that we will add to our test suite to ensure non-regression through time. They already see high usage in the community. - -### Extended Evaluations -Extended evaluations are evaluations that require custom logic in their -metrics (complex normalization, an LLM as a judge, etc.), that we added to -facilitate the life of users. They already see high usage in the community. - -### Community Evaluations -Community evaluations are submissions by the community of new tasks. - -A popular community evaluation can move to become an extended or core evaluation over time. - -> [!TIP] -> You can find examples of custom tasks in the [community_tasks](https://github.com/huggingface/lighteval/tree/main/community_tasks) directory. - -## Step-by-Step Creation of a Custom Task +## Step-by-Step Creation of a Task > [!WARNING] -> To contribute your custom task to the Lighteval repository, you would first need +> To contribute your task to the Lighteval repository, you would first need > to install the required dev dependencies by running `pip install -e .[dev]` > and then run `pre-commit install` to install the pre-commit hooks. ### Step 1: Create the Task File -First, create a Python file under the `community_tasks` directory. +First, create a Python file or directory under the `src/lighteval/tasks/tasks` directory. +A directory is helpful if you need to split your task into multiple files; just make sure one of the files is named `main.py`.
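As a sketch of what such a module can contain (every name below — the task, prompt function, and dataset repo — is a placeholder, not something added by this diff), following the field layout used by the task configs elsewhere in this PR:

```python
# Hypothetical file: src/lighteval/tasks/tasks/my_task/main.py — all names are placeholders.
from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


def my_task_prompt(line, task_name: str = None):
    # Turn one dataset row into a Doc; gold_index points at the correct choice.
    return Doc(
        task_name=task_name,
        query=line["question"],
        choices=[str(choice) for choice in line["choices"]],
        gold_index=int(line["answer"]),
    )


my_task = LightevalTaskConfig(
    name="my_task",
    suite=["lighteval"],
    prompt_function=my_task_prompt,
    hf_repo="my-org/my-dataset",  # placeholder HF dataset repo
    hf_subset="default",
    hf_avail_splits=["train", "test"],
    evaluation_splits=["test"],
    few_shots_split="train",
    few_shots_select="random_sampling_from_train",
    generation_size=1,
    metrics=[Metrics.loglikelihood_acc],
    stop_sequence=["\n"],
    version=0,
)

TASKS_TABLE = [my_task]
```

The steps that follow in this guide fill in the prompt function, the metrics, and the final run command.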
### Step 2: Define the Prompt Function @@ -135,12 +115,12 @@ class CustomSubsetTask(LightevalTaskConfig): evaluation_splits=["test"], few_shots_split="train", few_shots_select="random_sampling_from_train", - suite=["community"], + suite=["lighteval"], generation_size=256, stop_sequence=["\n", "Question:"], ) -SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS] +SUBSET_TASKS = [CustomSubsetTask(name=f"task:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS] ``` ### Step 5: Add Tasks to the Table @@ -169,7 +149,7 @@ Once your file is created, you can run the evaluation with the following command ```bash lighteval accelerate \ "model_name=HuggingFaceH4/zephyr-7b-beta" \ - "community|{custom_task}|{fewshots}" \ + "lighteval|{task}|{fewshots}" \ --custom-tasks {path_to_your_custom_task_file} ``` @@ -179,12 +159,12 @@ lighteval accelerate \ # Run a custom task with zero-shot evaluation lighteval accelerate \ "model_name=openai-community/gpt2" \ - "community|myothertask|0" \ + "lighteval|myothertask|0" \ --custom-tasks community_tasks/my_custom_task.py # Run a custom task with few-shot evaluation lighteval accelerate \ "model_name=openai-community/gpt2" \ - "community|myothertask|3" \ + "lighteval|myothertask|3" \ --custom-tasks community_tasks/my_custom_task.py ``` diff --git a/docs/source/available-tasks.mdx b/docs/source/available-tasks.mdx index 2acb4ef95..450b7ed49 100644 --- a/docs/source/available-tasks.mdx +++ b/docs/source/available-tasks.mdx @@ -1,8 +1,12 @@ -# Available Tasks -## Discovering Available Tasks + + -### List All Tasks You can get a list of all available tasks by running: @@ -10,8 +14,6 @@ You can get a list of all available tasks by running: lighteval tasks list ``` -This command will display all tasks organized by their suites (e.g., leaderboard, lighteval, community). - ### Inspect Specific Tasks You can inspect a specific task to see its configuration, metrics, and requirements by running: @@ -22,5 +24,5 @@ lighteval tasks inspect For example: ```bash -lighteval tasks inspect "leaderboard|truthfulqa:mc|0" +lighteval tasks inspect "lighteval|truthfulqa:mc|0" ``` diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx index d93af7078..e22ed3223 100644 --- a/docs/source/quicktour.mdx +++ b/docs/source/quicktour.mdx @@ -7,6 +7,16 @@ Lighteval can be used with several different commands, each optimized for different evaluation scenarios. + +## Find your benchmark + + + ## Available Commands ### Evaluation Backends diff --git a/examples/custom_models/google_translate_model.py b/examples/custom_models/google_translate_model.py index 04493fe35..1fe456900 100644 --- a/examples/custom_models/google_translate_model.py +++ b/examples/custom_models/google_translate_model.py @@ -110,7 +110,6 @@ def greedy_until( Args: requests (list[Request]): list of requests containing the context and ending conditions. - override_bs (int, optional): Override the batch size for generation. Defaults to None. Returns: list[ModelResponse]: list of generated responses. 
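The docs above now point everything at the `lighteval` suite and at `lighteval tasks list` / `lighteval tasks inspect`. A rough Python counterpart, using the Registry signature this diff introduces further down (in `main_baseline.py` and `main_tasks.py`), might look like the sketch below; the task string is only an example.

```python
# Sketch only: programmatic equivalent of `lighteval tasks list` / `tasks inspect`.
# Registry no longer takes load_community/load_extended; multilingual loading is opt-in.
from lighteval.tasks.registry import Registry

registry = Registry(
    tasks="lighteval|truthfulqa:mc|0",  # example task string
    custom_tasks=None,  # or the path given to --custom-tasks
    load_multilingual=False,  # mirrors the new --load-tasks-multilingual flag
)

# List every registered task, as `lighteval tasks list` now does (the suites filter is gone).
registry.print_all_tasks()

# Resolve the requested task configs and datasets, as the pipeline does internally.
tasks_dict = registry.load_tasks()
for task_name, task in tasks_dict.items():
    print(task_name, type(task).__name__)
```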
diff --git a/examples/custom_tasks_tests.py b/examples/custom_tasks_tests.py index 34c871cd5..1a189c177 100644 --- a/examples/custom_tasks_tests.py +++ b/examples/custom_tasks_tests.py @@ -26,8 +26,8 @@ gsm8k_test = LightevalTaskConfig( - name="gsm8k", - suite=["test"], + name="gsm8k_test", + suite=["lighteval"], prompt_function=prompt.gsm8k, hf_repo="gsm8k", hf_subset="main", @@ -42,8 +42,8 @@ ) gpqa_diamond_test = LightevalTaskConfig( - name="gpqa:diamond", - suite=["test"], + name="gpqa:diamond_test", + suite=["lighteval"], prompt_function=prompt.gpqa_instruct, hf_repo="Idavidrein/gpqa", hf_subset="gpqa_diamond", diff --git a/examples/test_tasks.txt b/examples/test_tasks.txt index 12c8662a9..14f847f06 100644 --- a/examples/test_tasks.txt +++ b/examples/test_tasks.txt @@ -1,8 +1,8 @@ -leaderboard|arc:challenge|25 -leaderboard|truthfulqa:mc|0 -leaderboard|hellaswag|10 -leaderboard|mmlu:college_chemistry|5 -leaderboard|mmlu:us_foreign_policy|5 +lighteval|arc:challenge|25 +lighteval|truthfulqa:mc|0 +lighteval|hellaswag|10 +lighteval|mmlu:college_chemistry|5 +lighteval|mmlu:us_foreign_policy|5 lighteval|agieval:aqua-rat|0 lighteval|agieval:logiqa-en|0 lighteval|agieval:lsat-ar|0 @@ -10,18 +10,18 @@ lighteval|agieval:lsat-lr|0 lighteval|agieval:lsat-rc|0 lighteval|agieval:sat-en-without-passage|0 lighteval|agieval:sat-en|0 -lighteval|bigbench:causal_judgment|3 -lighteval|bigbench:date_understanding|3 -lighteval|bigbench:disambiguation_qa|3 -lighteval|bigbench:geometric_shapes|3 -lighteval|bigbench:logical_deduction_five_objects|3 -lighteval|bigbench:logical_deduction_seven_objects|3 -lighteval|bigbench:movie_recommendation|3 -lighteval|bigbench:navigate|3 -lighteval|bigbench:ruin_names|3 -lighteval|bigbench:salient_translation_error_detection|3 -lighteval|bigbench:snarks|3 -lighteval|bigbench:temporal_sequences|3 -lighteval|bigbench:tracking_shuffled_objects_five_objects|3 -lighteval|bigbench:tracking_shuffled_objects_seven_objects|3 -test|gsm8k|0 +lighteval|bigbench_hard:causal_judgment|3 +lighteval|bigbench_hard:date_understanding|3 +lighteval|bigbench_hard:disambiguation_qa|3 +lighteval|bigbench_hard:geometric_shapes|3 +lighteval|bigbench_hard:logical_deduction_five_objects|3 +lighteval|bigbench_hard:logical_deduction_seven_objects|3 +lighteval|bigbench_hard:movie_recommendation|3 +lighteval|bigbench_hard:navigate|3 +lighteval|bigbench_hard:ruin_names|3 +lighteval|bigbench_hard:salient_translation_error_detection|3 +lighteval|bigbench_hard:snarks|3 +lighteval|bigbench_hard:temporal_sequences|3 +lighteval|bigbench_hard:tracking_shuffled_objects_five_objects|3 +lighteval|bigbench_hard:tracking_shuffled_objects_seven_objects|3 +lighteval|gsm8k_test|0 diff --git a/pyproject.toml b/pyproject.toml index 45b88d1f2..a89024487 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ line-length = 119 [tool.ruff.lint] # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. # Never enforce `E501` (line length violations). 
-ignore = ["E501", "D100", "D101", "D102", "D103", "D104", "D415", "D105", "DOC501", "DOC201"] +ignore = ["E501", "D100", "D101", "D102", "D103", "D104", "D415", "D105", "DOC501", "DOC201", "CPY001"] select = ["C", "E", "F", "I", "W", "CPY", "D417", "DOC"] preview = true @@ -108,7 +108,8 @@ extended_tasks = [ "langdetect", # ifeval "openai>1.87", # llm as a judge using openai models "tiktoken", - "emoji", "spacy", "syllapy" # ifbench + "emoji", "spacy", "syllapy", # ifbench + "evaluate", # slr_bench ] s3 = ["s3fs"] multilingual = [ diff --git a/src/lighteval/cli_args.py b/src/lighteval/cli_args.py index 30e85a1a9..a8123218f 100644 --- a/src/lighteval/cli_args.py +++ b/src/lighteval/cli_args.py @@ -113,6 +113,16 @@ class Arg: default="[('', '')]", ) +load_tasks_multilingual = Arg( + type=Annotated[ + bool, + Option( + help="Whether to load multilingual tasks.", + rich_help_panel=HELP_PANEL_NAME_1, + ), + ], + default=False, +) # Logging Parameters (HELP_PANEL_NAME_2) output_dir = Arg( diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py index 3eca3b1c5..00fe25676 100644 --- a/src/lighteval/main_accelerate.py +++ b/src/lighteval/main_accelerate.py @@ -31,6 +31,7 @@ dataset_loading_processes, job_id, load_responses_from_details_date_id, + load_tasks_multilingual, max_samples, model_args, num_fewshot_seeds, @@ -59,8 +60,9 @@ def accelerate( # noqa C901 vision_model: Annotated[ bool, Option(help="Use vision model for evaluation.", rich_help_panel=HELP_PANEL_NAME_4) ] = False, - dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, custom_tasks: custom_tasks.type = custom_tasks.default, + load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default, + dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default, remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, @@ -105,9 +107,10 @@ def accelerate( # noqa C901 ) pipeline_params = PipelineParameters( launcher_type=ParallelismManager.ACCELERATE, + custom_tasks_directory=custom_tasks, + load_tasks_multilingual=load_tasks_multilingual, job_id=job_id, dataset_loading_processes=dataset_loading_processes, - custom_tasks_directory=custom_tasks, num_fewshot_seeds=num_fewshot_seeds, max_samples=max_samples, remove_reasoning_tags=remove_reasoning_tags, diff --git a/src/lighteval/main_baseline.py b/src/lighteval/main_baseline.py index f082af726..2ba82095c 100644 --- a/src/lighteval/main_baseline.py +++ b/src/lighteval/main_baseline.py @@ -24,6 +24,7 @@ from lighteval.cli_args import ( custom_tasks, dataset_loading_processes, + load_tasks_multilingual, max_samples, output_dir, tasks, @@ -32,8 +33,9 @@ def baseline( tasks: tasks.type, - custom_tasks: custom_tasks.type = custom_tasks.default, + load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default, dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, + custom_tasks: custom_tasks.type = custom_tasks.default, output_dir: output_dir.type = output_dir.default, max_samples: max_samples.type = max_samples.default, ): @@ -55,7 +57,7 @@ def baseline( from lighteval.tasks.requests import SamplingMethod from lighteval.utils.utils import as_list - registry = Registry(tasks=tasks, 
custom_tasks=custom_tasks) + registry = Registry(tasks=tasks, custom_tasks=custom_tasks, load_multilingual=load_tasks_multilingual) tasks_dict: dict[str, LightevalTask] = registry.load_tasks() evaluation_tracker = EvaluationTracker( diff --git a/src/lighteval/main_custom.py b/src/lighteval/main_custom.py index 1cef8f3dc..e6124ce62 100644 --- a/src/lighteval/main_custom.py +++ b/src/lighteval/main_custom.py @@ -29,6 +29,7 @@ custom_tasks, dataset_loading_processes, job_id, + load_tasks_multilingual, max_samples, num_fewshot_seeds, output_dir, @@ -55,9 +56,10 @@ def custom( model_definition_file_path: Annotated[str, Argument(help="The model definition file path to evaluate")], tasks: tasks.type, # === Common parameters === + load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default, dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, - custom_tasks: custom_tasks.type = custom_tasks.default, num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, + custom_tasks: custom_tasks.type = custom_tasks.default, remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, reasoning_tags: reasoning_tags.type = reasoning_tags.default, # === saving === @@ -102,6 +104,7 @@ def custom( max_samples=max_samples, remove_reasoning_tags=remove_reasoning_tags, reasoning_tags=reasoning_tags, + load_tasks_multilingual=load_tasks_multilingual, ) pipeline = Pipeline( tasks=tasks, diff --git a/src/lighteval/main_endpoint.py b/src/lighteval/main_endpoint.py index 060b93822..ece2ac430 100644 --- a/src/lighteval/main_endpoint.py +++ b/src/lighteval/main_endpoint.py @@ -31,6 +31,7 @@ dataset_loading_processes, job_id, load_responses_from_details_date_id, + load_tasks_multilingual, max_samples, num_fewshot_seeds, output_dir, @@ -65,6 +66,7 @@ def inference_endpoint( ), ] = False, # === Common parameters === + load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default, dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, custom_tasks: custom_tasks.type = custom_tasks.default, num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, @@ -121,6 +123,7 @@ def inference_endpoint( load_responses_from_details_date_id=load_responses_from_details_date_id, remove_reasoning_tags=remove_reasoning_tags, reasoning_tags=reasoning_tags, + load_tasks_multilingual=load_tasks_multilingual, ) pipeline = Pipeline( tasks=tasks, @@ -148,6 +151,7 @@ def tgi( ], tasks: tasks.type, # === Common parameters === + load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default, dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, custom_tasks: custom_tasks.type = custom_tasks.default, num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, @@ -193,6 +197,7 @@ def tgi( pipeline_params = PipelineParameters( launcher_type=parallelism_manager, + load_tasks_multilingual=load_tasks_multilingual, job_id=job_id, dataset_loading_processes=dataset_loading_processes, custom_tasks_directory=custom_tasks, @@ -231,9 +236,10 @@ def litellm( ], tasks: tasks.type, # === Common parameters === + load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default, dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, - custom_tasks: custom_tasks.type = custom_tasks.default, num_fewshot_seeds: num_fewshot_seeds.type = 
num_fewshot_seeds.default, + custom_tasks: custom_tasks.type = custom_tasks.default, load_responses_from_details_date_id: load_responses_from_details_date_id.type = load_responses_from_details_date_id.default, remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, reasoning_tags: reasoning_tags.type = reasoning_tags.default, @@ -285,6 +291,7 @@ def litellm( pipeline_params = PipelineParameters( launcher_type=parallelism_manager, + load_tasks_multilingual=load_tasks_multilingual, job_id=job_id, dataset_loading_processes=dataset_loading_processes, custom_tasks_directory=custom_tasks, @@ -324,6 +331,7 @@ def inference_providers( ], tasks: tasks.type, # === Common parameters === + load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default, dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, custom_tasks: custom_tasks.type = custom_tasks.default, num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, @@ -373,6 +381,7 @@ def inference_providers( pipeline_params = PipelineParameters( launcher_type=parallelism_manager, + load_tasks_multilingual=load_tasks_multilingual, job_id=job_id, dataset_loading_processes=dataset_loading_processes, custom_tasks_directory=custom_tasks, diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index b844a74a4..9399e82cd 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -29,6 +29,7 @@ from yaml import SafeLoader from lighteval.cli_args import ( + load_tasks_multilingual, reasoning_tags, remove_reasoning_tags, ) @@ -44,6 +45,7 @@ def nanotron( str, Option(help="Path to the nanotron checkpoint YAML or python config file, potentially on s3.") ], lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")], + load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default, remove_reasoning_tags: remove_reasoning_tags.type = remove_reasoning_tags.default, reasoning_tags: reasoning_tags.type = reasoning_tags.default, ): @@ -102,6 +104,7 @@ def nanotron( max_samples=lighteval_config.tasks.max_samples, remove_reasoning_tags=remove_reasoning_tags, reasoning_tags=reasoning_tags, + load_tasks_multilingual=load_tasks_multilingual, ) pipeline = Pipeline( diff --git a/src/lighteval/main_sglang.py b/src/lighteval/main_sglang.py index 0b506988e..ab86349f9 100644 --- a/src/lighteval/main_sglang.py +++ b/src/lighteval/main_sglang.py @@ -25,6 +25,7 @@ dataset_loading_processes, job_id, load_responses_from_details_date_id, + load_tasks_multilingual, max_samples, model_args, num_fewshot_seeds, @@ -47,6 +48,7 @@ def sglang( model_args: model_args.type, tasks: tasks.type, # === Common parameters === + load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default, dataset_loading_processes: dataset_loading_processes.type = dataset_loading_processes.default, custom_tasks: custom_tasks.type = custom_tasks.default, num_fewshot_seeds: num_fewshot_seeds.type = num_fewshot_seeds.default, @@ -91,6 +93,7 @@ def sglang( pipeline_params = PipelineParameters( launcher_type=ParallelismManager.SGLANG, job_id=job_id, + load_tasks_multilingual=load_tasks_multilingual, dataset_loading_processes=dataset_loading_processes, custom_tasks_directory=custom_tasks, num_fewshot_seeds=num_fewshot_seeds, diff --git a/src/lighteval/main_tasks.py b/src/lighteval/main_tasks.py index 62f1129f4..230359730 100644 --- a/src/lighteval/main_tasks.py +++ 
b/src/lighteval/main_tasks.py @@ -25,7 +25,7 @@ from typer import Argument, Option from typing_extensions import Annotated -from lighteval.cli_args import custom_tasks +from lighteval.cli_args import custom_tasks, load_tasks_multilingual app = typer.Typer() @@ -46,7 +46,7 @@ def inspect( from lighteval.tasks.registry import Registry - registry = Registry(custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True) + registry = Registry(custom_tasks=custom_tasks, load_multilingual=True) # Loading task task_dict = registry.load_tasks() @@ -64,19 +64,14 @@ def inspect( @app.command() def list( + load_tasks_multilingual: load_tasks_multilingual.type = load_tasks_multilingual.default, custom_tasks: custom_tasks.type = custom_tasks.default, - suites: Annotated[ - str | None, - Option( - help="Comma-separated list of suites to display (e.g., 'helm,harness'). Use 'all' for all suites. If not specified, shows core suites only." - ), - ] = None, ): """List all tasks""" from lighteval.tasks.registry import Registry - registry = Registry(custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True) - registry.print_all_tasks(suites=suites) + registry = Registry(custom_tasks=custom_tasks, load_multilingual=load_tasks_multilingual) + registry.print_all_tasks() @app.command() diff --git a/src/lighteval/metrics/imports/bert_scorer.py b/src/lighteval/metrics/imports/bert_scorer.py index b8025bf3f..db9c16c34 100644 --- a/src/lighteval/metrics/imports/bert_scorer.py +++ b/src/lighteval/metrics/imports/bert_scorer.py @@ -110,7 +110,7 @@ def get_bert_embedding( Args: all_sens (list of str): sentences to encode. - model: a BERT model from `pytorch_pretrained_bert`. + model: a BERT model. tokenizer: a BERT tokenizer corresponds to `model`. idf_dict (dict): mapping a word piece index to its inverse document frequency. batch_size (int): batch size for processing, -1 for all sentences. @@ -330,7 +330,6 @@ def __init__( `model_type` or `lang`. num_layers (int): The layer of representation to use. Default using the number of layer tuned on WMT16 correlation data. - verbose (bool): Turn on intermediate status update. idf (bool): A boolean to specify whether to use idf or not (this should be True even if `idf_sents` is given). device (str): On which the contextual embedding model will be allocated on. If this argument is None, the model lives on cuda:0 if cuda is available. @@ -340,7 +339,6 @@ def __init__( lang (str): Language of the sentences; has to specify at least one of `model_type` or `lang`. `lang` needs to be specified when `rescale_with_baseline` is True. - return_hash (bool): Return hash code of the setting. rescale_with_baseline (bool): Rescale bertscore with pre-computed baseline. baseline_path (str): Customized baseline file. 
""" diff --git a/src/lighteval/metrics/utils/math_comparison.py b/src/lighteval/metrics/utils/math_comparison.py index 2329acfe0..974d6d2cc 100644 --- a/src/lighteval/metrics/utils/math_comparison.py +++ b/src/lighteval/metrics/utils/math_comparison.py @@ -297,7 +297,7 @@ def is_equation(expr: Basic | MatrixBase) -> bool: Args: expr: The expression to check Returns: - bool: True if expr is an equation, False otherwise + True if expr is an equation, False otherwise """ if isinstance(expr, Eq): return True diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 0f02c4b38..1f5da9c14 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -94,6 +94,7 @@ class PipelineParameters: reasoning_tags: str | list[tuple[str, str]] = "[('', '')]" load_responses_from_details_date_id: str | None = None bootstrap_iters: int = 1000 + load_tasks_multilingual: bool = False def __post_init__(self): # noqa C901 if not isinstance(self.reasoning_tags, list): @@ -210,7 +211,11 @@ def _init_tasks_and_requests(self, tasks: str): logger.info("--- LOADING TASKS ---") # The registry contains all the potential tasks - self.registry = Registry(tasks=tasks, custom_tasks=self.pipeline_parameters.custom_tasks_directory) + self.registry = Registry( + tasks=tasks, + load_multilingual=self.pipeline_parameters.load_tasks_multilingual, + custom_tasks=self.pipeline_parameters.custom_tasks_directory, + ) # load the tasks from the configs and their datasets self.tasks_dict: dict[str, LightevalTask] = self.registry.load_tasks() diff --git a/src/lighteval/tasks/__init__.py b/src/lighteval/tasks/__init__.py index a732db8d0..e3e34484b 100644 --- a/src/lighteval/tasks/__init__.py +++ b/src/lighteval/tasks/__init__.py @@ -19,3 +19,8 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. + +""" +Automatically imports all task configs from the tasks/ directory. +This module dynamically loads all Python files in tasks/ and exposes their LightevalTaskConfig objects. +""" diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py deleted file mode 100644 index 1c72d5008..000000000 --- a/src/lighteval/tasks/default_tasks.py +++ /dev/null @@ -1,22871 +0,0 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
-import lighteval.tasks.default_prompts as prompt -from lighteval.metrics.metrics import Metrics -from lighteval.metrics.normalizations import ( - LogProbCharNorm, - gsm8k_normalizer, - harness_triviaqa_normalizer, - helm_normalizer, - math_normalizer, -) -from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.templates.qa import get_qa_prompt_function -from lighteval.utils.language import Language - - -mmmu_pro_standard_4_options = LightevalTaskConfig( - name="mmmu_pro:standard-4", - suite=["lighteval"], - prompt_function=prompt.mmmu_pro, - hf_repo="MMMU/MMMU_pro", - hf_subset="standard (4 options)", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, # expected an answer in a format 'Answer: B' - metrics=[Metrics.gpqa_instruct_metric], - stop_sequence=None, - version=0, -) -mmmu_pro_standard_10_options = LightevalTaskConfig( - name="mmmu_pro:standard-10", - suite=["lighteval"], - prompt_function=prompt.mmmu_pro, - hf_repo="MMMU/MMMU_pro", - hf_subset="standard (10 options)", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, # expected an answer in a format 'Answer: B' - metrics=[Metrics.gpqa_instruct_metric], - stop_sequence=None, - version=0, -) -mmmu_pro_vision = LightevalTaskConfig( - name="mmmu_pro:vision", - suite=["lighteval"], - prompt_function=prompt.mmmu_pro_vision, - hf_repo="MMMU/MMMU_pro", - hf_subset="vision", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, # expected an answer in a format 'Answer: B' - metrics=[Metrics.gpqa_instruct_metric], - stop_sequence=None, - version=0, -) -abstract_narrative_understanding_bigbench = LightevalTaskConfig( - name="abstract_narrative_understanding", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="abstract_narrative_understanding", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -agieval_aqua_rat_lighteval = LightevalTaskConfig( - name="agieval:aqua-rat", - suite=["lighteval"], - prompt_function=prompt.agieval, - hf_repo="dmayhem93/agieval-aqua-rat", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=None, - version=0, -) -agieval_gaokao_biology_lighteval = LightevalTaskConfig( - name="agieval:gaokao-biology", - suite=["lighteval"], - prompt_function=prompt.agieval, - hf_repo="dmayhem93/agieval-gaokao-biology", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=None, - version=0, -) -agieval_gaokao_chemistry_lighteval = LightevalTaskConfig( - name="agieval:gaokao-chemistry", - suite=["lighteval"], - prompt_function=prompt.agieval, - 
hf_repo="dmayhem93/agieval-gaokao-chemistry", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=None, - version=0, -) -agieval_gaokao_chinese_lighteval = LightevalTaskConfig( - name="agieval:gaokao-chinese", - suite=["lighteval"], - prompt_function=prompt.agieval, - hf_repo="dmayhem93/agieval-gaokao-chinese", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=None, - version=0, -) -agieval_gaokao_english_lighteval = LightevalTaskConfig( - name="agieval:gaokao-english", - suite=["lighteval"], - prompt_function=prompt.agieval, - hf_repo="dmayhem93/agieval-gaokao-english", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=None, - version=0, -) -agieval_gaokao_geography_lighteval = LightevalTaskConfig( - name="agieval:gaokao-geography", - suite=["lighteval"], - prompt_function=prompt.agieval, - hf_repo="dmayhem93/agieval-gaokao-geography", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=None, - version=0, -) -agieval_gaokao_history_lighteval = LightevalTaskConfig( - name="agieval:gaokao-history", - suite=["lighteval"], - prompt_function=prompt.agieval, - hf_repo="dmayhem93/agieval-gaokao-history", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=None, - version=0, -) -agieval_gaokao_mathqa_lighteval = LightevalTaskConfig( - name="agieval:gaokao-mathqa", - suite=["lighteval"], - prompt_function=prompt.agieval, - hf_repo="dmayhem93/agieval-gaokao-mathqa", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=None, - version=0, -) -agieval_gaokao_physics_lighteval = LightevalTaskConfig( - name="agieval:gaokao-physics", - suite=["lighteval"], - prompt_function=prompt.agieval, - hf_repo="dmayhem93/agieval-gaokao-physics", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[ - 
Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=None, - version=0, -) -agieval_logiqa_en_lighteval = LightevalTaskConfig( - name="agieval:logiqa-en", - suite=["lighteval"], - prompt_function=prompt.agieval, - hf_repo="dmayhem93/agieval-logiqa-en", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=None, - version=0, -) -agieval_logiqa_zh_lighteval = LightevalTaskConfig( - name="agieval:logiqa-zh", - suite=["lighteval"], - prompt_function=prompt.agieval, - hf_repo="dmayhem93/agieval-logiqa-zh", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=None, - version=0, -) -agieval_lsat_ar_lighteval = LightevalTaskConfig( - name="agieval:lsat-ar", - suite=["lighteval"], - prompt_function=prompt.agieval, - hf_repo="dmayhem93/agieval-lsat-ar", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=None, - version=0, -) -agieval_lsat_lr_lighteval = LightevalTaskConfig( - name="agieval:lsat-lr", - suite=["lighteval"], - prompt_function=prompt.agieval, - hf_repo="dmayhem93/agieval-lsat-lr", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=None, - version=0, -) -agieval_lsat_rc_lighteval = LightevalTaskConfig( - name="agieval:lsat-rc", - suite=["lighteval"], - prompt_function=prompt.agieval, - hf_repo="dmayhem93/agieval-lsat-rc", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=None, - version=0, -) -agieval_sat_en_lighteval = LightevalTaskConfig( - name="agieval:sat-en", - suite=["lighteval"], - prompt_function=prompt.agieval, - hf_repo="dmayhem93/agieval-sat-en", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=None, - version=0, -) -agieval_sat_en_without_passage_lighteval = LightevalTaskConfig( - name="agieval:sat-en-without-passage", - suite=["lighteval"], - prompt_function=prompt.agieval, - 
hf_repo="dmayhem93/agieval-sat-en-without-passage", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=None, - version=0, -) -agieval_sat_math_lighteval = LightevalTaskConfig( - name="agieval:sat-math", - suite=["lighteval"], - prompt_function=prompt.agieval, - hf_repo="dmayhem93/agieval-sat-math", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=None, - version=0, -) -aime24 = LightevalTaskConfig( - name="aime24", - suite=["lighteval"], - prompt_function=prompt.aime_prompt_fn, - hf_repo="HuggingFaceH4/aime_2024", - hf_subset="default", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.pass_at_k_math(sample_params={"k": 1}), Metrics.avg_at_n_math(sample_params={"n": 1})], - version=2, -) -aime24_avg = LightevalTaskConfig( - name="aime24_avg", - suite=["lighteval"], - prompt_function=prompt.aime_prompt_fn, - hf_repo="HuggingFaceH4/aime_2024", - hf_subset="default", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.avg_at_n_math(sample_params={"n": 64})], - version=2, -) -aime24_gpassk = LightevalTaskConfig( - name="aime24_gpassk", - suite=["lighteval"], - prompt_function=prompt.aime_prompt_fn, - hf_repo="HuggingFaceH4/aime_2024", - hf_subset="default", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=8192, - metrics=[Metrics.g_pass_at_k_math(sample_params={"k": 16, "n": 48})], - version=1, -) -aime25 = LightevalTaskConfig( - name="aime25", - suite=["lighteval"], - prompt_function=prompt.aime_prompt_fn, - hf_repo="yentinglin/aime_2025", - hf_subset="default", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=10000, - metrics=[Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1})], - version=2, -) -aime25_gpassk = LightevalTaskConfig( - name="aime25_gpassk", - suite=["lighteval"], - prompt_function=prompt.aime_prompt_fn, - hf_repo="yentinglin/aime_2025", - hf_subset="default", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=8192, - metrics=[Metrics.g_pass_at_k_math(sample_params={"k": 16, "n": 48})], - version=1, -) -anachronisms_bigbench = LightevalTaskConfig( - name="anachronisms", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="anachronisms", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -analogical_similarity_bigbench = LightevalTaskConfig( - name="analogical_similarity", - suite=["bigbench", "bigbench_json"], - 
prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="analogical_similarity", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -analytic_entailment_bigbench = LightevalTaskConfig( - name="analytic_entailment", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="analytic_entailment", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -anli_r1_lighteval = LightevalTaskConfig( - name="anli:r1", - suite=["lighteval", "anli"], - prompt_function=prompt.anli, - hf_repo="anli", - hf_subset="plain_text", - hf_avail_splits=["train_r1", "dev_r1", "test_r1"], - evaluation_splits=["test_r1"], - few_shots_split="train_r1", - few_shots_select="random_sampling_from_train", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -anli_r2_lighteval = LightevalTaskConfig( - name="anli:r2", - suite=["lighteval", "anli"], - prompt_function=prompt.anli, - hf_repo="anli", - hf_subset="plain_text", - hf_avail_splits=["train_r2", "dev_r2", "test_r2"], - evaluation_splits=["test_r2"], - few_shots_split="train_r2", - few_shots_select="random_sampling_from_train", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -anli_r3_lighteval = LightevalTaskConfig( - name="anli:r3", - suite=["lighteval", "anli"], - prompt_function=prompt.anli, - hf_repo="anli", - hf_subset="plain_text", - hf_avail_splits=["train_r3", "dev_r3", "test_r3"], - evaluation_splits=["test_r3"], - few_shots_split="train_r3", - few_shots_select="random_sampling_from_train", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -arc_agi_2 = LightevalTaskConfig( - name="arc_agi_2", - suite=["lighteval"], - prompt_function=prompt.arc_agi_2, - hf_repo="arc-agi-community/arc-agi-2", - hf_subset="default", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=2048, - metrics=[Metrics.exact_match], - stop_sequence=None, - version=0, -) -arc_c_letters_original = LightevalTaskConfig( - name="arc:c:letters", - suite=["original", "arc"], - prompt_function=prompt.arc_with_options_letters_predict, - hf_repo="ai2_arc", - hf_subset="ARC-Challenge", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.exact_match], - stop_sequence=["\n"], - version=0, -) -arc_c_options_original = LightevalTaskConfig( - name="arc:c:options", - suite=["original", "arc"], - prompt_function=prompt.arc_with_options, - hf_repo="ai2_arc", - hf_subset="ARC-Challenge", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -arc_c_simple_original = LightevalTaskConfig( - name="arc:c:simple", - suite=["original", 
"arc"], - prompt_function=prompt.arc, - hf_repo="ai2_arc", - hf_subset="ARC-Challenge", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -arc_challenge_leaderboard = LightevalTaskConfig( - name="arc:challenge", - suite=["leaderboard", "arc"], - prompt_function=prompt.arc, - hf_repo="ai2_arc", - hf_subset="ARC-Challenge", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling_from_train", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=["\n"], - version=0, -) -arc_easy_lighteval = LightevalTaskConfig( - name="arc:easy", - suite=["lighteval", "arc"], - prompt_function=prompt.arc, - hf_repo="ai2_arc", - hf_subset="ARC-Easy", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling_from_train", - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=["\n"], - version=0, -) -arithmetic_1dc_lighteval = LightevalTaskConfig( - name="arithmetic:1dc", - suite=["lighteval", "arithmetic"], - prompt_function=prompt.arithmetic, - hf_repo="EleutherAI/arithmetic", - hf_subset="arithmetic_1dc", - hf_avail_splits=["validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.acc_golds_likelihood], - stop_sequence=["\n"], - version=0, -) -arithmetic_2da_lighteval = LightevalTaskConfig( - name="arithmetic:2da", - suite=["lighteval", "arithmetic"], - prompt_function=prompt.arithmetic, - hf_repo="EleutherAI/arithmetic", - hf_subset="arithmetic_2da", - hf_avail_splits=["validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.acc_golds_likelihood], - stop_sequence=["\n"], - version=0, -) -arithmetic_2dm_lighteval = LightevalTaskConfig( - name="arithmetic:2dm", - suite=["lighteval", "arithmetic"], - prompt_function=prompt.arithmetic, - hf_repo="EleutherAI/arithmetic", - hf_subset="arithmetic_2dm", - hf_avail_splits=["validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.acc_golds_likelihood], - stop_sequence=["\n"], - version=0, -) -arithmetic_2ds_lighteval = LightevalTaskConfig( - name="arithmetic:2ds", - suite=["lighteval", "arithmetic"], - prompt_function=prompt.arithmetic, - hf_repo="EleutherAI/arithmetic", - hf_subset="arithmetic_2ds", - hf_avail_splits=["validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.acc_golds_likelihood], - stop_sequence=["\n"], - version=0, -) -arithmetic_3da_lighteval = LightevalTaskConfig( - name="arithmetic:3da", - suite=["lighteval", "arithmetic"], - prompt_function=prompt.arithmetic, - hf_repo="EleutherAI/arithmetic", - hf_subset="arithmetic_3da", - hf_avail_splits=["validation"], - evaluation_splits=["validation"], - few_shots_split=None, - 
few_shots_select=None, - generation_size=-1, - metrics=[Metrics.acc_golds_likelihood], - stop_sequence=["\n"], - version=0, -) -arithmetic_3ds_lighteval = LightevalTaskConfig( - name="arithmetic:3ds", - suite=["lighteval", "arithmetic"], - prompt_function=prompt.arithmetic, - hf_repo="EleutherAI/arithmetic", - hf_subset="arithmetic_3ds", - hf_avail_splits=["validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.acc_golds_likelihood], - stop_sequence=["\n"], - version=0, -) -arithmetic_4da_lighteval = LightevalTaskConfig( - name="arithmetic:4da", - suite=["lighteval", "arithmetic"], - prompt_function=prompt.arithmetic, - hf_repo="EleutherAI/arithmetic", - hf_subset="arithmetic_4da", - hf_avail_splits=["validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.acc_golds_likelihood], - stop_sequence=["\n"], - version=0, -) -arithmetic_4ds_lighteval = LightevalTaskConfig( - name="arithmetic:4ds", - suite=["lighteval", "arithmetic"], - prompt_function=prompt.arithmetic, - hf_repo="EleutherAI/arithmetic", - hf_subset="arithmetic_4ds", - hf_avail_splits=["validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.acc_golds_likelihood], - stop_sequence=["\n"], - version=0, -) -arithmetic_5da_lighteval = LightevalTaskConfig( - name="arithmetic:5da", - suite=["lighteval", "arithmetic"], - prompt_function=prompt.arithmetic, - hf_repo="EleutherAI/arithmetic", - hf_subset="arithmetic_5da", - hf_avail_splits=["validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.acc_golds_likelihood], - stop_sequence=["\n"], - version=0, -) -arithmetic_5ds_lighteval = LightevalTaskConfig( - name="arithmetic:5ds", - suite=["lighteval", "arithmetic"], - prompt_function=prompt.arithmetic, - hf_repo="EleutherAI/arithmetic", - hf_subset="arithmetic_5ds", - hf_avail_splits=["validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.acc_golds_likelihood], - stop_sequence=["\n"], - version=0, -) -arithmetic_bb_bigbench = LightevalTaskConfig( - name="arithmetic_bb", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="arithmetic", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -ascii_word_recognition_bigbench = LightevalTaskConfig( - name="ascii_word_recognition", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="ascii_word_recognition", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -asdiv_lighteval = LightevalTaskConfig( - name="asdiv", - suite=["lighteval"], - prompt_function=prompt.asdiv, - hf_repo="EleutherAI/asdiv", - hf_subset="asdiv", - hf_avail_splits=["validation"], - evaluation_splits=["validation"], - 
few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.acc_golds_likelihood], - stop_sequence=["\n"], - version=0, -) -authorship_verification_bigbench = LightevalTaskConfig( - name="authorship_verification", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="authorship_verification", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -auto_categorization_bigbench = LightevalTaskConfig( - name="auto_categorization", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="auto_categorization", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.bleu], - stop_sequence=["\n"], - version=0, -) -auto_debugging_bigbench_lite = LightevalTaskConfig( - name="auto_debugging", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_linefeed_before_and_after_query, - hf_repo="tasksource/bigbench", - hf_subset="auto_debugging", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=None, - version=0, -) -babi_qa_helm = LightevalTaskConfig( - name="babi_qa", - suite=["helm"], - prompt_function=prompt.babi_qa, - hf_repo="facebook/babi_qa", - hf_subset="en-valid-qa1", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_causal_judgment_lighteval = LightevalTaskConfig( - name="bigbench:causal_judgment", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", - hf_subset="causal_judgement", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bigbench_date_understanding_lighteval = LightevalTaskConfig( - name="bigbench:date_understanding", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", - hf_subset="date_understanding", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bigbench_disambiguation_qa_lighteval = LightevalTaskConfig( - name="bigbench:disambiguation_qa", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", - hf_subset="disambiguation_qa", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - 
few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bigbench_geometric_shapes_lighteval = LightevalTaskConfig( - name="bigbench:geometric_shapes", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", - hf_subset="geometric_shapes", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bigbench_logical_deduction_five_objects_lighteval = LightevalTaskConfig( - name="bigbench:logical_deduction_five_objects", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", - hf_subset="logical_deduction_five_objects", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bigbench_logical_deduction_seven_objects_lighteval = LightevalTaskConfig( - name="bigbench:logical_deduction_seven_objects", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", - hf_subset="logical_deduction_seven_objects", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bigbench_logical_deduction_three_objects_lighteval = LightevalTaskConfig( - name="bigbench:logical_deduction_three_objects", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", - hf_subset="logical_deduction_three_objects", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bigbench_movie_recommendation_lighteval = LightevalTaskConfig( - name="bigbench:movie_recommendation", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", - hf_subset="movie_recommendation", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bigbench_navigate_lighteval = LightevalTaskConfig( - name="bigbench:navigate", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", - hf_subset="navigate", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bigbench_reasoning_about_colored_objects_lighteval = LightevalTaskConfig( - name="bigbench:reasoning_about_colored_objects", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", - hf_subset="reasoning_about_colored_objects", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bigbench_ruin_names_lighteval = LightevalTaskConfig( - name="bigbench:ruin_names", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", 
- hf_subset="ruin_names", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bigbench_salient_translation_error_detection_lighteval = LightevalTaskConfig( - name="bigbench:salient_translation_error_detection", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", - hf_subset="salient_translation_error_detection", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bigbench_snarks_lighteval = LightevalTaskConfig( - name="bigbench:snarks", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", - hf_subset="snarks", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bigbench_sports_understanding_lighteval = LightevalTaskConfig( - name="bigbench:sports_understanding", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", - hf_subset="sports_understanding", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bigbench_temporal_sequences_lighteval = LightevalTaskConfig( - name="bigbench:temporal_sequences", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", - hf_subset="temporal_sequences", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bigbench_tracking_shuffled_objects_five_objects_lighteval = LightevalTaskConfig( - name="bigbench:tracking_shuffled_objects_five_objects", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", - hf_subset="tracking_shuffled_objects_five_objects", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bigbench_tracking_shuffled_objects_seven_objects_lighteval = LightevalTaskConfig( - name="bigbench:tracking_shuffled_objects_seven_objects", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", - hf_subset="tracking_shuffled_objects_seven_objects", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bigbench_tracking_shuffled_objects_three_objects_lighteval = LightevalTaskConfig( - name="bigbench:tracking_shuffled_objects_three_objects", - suite=["lighteval"], - prompt_function=prompt.bbh_lighteval, - hf_repo="lighteval/bbh", - hf_subset="tracking_shuffled_objects_three_objects", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["", "Q=", 
"\n\n"], - version=0, -) -bigbench_causal_judgment_harness = LightevalTaskConfig( - name="bigbench:causal_judgment", - suite=["harness"], - prompt_function=prompt.bbh_harness, - hf_repo="lighteval/bbh", - hf_subset="causal_judgement", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["", "Q=", "\n\n"], - must_remove_duplicate_docs=True, - version=0, -) -bigbench_date_understanding_harness = LightevalTaskConfig( - name="bigbench:date_understanding", - suite=["harness"], - prompt_function=prompt.bbh_harness, - hf_repo="lighteval/bbh", - hf_subset="date_understanding", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["", "Q=", "\n\n"], - must_remove_duplicate_docs=True, - version=0, -) -bigbench_disambiguation_qa_harness = LightevalTaskConfig( - name="bigbench:disambiguation_qa", - suite=["harness"], - prompt_function=prompt.bbh_harness, - hf_repo="lighteval/bbh", - hf_subset="disambiguation_qa", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["", "Q=", "\n\n"], - must_remove_duplicate_docs=True, - version=0, -) -bigbench_geometric_shapes_harness = LightevalTaskConfig( - name="bigbench:geometric_shapes", - suite=["harness"], - prompt_function=prompt.bbh_harness, - hf_repo="lighteval/bbh", - hf_subset="geometric_shapes", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["", "Q=", "\n\n"], - must_remove_duplicate_docs=True, - version=0, -) -bigbench_logical_deduction_five_objects_harness = LightevalTaskConfig( - name="bigbench:logical_deduction_five_objects", - suite=["harness"], - prompt_function=prompt.bbh_harness, - hf_repo="lighteval/bbh", - hf_subset="logical_deduction_five_objects", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["", "Q=", "\n\n"], - must_remove_duplicate_docs=True, - version=0, -) -bigbench_logical_deduction_seven_objects_harness = LightevalTaskConfig( - name="bigbench:logical_deduction_seven_objects", - suite=["harness"], - prompt_function=prompt.bbh_harness, - hf_repo="lighteval/bbh", - hf_subset="logical_deduction_seven_objects", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["", "Q=", "\n\n"], - must_remove_duplicate_docs=True, - version=0, -) 
-bigbench_logical_deduction_three_objects_harness = LightevalTaskConfig(
-    name="bigbench:logical_deduction_three_objects",
-    suite=["harness"],
-    prompt_function=prompt.bbh_harness,
-    hf_repo="lighteval/bbh",
-    hf_subset="logical_deduction_three_objects",
-    hf_avail_splits=["train"],
-    evaluation_splits=["train"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=-1,
-    metrics=[
-        Metrics.loglikelihood_acc,
-        Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
-    ],
-    stop_sequence=["", "Q=", "\n\n"],
-    must_remove_duplicate_docs=True,
-    version=0,
-)
-bigbench_movie_recommendation_harness = LightevalTaskConfig(
-    name="bigbench:movie_recommendation",
-    suite=["harness"],
-    prompt_function=prompt.bbh_harness,
-    hf_repo="lighteval/bbh",
-    hf_subset="movie_recommendation",
-    hf_avail_splits=["train"],
-    evaluation_splits=["train"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=-1,
-    metrics=[
-        Metrics.loglikelihood_acc,
-        Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
-    ],
-    stop_sequence=["", "Q=", "\n\n"],
-    must_remove_duplicate_docs=True,
-    version=0,
-)
-bigbench_navigate_harness = LightevalTaskConfig(
-    name="bigbench:navigate",
-    suite=["harness"],
-    prompt_function=prompt.bbh_harness,
-    hf_repo="lighteval/bbh",
-    hf_subset="navigate",
-    hf_avail_splits=["train"],
-    evaluation_splits=["train"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=-1,
-    metrics=[
-        Metrics.loglikelihood_acc,
-        Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
-    ],
-    stop_sequence=["", "Q=", "\n\n"],
-    must_remove_duplicate_docs=True,
-    version=0,
-)
-bigbench_reasoning_about_colored_objects_harness = LightevalTaskConfig(
-    name="bigbench:reasoning_about_colored_objects",
-    suite=["harness"],
-    prompt_function=prompt.bbh_harness,
-    hf_repo="lighteval/bbh",
-    hf_subset="reasoning_about_colored_objects",
-    hf_avail_splits=["train"],
-    evaluation_splits=["train"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=-1,
-    metrics=[
-        Metrics.loglikelihood_acc,
-        Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
-    ],
-    stop_sequence=["", "Q=", "\n\n"],
-    must_remove_duplicate_docs=True,
-    version=0,
-)
-bigbench_ruin_names_harness = LightevalTaskConfig(
-    name="bigbench:ruin_names",
-    suite=["harness"],
-    prompt_function=prompt.bbh_harness,
-    hf_repo="lighteval/bbh",
-    hf_subset="ruin_names",
-    hf_avail_splits=["train"],
-    evaluation_splits=["train"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=-1,
-    metrics=[
-        Metrics.loglikelihood_acc,
-        Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
-    ],
-    stop_sequence=["", "Q=", "\n\n"],
-    must_remove_duplicate_docs=True,
-    version=0,
-)
-bigbench_salient_translation_error_detection_harness = LightevalTaskConfig(
-    name="bigbench:salient_translation_error_detection",
-    suite=["harness"],
-    prompt_function=prompt.bbh_harness,
-    hf_repo="lighteval/bbh",
-    hf_subset="salient_translation_error_detection",
-    hf_avail_splits=["train"],
-    evaluation_splits=["train"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=-1,
-    metrics=[
-        Metrics.loglikelihood_acc,
-        Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
-    ],
-    stop_sequence=["", "Q=", "\n\n"],
-    must_remove_duplicate_docs=True,
-    version=0,
-)
-bigbench_snarks_harness = LightevalTaskConfig(
-    name="bigbench:snarks",
-    suite=["harness"],
-    prompt_function=prompt.bbh_harness,
-    hf_repo="lighteval/bbh",
-    hf_subset="snarks",
-    hf_avail_splits=["train"],
-    evaluation_splits=["train"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=-1,
-    metrics=[
-        Metrics.loglikelihood_acc,
-        Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
-    ],
-    stop_sequence=["", "Q=", "\n\n"],
-    must_remove_duplicate_docs=True,
-    version=0,
-)
-bigbench_sports_understanding_harness = LightevalTaskConfig(
-    name="bigbench:sports_understanding",
-    suite=["harness"],
-    prompt_function=prompt.bbh_harness,
-    hf_repo="lighteval/bbh",
-    hf_subset="sports_understanding",
-    hf_avail_splits=["train"],
-    evaluation_splits=["train"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=-1,
-    metrics=[
-        Metrics.loglikelihood_acc,
-        Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
-    ],
-    stop_sequence=["", "Q=", "\n\n"],
-    must_remove_duplicate_docs=True,
-    version=0,
-)
-bigbench_temporal_sequences_harness = LightevalTaskConfig(
-    name="bigbench:temporal_sequences",
-    suite=["harness"],
-    prompt_function=prompt.bbh_harness,
-    hf_repo="lighteval/bbh",
-    hf_subset="temporal_sequences",
-    hf_avail_splits=["train"],
-    evaluation_splits=["train"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=-1,
-    metrics=[
-        Metrics.loglikelihood_acc,
-        Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
-    ],
-    stop_sequence=["", "Q=", "\n\n"],
-    must_remove_duplicate_docs=True,
-    version=0,
-)
-bigbench_tracking_shuffled_objects_five_objects_harness = LightevalTaskConfig(
-    name="bigbench:tracking_shuffled_objects_five_objects",
-    suite=["harness"],
-    prompt_function=prompt.bbh_harness,
-    hf_repo="lighteval/bbh",
-    hf_subset="tracking_shuffled_objects_five_objects",
-    hf_avail_splits=["train"],
-    evaluation_splits=["train"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=-1,
-    metrics=[
-        Metrics.loglikelihood_acc,
-        Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
-    ],
-    stop_sequence=["", "Q=", "\n\n"],
-    must_remove_duplicate_docs=True,
-    version=0,
-)
-bigbench_tracking_shuffled_objects_seven_objects_harness = LightevalTaskConfig(
-    name="bigbench:tracking_shuffled_objects_seven_objects",
-    suite=["harness"],
-    prompt_function=prompt.bbh_harness,
-    hf_repo="lighteval/bbh",
-    hf_subset="tracking_shuffled_objects_seven_objects",
-    hf_avail_splits=["train"],
-    evaluation_splits=["train"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=-1,
-    metrics=[
-        Metrics.loglikelihood_acc,
-        Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
-    ],
-    stop_sequence=["", "Q=", "\n\n"],
-    must_remove_duplicate_docs=True,
-    version=0,
-)
-bigbench_tracking_shuffled_objects_three_objects_harness = LightevalTaskConfig(
-    name="bigbench:tracking_shuffled_objects_three_objects",
-    suite=["harness"],
-    prompt_function=prompt.bbh_harness,
-    hf_repo="lighteval/bbh",
-    hf_subset="tracking_shuffled_objects_three_objects",
-    hf_avail_splits=["train"],
-    evaluation_splits=["train"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=-1,
-    metrics=[
-        Metrics.loglikelihood_acc,
-        Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}),
-    ],
-    stop_sequence=["", "Q=", "\n\n"],
-    must_remove_duplicate_docs=True,
-    version=0,
-)
-bbh_boolean_expressions_harness = LightevalTaskConfig( - name="bbh:boolean_expressions", - suite=["harness"], - prompt_function=prompt.bbh_boolean_expressions, - hf_repo="lukaemon/bbh", - hf_subset="boolean_expressions", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_causal_judgment_harness = LightevalTaskConfig( - name="bbh:causal_judgment", - suite=["harness"], - prompt_function=prompt.bbh_causal_judgment, - hf_repo="lukaemon/bbh", - hf_subset="causal_judgement", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_date_understanding_harness = LightevalTaskConfig( - name="bbh:date_understanding", - suite=["harness"], - prompt_function=prompt.bbh_date_understanding, - hf_repo="lukaemon/bbh", - hf_subset="date_understanding", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_disambiguation_qa_harness = LightevalTaskConfig( - name="bbh:disambiguation_qa", - suite=["harness"], - prompt_function=prompt.bbh_disambiguation_qa, - hf_repo="lukaemon/bbh", - hf_subset="disambiguation_qa", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_dyck_languages_harness = LightevalTaskConfig( - name="bbh:dyck_languages", - suite=["harness"], - prompt_function=prompt.bbh_dyck_languages, - hf_repo="lukaemon/bbh", - hf_subset="dyck_languages", - 
hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_formal_fallacies_harness = LightevalTaskConfig( - name="bbh:formal_fallacies", - suite=["harness"], - prompt_function=prompt.bbh_formal_fallacies, - hf_repo="lukaemon/bbh", - hf_subset="formal_fallacies", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_geometric_shapes_harness = LightevalTaskConfig( - name="bbh:geometric_shapes", - suite=["harness"], - prompt_function=prompt.bbh_geometric_shapes, - hf_repo="lukaemon/bbh", - hf_subset="geometric_shapes", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_hyperbaton_harness = LightevalTaskConfig( - name="bbh:hyperbaton", - suite=["harness"], - prompt_function=prompt.bbh_hyperbaton, - hf_repo="lukaemon/bbh", - hf_subset="hyperbaton", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_logical_deduction_five_objects_harness = LightevalTaskConfig( - name="bbh:logical_deduction_five_objects", - suite=["harness"], - prompt_function=prompt.bbh_logical_deduction_five_objects, - hf_repo="lukaemon/bbh", - hf_subset="logical_deduction_five_objects", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - 
Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_logical_deduction_seven_objects_harness = LightevalTaskConfig( - name="bbh:logical_deduction_seven_objects", - suite=["harness"], - prompt_function=prompt.bbh_logical_deduction_seven_objects, - hf_repo="lukaemon/bbh", - hf_subset="logical_deduction_seven_objects", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_logical_deduction_three_objects_harness = LightevalTaskConfig( - name="bbh:logical_deduction_three_objects", - suite=["harness"], - prompt_function=prompt.bbh_logical_deduction_three_objects, - hf_repo="lukaemon/bbh", - hf_subset="logical_deduction_three_objects", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_movie_recommendation_harness = LightevalTaskConfig( - name="bbh:movie_recommendation", - suite=["harness"], - prompt_function=prompt.bbh_movie_recommendation, - hf_repo="lukaemon/bbh", - hf_subset="movie_recommendation", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_multistep_arithmetic_two_harness = LightevalTaskConfig( - name="bbh:multistep_arithmetic_two", - suite=["harness"], - prompt_function=prompt.bbh_multistep_arithmetic_two, - hf_repo="lukaemon/bbh", - hf_subset="multistep_arithmetic_two", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, 
"normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_navigate_harness = LightevalTaskConfig( - name="bbh:navigate", - suite=["harness"], - prompt_function=prompt.bbh_navigate, - hf_repo="lukaemon/bbh", - hf_subset="navigate", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_object_counting_harness = LightevalTaskConfig( - name="bbh:object_counting", - suite=["harness"], - prompt_function=prompt.bbh_object_counting, - hf_repo="lukaemon/bbh", - hf_subset="object_counting", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_penguins_in_a_table_harness = LightevalTaskConfig( - name="bbh:penguins_in_a_table", - suite=["harness"], - prompt_function=prompt.bbh_penguins_in_a_table, - hf_repo="lukaemon/bbh", - hf_subset="penguins_in_a_table", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_reasoning_about_colored_objects_harness = LightevalTaskConfig( - name="bbh:reasoning_about_colored_objects", - suite=["harness"], - prompt_function=prompt.bbh_reasoning_about_colored_objects, - hf_repo="lukaemon/bbh", - hf_subset="reasoning_about_colored_objects", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": 
helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_ruin_names_harness = LightevalTaskConfig( - name="bbh:ruin_names", - suite=["harness"], - prompt_function=prompt.bbh_ruin_names, - hf_repo="lukaemon/bbh", - hf_subset="ruin_names", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_salient_translation_error_detection_harness = LightevalTaskConfig( - name="bbh:salient_translation_error_detection", - suite=["harness"], - prompt_function=prompt.bbh_salient_translation_error_detection, - hf_repo="lukaemon/bbh", - hf_subset="salient_translation_error_detection", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_snarks_harness = LightevalTaskConfig( - name="bbh:snarks", - suite=["harness"], - prompt_function=prompt.bbh_snarks, - hf_repo="lukaemon/bbh", - hf_subset="snarks", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_sports_understanding_harness = LightevalTaskConfig( - name="bbh:sports_understanding", - suite=["harness"], - prompt_function=prompt.bbh_sports_understanding, - hf_repo="lukaemon/bbh", - hf_subset="sports_understanding", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_temporal_sequences_harness = 
LightevalTaskConfig( - name="bbh:temporal_sequences", - suite=["harness"], - prompt_function=prompt.bbh_temporal_sequences, - hf_repo="lukaemon/bbh", - hf_subset="temporal_sequences", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_tracking_shuffled_objects_five_objects_harness = LightevalTaskConfig( - name="bbh:tracking_shuffled_objects_five_objects", - suite=["harness"], - prompt_function=prompt.bbh_tracking_shuffled_objects_five_objects, - hf_repo="lukaemon/bbh", - hf_subset="tracking_shuffled_objects_five_objects", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_tracking_shuffled_objects_seven_objects_harness = LightevalTaskConfig( - name="bbh:tracking_shuffled_objects_seven_objects", - suite=["harness"], - prompt_function=prompt.bbh_tracking_shuffled_objects_seven_objects, - hf_repo="lukaemon/bbh", - hf_subset="tracking_shuffled_objects_seven_objects", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_tracking_shuffled_objects_three_objects_harness = LightevalTaskConfig( - name="bbh:tracking_shuffled_objects_three_objects", - suite=["harness"], - prompt_function=prompt.bbh_tracking_shuffled_objects_three_objects, - hf_repo="lukaemon/bbh", - hf_subset="tracking_shuffled_objects_three_objects", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, 
-) -bbh_web_of_lies_harness = LightevalTaskConfig( - name="bbh:web_of_lies", - suite=["harness"], - prompt_function=prompt.bbh_web_of_lies, - hf_repo="lukaemon/bbh", - hf_subset="web_of_lies", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbh_word_sorting_harness = LightevalTaskConfig( - name="bbh:word_sorting", - suite=["harness"], - prompt_function=prompt.bbh_word_sorting, - hf_repo="lukaemon/bbh", - hf_subset="word_sorting", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["", "Q=", "\n\n"], - version=0, -) -bbq_helm = LightevalTaskConfig( - name="bbq", - suite=["helm"], - prompt_function=prompt.bbq, - hf_repo="lighteval/bbq_helm", - hf_subset="all", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["\n"], - version=0, -) -bbq_Age_helm = LightevalTaskConfig( - name="bbq:Age", - suite=["helm"], - prompt_function=prompt.bbq, - hf_repo="lighteval/bbq_helm", - hf_subset="Age", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["\n"], - version=0, -) -bbq_Disability_status_helm = LightevalTaskConfig( - name="bbq:Disability_status", - suite=["helm"], - prompt_function=prompt.bbq, - hf_repo="lighteval/bbq_helm", - hf_subset="Disability_status", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - 
Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["\n"], - version=0, -) -bbq_Gender_identity_helm = LightevalTaskConfig( - name="bbq:Gender_identity", - suite=["helm"], - prompt_function=prompt.bbq, - hf_repo="lighteval/bbq_helm", - hf_subset="Gender_identity", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["\n"], - version=0, -) -bbq_Nationality_helm = LightevalTaskConfig( - name="bbq:Nationality", - suite=["helm"], - prompt_function=prompt.bbq, - hf_repo="lighteval/bbq_helm", - hf_subset="Nationality", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["\n"], - version=0, -) -bbq_Physical_appearance_helm = LightevalTaskConfig( - name="bbq:Physical_appearance", - suite=["helm"], - prompt_function=prompt.bbq, - hf_repo="lighteval/bbq_helm", - hf_subset="Physical_appearance", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["\n"], - version=0, -) -bbq_Race_ethnicity_helm = LightevalTaskConfig( - name="bbq:Race_ethnicity", - suite=["helm"], - prompt_function=prompt.bbq, - hf_repo="lighteval/bbq_helm", - hf_subset="Race_ethnicity", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - 
Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["\n"], - version=0, -) -bbq_Race_x_SES_helm = LightevalTaskConfig( - name="bbq:Race_x_SES", - suite=["helm"], - prompt_function=prompt.bbq, - hf_repo="lighteval/bbq_helm", - hf_subset="Race_x_SES", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["\n"], - version=0, -) -bbq_Race_x_gender_helm = LightevalTaskConfig( - name="bbq:Race_x_gender", - suite=["helm"], - prompt_function=prompt.bbq, - hf_repo="lighteval/bbq_helm", - hf_subset="Race_x_gender", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["\n"], - version=0, -) -bbq_Religion_helm = LightevalTaskConfig( - name="bbq:Religion", - suite=["helm"], - prompt_function=prompt.bbq, - hf_repo="lighteval/bbq_helm", - hf_subset="Religion", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["\n"], - version=0, -) -bbq_SES_helm = LightevalTaskConfig( - name="bbq:SES", - suite=["helm"], - prompt_function=prompt.bbq, - hf_repo="lighteval/bbq_helm", - hf_subset="SES", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["\n"], - version=0, -) -bbq_Sexual_orientation_helm = LightevalTaskConfig( - name="bbq:Sexual_orientation", - suite=["helm"], - prompt_function=prompt.bbq, - hf_repo="lighteval/bbq_helm", - hf_subset="Sexual_orientation", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - 
few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["\n"], - version=0, -) -bbq_lite_json_bigbench_lite = LightevalTaskConfig( - name="bbq_lite_json", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="bbq_lite_json", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -bigbench_auto_debugging_helm = LightevalTaskConfig( - name="bigbench:auto_debugging", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="auto_debugging", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_age_ambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:age_ambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="bbq_lite_json-age_ambig", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_age_disambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:age_disambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="bbq_lite_json-age_disambig", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_disability_status_ambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:disability_status_ambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - 
hf_subset="bbq_lite_json-disability_status_ambig", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_disability_status_disambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:disability_status_disambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="bbq_lite_json-disability_status_disambig", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_gender_identity_ambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:gender_identity_ambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="bbq_lite_json-gender_identity_ambig", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_gender_identity_disambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:gender_identity_disambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="bbq_lite_json-gender_identity_disambig", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_nationality_ambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:nationality_ambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="bbq_lite_json-nationality_ambig", - hf_avail_splits=["train", "test", 
"validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_nationality_disambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:nationality_disambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="bbq_lite_json-nationality_disambig", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_physical_appearance_ambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:physical_appearance_ambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="bbq_lite_json-physical_appearance_ambig", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_physical_appearance_disambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:physical_appearance_disambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="bbq_lite_json-physical_appearance_disambig", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_race_ethnicity_ambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:race_ethnicity_ambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="bbq_lite_json-race_ethnicity_ambig", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - 
few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_race_ethnicity_disambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:race_ethnicity_disambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="bbq_lite_json-race_ethnicity_disambig", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_religion_ambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:religion_ambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="bbq_lite_json-religion_ambig", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_religion_disambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:religion_disambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="bbq_lite_json-religion_disambig", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_ses_ambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:ses_ambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="bbq_lite_json-ses_ambig", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, 
"normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_ses_disambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:ses_disambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="bbq_lite_json-ses_disambig", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_sexual_orientation_ambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:sexual_orientation_ambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="bbq_lite_json-sexual_orientation_ambig", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_bbq_lite_json_sexual_orientation_disambig_helm = LightevalTaskConfig( - name="bigbench:bbq_lite_json:sexual_orientation_disambig", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="bbq_lite_json-sexual_orientation_disambig", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_code_line_description_helm = LightevalTaskConfig( - name="bigbench:code_line_description", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="code_line_description", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - 
sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_conceptual_combinations_contradictions_helm = LightevalTaskConfig( - name="bigbench:conceptual_combinations:contradictions", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conceptual_combinations-contradictions", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_conceptual_combinations_emergent_properties_helm = LightevalTaskConfig( - name="bigbench:conceptual_combinations:emergent_properties", - suite=["helm"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conceptual_combinations-emergent_properties", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_conceptual_combinations_fanciful_fictional_combinations_helm = LightevalTaskConfig( - name="bigbench:conceptual_combinations:fanciful_fictional_combinations", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conceptual_combinations-fanciful_fictional_combinations", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_conceptual_combinations_homonyms_helm = LightevalTaskConfig( - name="bigbench:conceptual_combinations:homonyms", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conceptual_combinations-homonyms", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": 
helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_conceptual_combinations_invented_words_helm = LightevalTaskConfig( - name="bigbench:conceptual_combinations:invented_words", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conceptual_combinations-invented_words", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_conlang_translation_adna_from_helm = LightevalTaskConfig( - name="bigbench:conlang_translation:adna_from", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conlang_translation-adna_from", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], - stop_sequence=["\n"], - version=0, -) -bigbench_conlang_translation_adna_to_helm = LightevalTaskConfig( - name="bigbench:conlang_translation:adna_to", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conlang_translation-adna_to", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], - stop_sequence=["\n"], - version=0, -) -bigbench_conlang_translation_atikampe_from_helm = LightevalTaskConfig( - name="bigbench:conlang_translation:atikampe_from", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conlang_translation-atikampe_from", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], - stop_sequence=["\n"], - version=0, -) -bigbench_conlang_translation_atikampe_to_helm = LightevalTaskConfig( - name="bigbench:conlang_translation:atikampe_to", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conlang_translation-atikampe_to", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], - stop_sequence=["\n"], - version=0, -) -bigbench_conlang_translation_gornam_from_helm = LightevalTaskConfig( - name="bigbench:conlang_translation:gornam_from", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conlang_translation-gornam_from", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - 
few_shots_select=None, - generation_size=100, - metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], - stop_sequence=["\n"], - version=0, -) -bigbench_conlang_translation_gornam_to_helm = LightevalTaskConfig( - name="bigbench:conlang_translation:gornam_to", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conlang_translation-gornam_to", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], - stop_sequence=["\n"], - version=0, -) -bigbench_conlang_translation_holuan_from_helm = LightevalTaskConfig( - name="bigbench:conlang_translation:holuan_from", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conlang_translation-holuan_from", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], - stop_sequence=["\n"], - version=0, -) -bigbench_conlang_translation_holuan_to_helm = LightevalTaskConfig( - name="bigbench:conlang_translation:holuan_to", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conlang_translation-holuan_to", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], - stop_sequence=["\n"], - version=0, -) -bigbench_conlang_translation_mkafala_from_helm = LightevalTaskConfig( - name="bigbench:conlang_translation:mkafala_from", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conlang_translation-mkafala_from", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], - stop_sequence=["\n"], - version=0, -) -bigbench_conlang_translation_mkafala_to_helm = LightevalTaskConfig( - name="bigbench:conlang_translation:mkafala_to", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conlang_translation-mkafala_to", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], - stop_sequence=["\n"], - version=0, -) -bigbench_conlang_translation_postpositive_english_from_helm = LightevalTaskConfig( - name="bigbench:conlang_translation:postpositive_english_from", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conlang_translation-postpositive_english_from", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], - stop_sequence=["\n"], - version=0, -) -bigbench_conlang_translation_postpositive_english_to_helm = LightevalTaskConfig( - name="bigbench:conlang_translation:postpositive_english_to", - 
suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conlang_translation-postpositive_english_to", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], - stop_sequence=["\n"], - version=0, -) -bigbench_conlang_translation_unapuri_from_helm = LightevalTaskConfig( - name="bigbench:conlang_translation:unapuri_from", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conlang_translation-unapuri_from", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], - stop_sequence=["\n"], - version=0, -) -bigbench_conlang_translation_unapuri_to_helm = LightevalTaskConfig( - name="bigbench:conlang_translation:unapuri_to", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conlang_translation-unapuri_to", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], - stop_sequence=["\n"], - version=0, -) -bigbench_conlang_translation_vaomi_from_helm = LightevalTaskConfig( - name="bigbench:conlang_translation:vaomi_from", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conlang_translation-vaomi_from", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], - stop_sequence=["\n"], - version=0, -) -bigbench_conlang_translation_vaomi_to_helm = LightevalTaskConfig( - name="bigbench:conlang_translation:vaomi_to", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="conlang_translation-vaomi_to", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.rouge1, Metrics.rouge2, Metrics.rougeL], - stop_sequence=["\n"], - version=0, -) -bigbench_emoji_movie_helm = LightevalTaskConfig( - name="bigbench:emoji_movie", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="emoji_movie", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_formal_fallacies_syllogisms_negation_helm = LightevalTaskConfig( - name="bigbench:formal_fallacies_syllogisms_negation", - suite=["helm", "bigbench_scenario"], - 
prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="formal_fallacies_syllogisms_negation", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_hindu_knowledge_helm = LightevalTaskConfig( - name="bigbench:hindu_knowledge", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="hindu_knowledge", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_known_unknowns_helm = LightevalTaskConfig( - name="bigbench:known_unknowns", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="known_unknowns", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_language_identification_helm = LightevalTaskConfig( - name="bigbench:language_identification", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="language_identification", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_linguistics_puzzles_helm = LightevalTaskConfig( - name="bigbench:linguistics_puzzles", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="linguistics_puzzles", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - 
Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_logic_grid_puzzle_helm = LightevalTaskConfig( - name="bigbench:logic_grid_puzzle", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="logic_grid_puzzle", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_logical_deduction_five_objects_helm = LightevalTaskConfig( - name="bigbench:logical_deduction-five_objects", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="logical_deduction-five_objects", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_logical_deduction_seven_objects_helm = LightevalTaskConfig( - name="bigbench:logical_deduction-seven_objects", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="logical_deduction-seven_objects", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_logical_deduction_three_objects_helm = LightevalTaskConfig( - name="bigbench:logical_deduction-three_objects", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="logical_deduction-three_objects", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) 
-bigbench_misconceptions_russian_helm = LightevalTaskConfig( - name="bigbench:misconceptions_russian", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="misconceptions_russian", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_novel_concepts_helm = LightevalTaskConfig( - name="bigbench:novel_concepts", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="novel_concepts", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_operators_helm = LightevalTaskConfig( - name="bigbench:operators", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="operators", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_parsinlu_reading_comprehension_helm = LightevalTaskConfig( - name="bigbench:parsinlu_reading_comprehension", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="parsinlu_reading_comprehension", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_play_dialog_same_or_different_helm = LightevalTaskConfig( - name="bigbench:play_dialog_same_or_different", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="play_dialog_same_or_different", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": 
helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_repeat_copy_logic_helm = LightevalTaskConfig( - name="bigbench:repeat_copy_logic", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="repeat_copy_logic", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_strange_stories_boolean_helm = LightevalTaskConfig( - name="bigbench:strange_stories-boolean", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="strange_stories-boolean", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_strange_stories_multiple_choice_helm = LightevalTaskConfig( - name="bigbench:strange_stories-multiple_choice", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="strange_stories-multiple_choice", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_strategyqa_helm = LightevalTaskConfig( - name="bigbench:strategyqa", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="strategyqa", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_symbol_interpretation_adversarial_helm = LightevalTaskConfig( - name="bigbench:symbol_interpretation-adversarial", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="symbol_interpretation-adversarial", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - 
few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_symbol_interpretation_emoji_agnostic_helm = LightevalTaskConfig( - name="bigbench:symbol_interpretation-emoji_agnostic", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="symbol_interpretation-emoji_agnostic", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_symbol_interpretation_name_agnostic_helm = LightevalTaskConfig( - name="bigbench:symbol_interpretation-name_agnostic", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="symbol_interpretation-name_agnostic", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_symbol_interpretation_plain_helm = LightevalTaskConfig( - name="bigbench:symbol_interpretation-plain", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="symbol_interpretation-plain", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_symbol_interpretation_tricky_helm = LightevalTaskConfig( - name="bigbench:symbol_interpretation-tricky", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="symbol_interpretation-tricky", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": 
helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_vitaminc_fact_verification_helm = LightevalTaskConfig( - name="bigbench:vitaminc_fact_verification", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="vitaminc_fact_verification", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bigbench_winowhy_helm = LightevalTaskConfig( - name="bigbench:winowhy", - suite=["helm", "bigbench_scenario"], - prompt_function=prompt.bigbench_helm, - hf_repo="lighteval/bigbench_helm", - hf_subset="winowhy", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -blimp_adjunct_island_lighteval = LightevalTaskConfig( - name="blimp:adjunct_island", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="adjunct_island", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_adjunct_island_helm = LightevalTaskConfig( - name="blimp:adjunct_island", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="adjunct_island", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_anaphor_gender_agreement_lighteval = LightevalTaskConfig( - name="blimp:anaphor_gender_agreement", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="anaphor_gender_agreement", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_anaphor_gender_agreement_helm = LightevalTaskConfig( - name="blimp:anaphor_gender_agreement", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="anaphor_gender_agreement", - hf_avail_splits=["train"], - 
evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_anaphor_number_agreement_lighteval = LightevalTaskConfig( - name="blimp:anaphor_number_agreement", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="anaphor_number_agreement", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_anaphor_number_agreement_helm = LightevalTaskConfig( - name="blimp:anaphor_number_agreement", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="anaphor_number_agreement", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_animate_subject_passive_lighteval = LightevalTaskConfig( - name="blimp:animate_subject_passive", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="animate_subject_passive", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_animate_subject_passive_helm = LightevalTaskConfig( - name="blimp:animate_subject_passive", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="animate_subject_passive", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_animate_subject_trans_lighteval = LightevalTaskConfig( - name="blimp:animate_subject_trans", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="animate_subject_trans", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_animate_subject_trans_helm = LightevalTaskConfig( - name="blimp:animate_subject_trans", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="animate_subject_trans", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_causative_lighteval = LightevalTaskConfig( - name="blimp:causative", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="causative", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) 
-blimp_causative_helm = LightevalTaskConfig( - name="blimp:causative", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="causative", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_complex_NP_island_lighteval = LightevalTaskConfig( - name="blimp:complex_NP_island", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="complex_NP_island", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_complex_NP_island_helm = LightevalTaskConfig( - name="blimp:complex_NP_island", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="complex_NP_island", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_coordinate_structure_constraint_complex_left_branch_lighteval = LightevalTaskConfig( - name="blimp:coordinate_structure_constraint_complex_left_branch", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="coordinate_structure_constraint_complex_left_branch", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_coordinate_structure_constraint_complex_left_branch_helm = LightevalTaskConfig( - name="blimp:coordinate_structure_constraint_complex_left_branch", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="coordinate_structure_constraint_complex_left_branch", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_coordinate_structure_constraint_object_extraction_lighteval = LightevalTaskConfig( - name="blimp:coordinate_structure_constraint_object_extraction", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="coordinate_structure_constraint_object_extraction", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_coordinate_structure_constraint_object_extraction_helm = LightevalTaskConfig( - name="blimp:coordinate_structure_constraint_object_extraction", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="coordinate_structure_constraint_object_extraction", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - 
Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_determiner_noun_agreement_1_lighteval = LightevalTaskConfig( - name="blimp:determiner_noun_agreement_1", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="determiner_noun_agreement_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_determiner_noun_agreement_1_helm = LightevalTaskConfig( - name="blimp:determiner_noun_agreement_1", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="determiner_noun_agreement_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_determiner_noun_agreement_2_lighteval = LightevalTaskConfig( - name="blimp:determiner_noun_agreement_2", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="determiner_noun_agreement_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_determiner_noun_agreement_2_helm = LightevalTaskConfig( - name="blimp:determiner_noun_agreement_2", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="determiner_noun_agreement_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_determiner_noun_agreement_irregular_1_lighteval = LightevalTaskConfig( - name="blimp:determiner_noun_agreement_irregular_1", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="determiner_noun_agreement_irregular_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_determiner_noun_agreement_irregular_1_helm = LightevalTaskConfig( - name="blimp:determiner_noun_agreement_irregular_1", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="determiner_noun_agreement_irregular_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_determiner_noun_agreement_irregular_2_lighteval = LightevalTaskConfig( - name="blimp:determiner_noun_agreement_irregular_2", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="determiner_noun_agreement_irregular_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - 
metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_determiner_noun_agreement_irregular_2_helm = LightevalTaskConfig( - name="blimp:determiner_noun_agreement_irregular_2", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="determiner_noun_agreement_irregular_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_determiner_noun_agreement_with_adj_2_lighteval = LightevalTaskConfig( - name="blimp:determiner_noun_agreement_with_adj_2", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="determiner_noun_agreement_with_adj_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_determiner_noun_agreement_with_adj_2_helm = LightevalTaskConfig( - name="blimp:determiner_noun_agreement_with_adj_2", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="determiner_noun_agreement_with_adj_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_determiner_noun_agreement_with_adj_irregular_1_lighteval = LightevalTaskConfig( - name="blimp:determiner_noun_agreement_with_adj_irregular_1", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="determiner_noun_agreement_with_adj_irregular_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_determiner_noun_agreement_with_adj_irregular_1_helm = LightevalTaskConfig( - name="blimp:determiner_noun_agreement_with_adj_irregular_1", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="determiner_noun_agreement_with_adj_irregular_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_determiner_noun_agreement_with_adj_irregular_2_lighteval = LightevalTaskConfig( - name="blimp:determiner_noun_agreement_with_adj_irregular_2", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="determiner_noun_agreement_with_adj_irregular_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_determiner_noun_agreement_with_adj_irregular_2_helm = LightevalTaskConfig( - name="blimp:determiner_noun_agreement_with_adj_irregular_2", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - 
hf_subset="determiner_noun_agreement_with_adj_irregular_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_determiner_noun_agreement_with_adjective_1_lighteval = LightevalTaskConfig( - name="blimp:determiner_noun_agreement_with_adjective_1", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="determiner_noun_agreement_with_adjective_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_determiner_noun_agreement_with_adjective_1_helm = LightevalTaskConfig( - name="blimp:determiner_noun_agreement_with_adjective_1", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="determiner_noun_agreement_with_adjective_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_distractor_agreement_relational_noun_lighteval = LightevalTaskConfig( - name="blimp:distractor_agreement_relational_noun", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="distractor_agreement_relational_noun", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_distractor_agreement_relational_noun_helm = LightevalTaskConfig( - name="blimp:distractor_agreement_relational_noun", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="distractor_agreement_relational_noun", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_distractor_agreement_relative_clause_lighteval = LightevalTaskConfig( - name="blimp:distractor_agreement_relative_clause", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="distractor_agreement_relative_clause", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_distractor_agreement_relative_clause_helm = LightevalTaskConfig( - name="blimp:distractor_agreement_relative_clause", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="distractor_agreement_relative_clause", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_drop_argument_lighteval = 
LightevalTaskConfig( - name="blimp:drop_argument", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="drop_argument", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_drop_argument_helm = LightevalTaskConfig( - name="blimp:drop_argument", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="drop_argument", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_ellipsis_n_bar_1_lighteval = LightevalTaskConfig( - name="blimp:ellipsis_n_bar_1", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="ellipsis_n_bar_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_ellipsis_n_bar_1_helm = LightevalTaskConfig( - name="blimp:ellipsis_n_bar_1", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="ellipsis_n_bar_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_ellipsis_n_bar_2_lighteval = LightevalTaskConfig( - name="blimp:ellipsis_n_bar_2", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="ellipsis_n_bar_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_ellipsis_n_bar_2_helm = LightevalTaskConfig( - name="blimp:ellipsis_n_bar_2", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="ellipsis_n_bar_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_existential_there_object_raising_lighteval = LightevalTaskConfig( - name="blimp:existential_there_object_raising", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="existential_there_object_raising", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_existential_there_object_raising_helm = LightevalTaskConfig( - name="blimp:existential_there_object_raising", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="existential_there_object_raising", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - 
metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_existential_there_quantifiers_1_lighteval = LightevalTaskConfig( - name="blimp:existential_there_quantifiers_1", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="existential_there_quantifiers_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_existential_there_quantifiers_1_helm = LightevalTaskConfig( - name="blimp:existential_there_quantifiers_1", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="existential_there_quantifiers_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_existential_there_quantifiers_2_lighteval = LightevalTaskConfig( - name="blimp:existential_there_quantifiers_2", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="existential_there_quantifiers_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_existential_there_quantifiers_2_helm = LightevalTaskConfig( - name="blimp:existential_there_quantifiers_2", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="existential_there_quantifiers_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_existential_there_subject_raising_lighteval = LightevalTaskConfig( - name="blimp:existential_there_subject_raising", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="existential_there_subject_raising", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_existential_there_subject_raising_helm = LightevalTaskConfig( - name="blimp:existential_there_subject_raising", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="existential_there_subject_raising", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_expletive_it_object_raising_lighteval = LightevalTaskConfig( - name="blimp:expletive_it_object_raising", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="expletive_it_object_raising", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - 
few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_expletive_it_object_raising_helm = LightevalTaskConfig( - name="blimp:expletive_it_object_raising", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="expletive_it_object_raising", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_inchoative_lighteval = LightevalTaskConfig( - name="blimp:inchoative", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="inchoative", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_inchoative_helm = LightevalTaskConfig( - name="blimp:inchoative", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="inchoative", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_intransitive_lighteval = LightevalTaskConfig( - name="blimp:intransitive", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="intransitive", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_intransitive_helm = LightevalTaskConfig( - name="blimp:intransitive", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="intransitive", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_irregular_past_participle_adjectives_lighteval = LightevalTaskConfig( - name="blimp:irregular_past_participle_adjectives", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="irregular_past_participle_adjectives", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_irregular_past_participle_adjectives_helm = LightevalTaskConfig( - name="blimp:irregular_past_participle_adjectives", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="irregular_past_participle_adjectives", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_irregular_past_participle_verbs_lighteval = LightevalTaskConfig( - 
name="blimp:irregular_past_participle_verbs", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="irregular_past_participle_verbs", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_irregular_past_participle_verbs_helm = LightevalTaskConfig( - name="blimp:irregular_past_participle_verbs", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="irregular_past_participle_verbs", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_irregular_plural_subject_verb_agreement_1_lighteval = LightevalTaskConfig( - name="blimp:irregular_plural_subject_verb_agreement_1", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="irregular_plural_subject_verb_agreement_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_irregular_plural_subject_verb_agreement_1_helm = LightevalTaskConfig( - name="blimp:irregular_plural_subject_verb_agreement_1", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="irregular_plural_subject_verb_agreement_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_irregular_plural_subject_verb_agreement_2_lighteval = LightevalTaskConfig( - name="blimp:irregular_plural_subject_verb_agreement_2", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="irregular_plural_subject_verb_agreement_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_irregular_plural_subject_verb_agreement_2_helm = LightevalTaskConfig( - name="blimp:irregular_plural_subject_verb_agreement_2", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="irregular_plural_subject_verb_agreement_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_left_branch_island_echo_question_lighteval = LightevalTaskConfig( - name="blimp:left_branch_island_echo_question", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="left_branch_island_echo_question", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) 
-blimp_left_branch_island_echo_question_helm = LightevalTaskConfig( - name="blimp:left_branch_island_echo_question", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="left_branch_island_echo_question", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_left_branch_island_simple_question_lighteval = LightevalTaskConfig( - name="blimp:left_branch_island_simple_question", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="left_branch_island_simple_question", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_left_branch_island_simple_question_helm = LightevalTaskConfig( - name="blimp:left_branch_island_simple_question", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="left_branch_island_simple_question", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_matrix_question_npi_licensor_present_lighteval = LightevalTaskConfig( - name="blimp:matrix_question_npi_licensor_present", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="matrix_question_npi_licensor_present", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_matrix_question_npi_licensor_present_helm = LightevalTaskConfig( - name="blimp:matrix_question_npi_licensor_present", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="matrix_question_npi_licensor_present", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_npi_present_1_lighteval = LightevalTaskConfig( - name="blimp:npi_present_1", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="npi_present_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_npi_present_1_helm = LightevalTaskConfig( - name="blimp:npi_present_1", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="npi_present_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_npi_present_2_lighteval = 
LightevalTaskConfig( - name="blimp:npi_present_2", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="npi_present_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_npi_present_2_helm = LightevalTaskConfig( - name="blimp:npi_present_2", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="npi_present_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_only_npi_licensor_present_lighteval = LightevalTaskConfig( - name="blimp:only_npi_licensor_present", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="only_npi_licensor_present", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_only_npi_licensor_present_helm = LightevalTaskConfig( - name="blimp:only_npi_licensor_present", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="only_npi_licensor_present", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_only_npi_scope_lighteval = LightevalTaskConfig( - name="blimp:only_npi_scope", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="only_npi_scope", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_only_npi_scope_helm = LightevalTaskConfig( - name="blimp:only_npi_scope", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="only_npi_scope", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_passive_1_lighteval = LightevalTaskConfig( - name="blimp:passive_1", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="passive_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_passive_1_helm = LightevalTaskConfig( - name="blimp:passive_1", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="passive_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - 
Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_passive_2_lighteval = LightevalTaskConfig( - name="blimp:passive_2", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="passive_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_passive_2_helm = LightevalTaskConfig( - name="blimp:passive_2", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="passive_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_principle_A_c_command_lighteval = LightevalTaskConfig( - name="blimp:principle_A_c_command", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="principle_A_c_command", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_principle_A_c_command_helm = LightevalTaskConfig( - name="blimp:principle_A_c_command", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="principle_A_c_command", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_principle_A_case_1_lighteval = LightevalTaskConfig( - name="blimp:principle_A_case_1", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="principle_A_case_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_principle_A_case_1_helm = LightevalTaskConfig( - name="blimp:principle_A_case_1", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="principle_A_case_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_principle_A_case_2_lighteval = LightevalTaskConfig( - name="blimp:principle_A_case_2", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="principle_A_case_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_principle_A_case_2_helm = LightevalTaskConfig( - name="blimp:principle_A_case_2", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="principle_A_case_2", - hf_avail_splits=["train"], - 
evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_principle_A_domain_1_lighteval = LightevalTaskConfig( - name="blimp:principle_A_domain_1", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="principle_A_domain_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_principle_A_domain_1_helm = LightevalTaskConfig( - name="blimp:principle_A_domain_1", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="principle_A_domain_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_principle_A_domain_2_lighteval = LightevalTaskConfig( - name="blimp:principle_A_domain_2", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="principle_A_domain_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_principle_A_domain_2_helm = LightevalTaskConfig( - name="blimp:principle_A_domain_2", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="principle_A_domain_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_principle_A_domain_3_lighteval = LightevalTaskConfig( - name="blimp:principle_A_domain_3", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="principle_A_domain_3", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_principle_A_domain_3_helm = LightevalTaskConfig( - name="blimp:principle_A_domain_3", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="principle_A_domain_3", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_principle_A_reconstruction_lighteval = LightevalTaskConfig( - name="blimp:principle_A_reconstruction", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="principle_A_reconstruction", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) 
-blimp_principle_A_reconstruction_helm = LightevalTaskConfig( - name="blimp:principle_A_reconstruction", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="principle_A_reconstruction", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_regular_plural_subject_verb_agreement_1_lighteval = LightevalTaskConfig( - name="blimp:regular_plural_subject_verb_agreement_1", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="regular_plural_subject_verb_agreement_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_regular_plural_subject_verb_agreement_1_helm = LightevalTaskConfig( - name="blimp:regular_plural_subject_verb_agreement_1", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="regular_plural_subject_verb_agreement_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_regular_plural_subject_verb_agreement_2_lighteval = LightevalTaskConfig( - name="blimp:regular_plural_subject_verb_agreement_2", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="regular_plural_subject_verb_agreement_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_regular_plural_subject_verb_agreement_2_helm = LightevalTaskConfig( - name="blimp:regular_plural_subject_verb_agreement_2", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="regular_plural_subject_verb_agreement_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_sentential_negation_npi_licensor_present_lighteval = LightevalTaskConfig( - name="blimp:sentential_negation_npi_licensor_present", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="sentential_negation_npi_licensor_present", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_sentential_negation_npi_licensor_present_helm = LightevalTaskConfig( - name="blimp:sentential_negation_npi_licensor_present", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="sentential_negation_npi_licensor_present", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - 
Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_sentential_negation_npi_scope_lighteval = LightevalTaskConfig( - name="blimp:sentential_negation_npi_scope", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="sentential_negation_npi_scope", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_sentential_negation_npi_scope_helm = LightevalTaskConfig( - name="blimp:sentential_negation_npi_scope", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="sentential_negation_npi_scope", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_sentential_subject_island_lighteval = LightevalTaskConfig( - name="blimp:sentential_subject_island", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="sentential_subject_island", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_sentential_subject_island_helm = LightevalTaskConfig( - name="blimp:sentential_subject_island", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="sentential_subject_island", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_superlative_quantifiers_1_lighteval = LightevalTaskConfig( - name="blimp:superlative_quantifiers_1", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="superlative_quantifiers_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_superlative_quantifiers_1_helm = LightevalTaskConfig( - name="blimp:superlative_quantifiers_1", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="superlative_quantifiers_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_superlative_quantifiers_2_lighteval = LightevalTaskConfig( - name="blimp:superlative_quantifiers_2", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="superlative_quantifiers_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) 
-blimp_superlative_quantifiers_2_helm = LightevalTaskConfig( - name="blimp:superlative_quantifiers_2", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="superlative_quantifiers_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_tough_vs_raising_1_lighteval = LightevalTaskConfig( - name="blimp:tough_vs_raising_1", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="tough_vs_raising_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_tough_vs_raising_1_helm = LightevalTaskConfig( - name="blimp:tough_vs_raising_1", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="tough_vs_raising_1", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_tough_vs_raising_2_lighteval = LightevalTaskConfig( - name="blimp:tough_vs_raising_2", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="tough_vs_raising_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_tough_vs_raising_2_helm = LightevalTaskConfig( - name="blimp:tough_vs_raising_2", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="tough_vs_raising_2", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_transitive_lighteval = LightevalTaskConfig( - name="blimp:transitive", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="transitive", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_transitive_helm = LightevalTaskConfig( - name="blimp:transitive", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="transitive", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_wh_island_lighteval = LightevalTaskConfig( - name="blimp:wh_island", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="wh_island", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - 
few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_wh_island_helm = LightevalTaskConfig( - name="blimp:wh_island", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="wh_island", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_wh_questions_object_gap_lighteval = LightevalTaskConfig( - name="blimp:wh_questions_object_gap", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="wh_questions_object_gap", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_wh_questions_object_gap_helm = LightevalTaskConfig( - name="blimp:wh_questions_object_gap", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="wh_questions_object_gap", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_wh_questions_subject_gap_lighteval = LightevalTaskConfig( - name="blimp:wh_questions_subject_gap", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="wh_questions_subject_gap", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_wh_questions_subject_gap_helm = LightevalTaskConfig( - name="blimp:wh_questions_subject_gap", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="wh_questions_subject_gap", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_wh_questions_subject_gap_long_distance_lighteval = LightevalTaskConfig( - name="blimp:wh_questions_subject_gap_long_distance", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="wh_questions_subject_gap_long_distance", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_wh_questions_subject_gap_long_distance_helm = LightevalTaskConfig( - name="blimp:wh_questions_subject_gap_long_distance", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="wh_questions_subject_gap_long_distance", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - 
stop_sequence=["\n"], - version=0, -) -blimp_wh_vs_that_no_gap_lighteval = LightevalTaskConfig( - name="blimp:wh_vs_that_no_gap", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="wh_vs_that_no_gap", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_wh_vs_that_no_gap_helm = LightevalTaskConfig( - name="blimp:wh_vs_that_no_gap", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="wh_vs_that_no_gap", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_wh_vs_that_no_gap_long_distance_lighteval = LightevalTaskConfig( - name="blimp:wh_vs_that_no_gap_long_distance", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="wh_vs_that_no_gap_long_distance", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_wh_vs_that_no_gap_long_distance_helm = LightevalTaskConfig( - name="blimp:wh_vs_that_no_gap_long_distance", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="wh_vs_that_no_gap_long_distance", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_wh_vs_that_with_gap_lighteval = LightevalTaskConfig( - name="blimp:wh_vs_that_with_gap", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="wh_vs_that_with_gap", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_wh_vs_that_with_gap_helm = LightevalTaskConfig( - name="blimp:wh_vs_that_with_gap", - suite=["helm", "blimp"], - prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="wh_vs_that_with_gap", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -blimp_wh_vs_that_with_gap_long_distance_lighteval = LightevalTaskConfig( - name="blimp:wh_vs_that_with_gap_long_distance", - suite=["lighteval", "blimp"], - prompt_function=prompt.blimp, - hf_repo="blimp", - hf_subset="wh_vs_that_with_gap_long_distance", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -blimp_wh_vs_that_with_gap_long_distance_helm = LightevalTaskConfig( - name="blimp:wh_vs_that_with_gap_long_distance", - suite=["helm", "blimp"], - 
prompt_function=prompt.blimp_helm, - hf_repo="blimp", - hf_subset="wh_vs_that_with_gap_long_distance", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -bold_helm = LightevalTaskConfig( - name="bold", - suite=["helm"], - prompt_function=prompt.bold, - hf_repo="lighteval/bold_helm", - hf_subset="all", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.prediction_perplexity], - stop_sequence=["\n"], - version=0, -) -bold_gender_helm = LightevalTaskConfig( - name="bold:gender", - suite=["helm"], - prompt_function=prompt.bold, - hf_repo="lighteval/bold_helm", - hf_subset="gender", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.prediction_perplexity], - stop_sequence=["\n"], - version=0, -) -bold_political_ideology_helm = LightevalTaskConfig( - name="bold:political_ideology", - suite=["helm"], - prompt_function=prompt.bold, - hf_repo="lighteval/bold_helm", - hf_subset="political_ideology", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.prediction_perplexity], - stop_sequence=["\n"], - version=0, -) -bold_profession_helm = LightevalTaskConfig( - name="bold:profession", - suite=["helm"], - prompt_function=prompt.bold, - hf_repo="lighteval/bold_helm", - hf_subset="profession", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.prediction_perplexity], - stop_sequence=["\n"], - version=0, -) -bold_race_helm = LightevalTaskConfig( - name="bold:race", - suite=["helm"], - prompt_function=prompt.bold, - hf_repo="lighteval/bold_helm", - hf_subset="race", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.prediction_perplexity], - stop_sequence=["\n"], - version=0, -) -bold_religious_ideology_helm = LightevalTaskConfig( - name="bold:religious_ideology", - suite=["helm"], - prompt_function=prompt.bold, - hf_repo="lighteval/bold_helm", - hf_subset="religious_ideology", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.prediction_perplexity], - stop_sequence=["\n"], - version=0, -) -boolq_helm = LightevalTaskConfig( - name="boolq", - suite=["helm", "helm_general"], - prompt_function=prompt.boolq_helm, - hf_repo="lighteval/boolq_helm", - hf_subset="default", - hf_avail_splits=["train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) 
-boolq_contrastset_helm = LightevalTaskConfig( - name="boolq:contrastset", - suite=["helm"], - prompt_function=prompt.boolq_helm_contrastset, - hf_repo="lighteval/boolq_helm", - hf_subset="default", - hf_avail_splits=["validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -bridging_anaphora_resolution_barqa_bigbench = LightevalTaskConfig( - name="bridging_anaphora_resolution_barqa", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="bridging_anaphora_resolution_barqa", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -causal_judgment_bigbench = LightevalTaskConfig( - name="causal_judgment", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="causal_judgment", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -cause_and_effect_bigbench = LightevalTaskConfig( - name="cause_and_effect", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="cause_and_effect", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -checkmate_in_one_bigbench = LightevalTaskConfig( - name="checkmate_in_one", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="checkmate_in_one", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -chess_state_tracking_bigbench = LightevalTaskConfig( - name="chess_state_tracking", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="chess_state_tracking", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -chinese_remainder_theorem_bigbench = LightevalTaskConfig( - name="chinese_remainder_theorem", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="chinese_remainder_theorem", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - 
few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -cifar10_classification_bigbench = LightevalTaskConfig( - name="cifar10_classification", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="cifar10_classification", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -civil_comments_helm = LightevalTaskConfig( - name="civil_comments", - suite=["helm", "helm_general"], - prompt_function=prompt.civil_comments, - hf_repo="lighteval/civil_comments_helm", - hf_subset="all", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -civil_comments_LGBTQ_helm = LightevalTaskConfig( - name="civil_comments:LGBTQ", - suite=["helm"], - prompt_function=prompt.civil_comments, - hf_repo="lighteval/civil_comments_helm", - hf_subset="LGBTQ", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -civil_comments_black_helm = LightevalTaskConfig( - name="civil_comments:black", - suite=["helm"], - prompt_function=prompt.civil_comments, - hf_repo="lighteval/civil_comments_helm", - hf_subset="black", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -civil_comments_christian_helm = LightevalTaskConfig( - name="civil_comments:christian", - suite=["helm"], - prompt_function=prompt.civil_comments, - hf_repo="lighteval/civil_comments_helm", - hf_subset="christian", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, 
"normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -civil_comments_female_helm = LightevalTaskConfig( - name="civil_comments:female", - suite=["helm"], - prompt_function=prompt.civil_comments, - hf_repo="lighteval/civil_comments_helm", - hf_subset="female", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -civil_comments_male_helm = LightevalTaskConfig( - name="civil_comments:male", - suite=["helm"], - prompt_function=prompt.civil_comments, - hf_repo="lighteval/civil_comments_helm", - hf_subset="male", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -civil_comments_muslim_helm = LightevalTaskConfig( - name="civil_comments:muslim", - suite=["helm"], - prompt_function=prompt.civil_comments, - hf_repo="lighteval/civil_comments_helm", - hf_subset="muslim", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -civil_comments_other_religions_helm = LightevalTaskConfig( - name="civil_comments:other_religions", - suite=["helm"], - prompt_function=prompt.civil_comments, - hf_repo="lighteval/civil_comments_helm", - hf_subset="other_religions", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - 
Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -civil_comments_white_helm = LightevalTaskConfig( - name="civil_comments:white", - suite=["helm"], - prompt_function=prompt.civil_comments, - hf_repo="lighteval/civil_comments_helm", - hf_subset="white", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -code_line_description_bigbench_lite = LightevalTaskConfig( - name="code_line_description", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_linefeed_before_and_after_query, - hf_repo="tasksource/bigbench", - hf_subset="code_line_description", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -codenames_bigbench = LightevalTaskConfig( - name="codenames", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="codenames", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.rouge_t5, Metrics.bleu], - stop_sequence=["\n"], - version=0, -) -color_bigbench = LightevalTaskConfig( - name="color", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="color", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.rouge_t5, - Metrics.bleu, - Metrics.loglikelihood_acc, - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["\n"], - version=0, -) -common_morpheme_bigbench = LightevalTaskConfig( - name="common_morpheme", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="common_morpheme", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -commonsenseqa_helm = LightevalTaskConfig( - name="commonsenseqa", - suite=["helm", "commonsense_scenario"], - prompt_function=prompt.commonsense_qa, - hf_repo="commonsense_qa", - hf_subset="default", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - 
} - ), - ], - stop_sequence=["\n"], - version=0, -) -conceptual_combinations_bigbench_lite = LightevalTaskConfig( - name="conceptual_combinations", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="conceptual_combinations", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -conlang_translation_bigbench_lite = LightevalTaskConfig( - name="conlang_translation", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="conlang_translation", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=[".", ";", "!", "?"], - version=0, -) -contextual_parametric_knowledge_conflicts_bigbench = LightevalTaskConfig( - name="contextual_parametric_knowledge_conflicts", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="contextual_parametric_knowledge_conflicts", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.rouge_t5, Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -copyright_n_books_1000_extractions_per_book_1_prefix_length_125_helm = LightevalTaskConfig( - name="copyright:n_books_1000-extractions_per_book_1-prefix_length_125", - suite=["helm", "copyright_scenario"], - prompt_function=prompt.copyright, - hf_repo="lighteval/copyright_helm", - hf_subset="n_books_1000-extractions_per_book_1-prefix_length_125", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.copyright], - stop_sequence=["\n"], - version=0, -) -copyright_n_books_1000_extractions_per_book_1_prefix_length_25_helm = LightevalTaskConfig( - name="copyright:n_books_1000-extractions_per_book_1-prefix_length_25", - suite=["helm", "copyright_scenario"], - prompt_function=prompt.copyright, - hf_repo="lighteval/copyright_helm", - hf_subset="n_books_1000-extractions_per_book_1-prefix_length_25", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.copyright], - stop_sequence=["\n"], - version=0, -) -copyright_n_books_1000_extractions_per_book_1_prefix_length_5_helm = LightevalTaskConfig( - name="copyright:n_books_1000-extractions_per_book_1-prefix_length_5", - suite=["helm", "copyright_scenario"], - prompt_function=prompt.copyright, - hf_repo="lighteval/copyright_helm", - hf_subset="n_books_1000-extractions_per_book_1-prefix_length_5", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.copyright], - stop_sequence=["\n"], - version=0, -) -copyright_n_books_1000_extractions_per_book_3_prefix_length_125_helm = LightevalTaskConfig( - 
name="copyright:n_books_1000-extractions_per_book_3-prefix_length_125", - suite=["helm", "copyright_scenario"], - prompt_function=prompt.copyright, - hf_repo="lighteval/copyright_helm", - hf_subset="n_books_1000-extractions_per_book_3-prefix_length_125", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.copyright], - stop_sequence=["\n"], - version=0, -) -copyright_n_books_1000_extractions_per_book_3_prefix_length_25_helm = LightevalTaskConfig( - name="copyright:n_books_1000-extractions_per_book_3-prefix_length_25", - suite=["helm", "copyright_scenario"], - prompt_function=prompt.copyright, - hf_repo="lighteval/copyright_helm", - hf_subset="n_books_1000-extractions_per_book_3-prefix_length_25", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.copyright], - stop_sequence=["\n"], - version=0, -) -copyright_n_books_1000_extractions_per_book_3_prefix_length_5_helm = LightevalTaskConfig( - name="copyright:n_books_1000-extractions_per_book_3-prefix_length_5", - suite=["helm", "copyright_scenario"], - prompt_function=prompt.copyright, - hf_repo="lighteval/copyright_helm", - hf_subset="n_books_1000-extractions_per_book_3-prefix_length_5", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.copyright], - stop_sequence=["\n"], - version=0, -) -copyright_oh_the_places_helm = LightevalTaskConfig( - name="copyright:oh_the_places", - suite=["helm", "copyright_scenario"], - prompt_function=prompt.copyright, - hf_repo="lighteval/copyright_helm", - hf_subset="oh_the_places", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.copyright], - stop_sequence=["\n"], - version=0, -) -copyright_pilot_helm = LightevalTaskConfig( - name="copyright:pilot", - suite=["helm", "copyright_scenario"], - prompt_function=prompt.copyright, - hf_repo="lighteval/copyright_helm", - hf_subset="pilot", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.copyright], - stop_sequence=["\n"], - version=0, -) -copyright_popular_books_prefix_length_10_helm = LightevalTaskConfig( - name="copyright:popular_books-prefix_length_10", - suite=["helm", "copyright_scenario"], - prompt_function=prompt.copyright, - hf_repo="lighteval/copyright_helm", - hf_subset="popular_books-prefix_length_10", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.copyright], - stop_sequence=["\n"], - version=0, -) -copyright_popular_books_prefix_length_125_helm = LightevalTaskConfig( - name="copyright:popular_books-prefix_length_125", - suite=["helm", "copyright_scenario"], - prompt_function=prompt.copyright, - hf_repo="lighteval/copyright_helm", - hf_subset="popular_books-prefix_length_125", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.copyright], - stop_sequence=["\n"], - version=0, -) -copyright_popular_books_prefix_length_25_helm = LightevalTaskConfig( - name="copyright:popular_books-prefix_length_25", - suite=["helm", "copyright_scenario"], - 
prompt_function=prompt.copyright, - hf_repo="lighteval/copyright_helm", - hf_subset="popular_books-prefix_length_25", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.copyright], - stop_sequence=["\n"], - version=0, -) -copyright_popular_books_prefix_length_250_helm = LightevalTaskConfig( - name="copyright:popular_books-prefix_length_250", - suite=["helm", "copyright_scenario"], - prompt_function=prompt.copyright, - hf_repo="lighteval/copyright_helm", - hf_subset="popular_books-prefix_length_250", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.copyright], - stop_sequence=["\n"], - version=0, -) -copyright_popular_books_prefix_length_5_helm = LightevalTaskConfig( - name="copyright:popular_books-prefix_length_5", - suite=["helm", "copyright_scenario"], - prompt_function=prompt.copyright, - hf_repo="lighteval/copyright_helm", - hf_subset="popular_books-prefix_length_5", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.copyright], - stop_sequence=["\n"], - version=0, -) -copyright_popular_books_prefix_length_50_helm = LightevalTaskConfig( - name="copyright:popular_books-prefix_length_50", - suite=["helm", "copyright_scenario"], - prompt_function=prompt.copyright, - hf_repo="lighteval/copyright_helm", - hf_subset="popular_books-prefix_length_50", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.copyright], - stop_sequence=["\n"], - version=0, -) -copyright_prompt_num_line_1_min_lines_20_helm = LightevalTaskConfig( - name="copyright:prompt_num_line_1-min_lines_20", - suite=["helm", "copyright_scenario"], - prompt_function=prompt.copyright, - hf_repo="lighteval/copyright_helm", - hf_subset="prompt_num_line_1-min_lines_20", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.copyright], - stop_sequence=["\n"], - version=0, -) -copyright_prompt_num_line_10_min_lines_20_helm = LightevalTaskConfig( - name="copyright:prompt_num_line_10-min_lines_20", - suite=["helm", "copyright_scenario"], - prompt_function=prompt.copyright, - hf_repo="lighteval/copyright_helm", - hf_subset="prompt_num_line_10-min_lines_20", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.copyright], - stop_sequence=["\n"], - version=0, -) -copyright_prompt_num_line_5_min_lines_20_helm = LightevalTaskConfig( - name="copyright:prompt_num_line_5-min_lines_20", - suite=["helm", "copyright_scenario"], - prompt_function=prompt.copyright, - hf_repo="lighteval/copyright_helm", - hf_subset="prompt_num_line_5-min_lines_20", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.copyright], - stop_sequence=["\n"], - version=0, -) -coqa_first_question = LightevalTaskConfig( - name="coqa", - prompt_function=get_qa_prompt_function( - Language.ENGLISH, - lambda line: { - "question": line["questions"][0], - "context": line["story"], - "choices": [line["answers"]["input_text"][0]], - }, - ), - suite=["lighteval"], - hf_repo="stanfordnlp/coqa", - 
hf_subset="default", - hf_avail_splits=["train", "validation"], - evaluation_splits=["validation"], - stop_sequence=["\n", "Question:", "question:"], - generation_size=100, - version=1, - metrics=( - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ), -) -coqa_bb_lighteval = LightevalTaskConfig( - name="coqa_bb", - suite=["lighteval", "bigbench_programmatic", "bigbench"], - prompt_function=prompt.coqa, - hf_repo="coqa", - hf_subset="default", - hf_avail_splits=["train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=10, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False}), Metrics.f1_score], - stop_sequence=["\n"], - version=0, -) -covid_dialogue_helm = LightevalTaskConfig( - name="covid_dialogue", - suite=["helm"], - prompt_function=prompt.covid_dialogue, - hf_repo="lighteval/covid_dialogue", - hf_subset="default", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.rougeL, - Metrics.bleu_1, - Metrics.bleu_4, - ], - stop_sequence=["\n"], - version=0, -) -crash_blossom_bigbench = LightevalTaskConfig( - name="crash_blossom", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="crash_blossom", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -crass_ai_bigbench = LightevalTaskConfig( - name="crass_ai", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="crass_ai", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -cryobiology_spanish_bigbench = LightevalTaskConfig( - name="cryobiology_spanish", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="cryobiology_spanish", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -cryptonite_bigbench = LightevalTaskConfig( - name="cryptonite", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="cryptonite", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -cs_algorithms_bigbench = LightevalTaskConfig( - name="cs_algorithms", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - 
hf_subset="cs_algorithms", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -dark_humor_detection_bigbench = LightevalTaskConfig( - name="dark_humor_detection", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="dark_humor_detection", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -date_understanding_bigbench = LightevalTaskConfig( - name="date_understanding", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="date_understanding", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -disambiguation_qa_bigbench = LightevalTaskConfig( - name="disambiguation_qa", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="disambiguation_qa", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -discourse_marker_prediction_bigbench = LightevalTaskConfig( - name="discourse_marker_prediction", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="discourse_marker_prediction", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -disfl_qa_bigbench = LightevalTaskConfig( - name="disfl_qa", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="disfl_qa", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -drop_qa = LightevalTaskConfig( - name="drop", - prompt_function=get_qa_prompt_function( - Language.ENGLISH, - lambda line: { - "context": line["passage"], - "question": line["question"], - "choices": list( - filter( - lambda x: x, - [line["answer"].get("number")] - + line["answer"]["spans"] - + [prompt.get_drop_date(line["answer"].get("date"))], - ) - ), - }, - ), - suite=("lighteval",), - hf_repo="lighteval/drop_harness", - hf_subset="default", - hf_filter=lambda line: list( - filter( - lambda x: x, - [line["answer"].get("number")] - + line["answer"]["spans"] - + [prompt.get_drop_date(line["answer"].get("date"))], - ) - ), - evaluation_splits=("validation",), - few_shots_split="train", - generation_size=250, - stop_sequence=["Question:", "question:", "\n"], - metrics=( - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - 
Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ), - version=1, -) -dyck_language_2_helm = LightevalTaskConfig( - name="dyck_language:2", - suite=["helm"], - prompt_function=prompt.dyck_language, - hf_repo="lighteval/DyckLanguage", - hf_subset="2", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[Metrics.exact_match], - stop_sequence=["\n"], - version=0, -) -dyck_language_3_helm = LightevalTaskConfig( - name="dyck_language:3", - suite=["helm"], - prompt_function=prompt.dyck_language, - hf_repo="lighteval/DyckLanguage", - hf_subset="3", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[Metrics.exact_match], - stop_sequence=["\n"], - version=0, -) -dyck_language_4_helm = LightevalTaskConfig( - name="dyck_language:4", - suite=["helm"], - prompt_function=prompt.dyck_language, - hf_repo="lighteval/DyckLanguage", - hf_subset="4", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[Metrics.exact_match], - stop_sequence=["\n"], - version=0, -) -dyck_languages_bigbench = LightevalTaskConfig( - name="dyck_languages", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="dyck_languages", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -elementary_math_qa_bigbench = LightevalTaskConfig( - name="elementary_math_qa", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="elementary_math_qa", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -emoji_movie_bigbench_lite = LightevalTaskConfig( - name="emoji_movie", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="emoji_movie", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.rouge_t5, - Metrics.bleu, - Metrics.loglikelihood_acc, - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["\n"], - version=0, -) -emojis_emotion_prediction_bigbench = LightevalTaskConfig( - name="emojis_emotion_prediction", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="emojis_emotion_prediction", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -empirical_judgments_bigbench = LightevalTaskConfig( - name="empirical_judgments", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="empirical_judgments", - hf_avail_splits=["default", "train", 
"validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -english_proverbs_bigbench = LightevalTaskConfig( - name="english_proverbs", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="english_proverbs", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -english_russian_proverbs_bigbench = LightevalTaskConfig( - name="english_russian_proverbs", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="english_russian_proverbs", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -entailed_polarity_bigbench = LightevalTaskConfig( - name="entailed_polarity", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="entailed_polarity", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -entailed_polarity_hindi_bigbench = LightevalTaskConfig( - name="entailed_polarity_hindi", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="entailed_polarity_hindi", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -entity_data_imputation_Buy_helm = LightevalTaskConfig( - name="entity_data_imputation:Buy", - suite=["helm"], - prompt_function=prompt.entity_data_imputation, - hf_repo="lighteval/Buy", - hf_subset="default", - hf_avail_splits=["train", "test", "valid"], - evaluation_splits=["valid", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -entity_data_imputation_Restaurant_helm = LightevalTaskConfig( - name="entity_data_imputation:Restaurant", - suite=["helm"], - prompt_function=prompt.entity_data_imputation, - hf_repo="lighteval/Restaurant", - hf_subset="default", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - 
} - ), - ], - stop_sequence=["\n"], - version=0, -) -entity_matching_Abt_Buy_helm = LightevalTaskConfig( - name="entity_matching:Abt_Buy", - suite=["helm"], - prompt_function=prompt.entity_matching, - hf_repo="lighteval/EntityMatching", - hf_subset="Abt_Buy", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -entity_matching_Amazon_Google_helm = LightevalTaskConfig( - name="entity_matching:Amazon_Google", - suite=["helm"], - prompt_function=prompt.entity_matching, - hf_repo="lighteval/EntityMatching", - hf_subset="Amazon_Google", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -entity_matching_Beer_helm = LightevalTaskConfig( - name="entity_matching:Beer", - suite=["helm"], - prompt_function=prompt.entity_matching, - hf_repo="lighteval/EntityMatching", - hf_subset="Beer", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -entity_matching_Company_helm = LightevalTaskConfig( - name="entity_matching:Company", - suite=["helm"], - prompt_function=prompt.entity_matching, - hf_repo="lighteval/EntityMatching", - hf_subset="Company", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -entity_matching_DBLP_ACM_helm = LightevalTaskConfig( - name="entity_matching:DBLP_ACM", - suite=["helm"], - prompt_function=prompt.entity_matching, - hf_repo="lighteval/EntityMatching", - hf_subset="DBLP_ACM", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - 
few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -entity_matching_DBLP_GoogleScholar_helm = LightevalTaskConfig( - name="entity_matching:DBLP_GoogleScholar", - suite=["helm"], - prompt_function=prompt.entity_matching, - hf_repo="lighteval/EntityMatching", - hf_subset="DBLP_GoogleScholar", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -entity_matching_Dirty_DBLP_ACM_helm = LightevalTaskConfig( - name="entity_matching:Dirty_DBLP_ACM", - suite=["helm"], - prompt_function=prompt.entity_matching, - hf_repo="lighteval/EntityMatching", - hf_subset="Dirty_DBLP_ACM", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -entity_matching_Dirty_DBLP_GoogleScholar_helm = LightevalTaskConfig( - name="entity_matching:Dirty_DBLP_GoogleScholar", - suite=["helm"], - prompt_function=prompt.entity_matching, - hf_repo="lighteval/EntityMatching", - hf_subset="Dirty_DBLP_GoogleScholar", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -entity_matching_Dirty_Walmart_Amazon_helm = LightevalTaskConfig( - name="entity_matching:Dirty_Walmart_Amazon", - suite=["helm"], - prompt_function=prompt.entity_matching, - hf_repo="lighteval/EntityMatching", - hf_subset="Dirty_Walmart_Amazon", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - 
Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -entity_matching_Dirty_iTunes_Amazon_helm = LightevalTaskConfig( - name="entity_matching:Dirty_iTunes_Amazon", - suite=["helm"], - prompt_function=prompt.entity_matching, - hf_repo="lighteval/EntityMatching", - hf_subset="Dirty_iTunes_Amazon", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -entity_matching_Fodors_Zagats_helm = LightevalTaskConfig( - name="entity_matching:Fodors_Zagats", - suite=["helm"], - prompt_function=prompt.entity_matching, - hf_repo="lighteval/EntityMatching", - hf_subset="Fodors_Zagats", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -entity_matching_Walmart_Amazon_helm = LightevalTaskConfig( - name="entity_matching:Walmart_Amazon", - suite=["helm"], - prompt_function=prompt.entity_matching, - hf_repo="lighteval/EntityMatching", - hf_subset="Walmart_Amazon", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -entity_matching_iTunes_Amazon_helm = LightevalTaskConfig( - name="entity_matching:iTunes_Amazon", - suite=["helm"], - prompt_function=prompt.entity_matching, - hf_repo="lighteval/EntityMatching", - hf_subset="iTunes_Amazon", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -epistemic_reasoning_bigbench = LightevalTaskConfig( - name="epistemic_reasoning", - suite=["bigbench", 
"bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="epistemic_reasoning", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -ethics_commonsense_lighteval = LightevalTaskConfig( - name="ethics:commonsense", - suite=["lighteval", "ethics"], - prompt_function=prompt.ethics_commonsense, - hf_repo="lighteval/hendrycks_ethics", - hf_subset="commonsense", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -ethics_deontology_lighteval = LightevalTaskConfig( - name="ethics:deontology", - suite=["lighteval", "ethics"], - prompt_function=prompt.ethics_deontology, - hf_repo="lighteval/hendrycks_ethics", - hf_subset="deontology", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -ethics_justice_lighteval = LightevalTaskConfig( - name="ethics:justice", - suite=["lighteval", "ethics"], - prompt_function=prompt.ethics_justice, - hf_repo="lighteval/hendrycks_ethics", - hf_subset="justice", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -ethics_utilitarianism_lighteval = LightevalTaskConfig( - name="ethics:utilitarianism", - suite=["lighteval", "ethics"], - prompt_function=prompt.ethics_utilitarianism, - hf_repo="lighteval/hendrycks_ethics", - hf_subset="utilitarianism", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -ethics_virtue_lighteval = LightevalTaskConfig( - name="ethics:virtue", - suite=["lighteval", "ethics"], - prompt_function=prompt.ethics_virtue, - hf_repo="lighteval/hendrycks_ethics", - hf_subset="virtue", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -evaluating_information_essentiality_bigbench = LightevalTaskConfig( - name="evaluating_information_essentiality", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="evaluating_information_essentiality", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -fact_checker_bigbench = LightevalTaskConfig( - name="fact_checker", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="fact_checker", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - 
version=0, -) -fantasy_reasoning_bigbench = LightevalTaskConfig( - name="fantasy_reasoning", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="fantasy_reasoning", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -few_shot_nlg_bigbench = LightevalTaskConfig( - name="few_shot_nlg", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="few_shot_nlg", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.bleu, Metrics.bleurt], - stop_sequence=["\n"], - version=0, -) -figure_of_speech_detection_bigbench = LightevalTaskConfig( - name="figure_of_speech_detection", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="figure_of_speech_detection", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -formal_fallacies_syllogisms_negation_bigbench_lite = LightevalTaskConfig( - name="formal_fallacies_syllogisms_negation", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="formal_fallacies_syllogisms_negation", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -gem_bigbench = LightevalTaskConfig( - name="gem", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="gem", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5], - stop_sequence=["\n"], - version=0, -) -gender_inclusive_sentences_german_bigbench = LightevalTaskConfig( - name="gender_inclusive_sentences_german", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="gender_inclusive_sentences_german", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -general_knowledge_bigbench = LightevalTaskConfig( - name="general_knowledge", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="general_knowledge", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -geometric_shapes_bigbench = LightevalTaskConfig( - name="geometric_shapes", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - 
hf_repo="tasksource/bigbench", - hf_subset="geometric_shapes", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.rouge_t5, - Metrics.bleu, - Metrics.loglikelihood_acc, - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["\n"], - version=0, -) -glue_cola_lighteval = LightevalTaskConfig( - name="glue:cola", - suite=["lighteval", "glue"], - prompt_function=prompt.cola, - hf_repo="glue", - hf_subset="cola", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.mcc], - stop_sequence=["\n"], - version=0, -) -glue_mnli_lighteval = LightevalTaskConfig( - name="glue:mnli", - suite=["lighteval", "glue"], - prompt_function=prompt.mnli, - hf_repo="glue", - hf_subset="mnli_matched", - hf_avail_splits=["train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -glue_mnli_mismatched_lighteval = LightevalTaskConfig( - name="glue:mnli_mismatched", - suite=["lighteval", "glue"], - prompt_function=prompt.mnli, - hf_repo="glue", - hf_subset="mnli_mismatched", - hf_avail_splits=["train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -glue_mrpc_lighteval = LightevalTaskConfig( - name="glue:mrpc", - suite=["lighteval", "glue"], - prompt_function=prompt.mrpc, - hf_repo="glue", - hf_subset="mrpc", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_f1], - stop_sequence=["\n"], - version=0, -) -glue_qnli_lighteval = LightevalTaskConfig( - name="glue:qnli", - suite=["lighteval", "glue"], - prompt_function=prompt.qnli, - hf_repo="glue", - hf_subset="qnli", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -glue_qqp_lighteval = LightevalTaskConfig( - name="glue:qqp", - suite=["lighteval", "glue"], - prompt_function=prompt.qqp, - hf_repo="glue", - hf_subset="qqp", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_f1], - stop_sequence=["\n"], - version=0, -) -glue_rte_lighteval = LightevalTaskConfig( - name="glue:rte", - suite=["lighteval", "glue"], - prompt_function=prompt.rte, - hf_repo="glue", - hf_subset="rte", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -glue_sst2_lighteval = LightevalTaskConfig( - name="glue:sst2", - suite=["lighteval", "glue"], - prompt_function=prompt.sst, - hf_repo="glue", - hf_subset="sst2", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["validation"], - 
few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -glue_stsb_lighteval = LightevalTaskConfig( - name="glue:stsb", - suite=["lighteval", "glue"], - prompt_function=prompt.stsb, - hf_repo="glue", - hf_subset="stsb", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -glue_wnli_lighteval = LightevalTaskConfig( - name="glue:wnli", - suite=["lighteval", "glue"], - prompt_function=prompt.wnli, - hf_repo="glue", - hf_subset="wnli", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -goal_step_wikihow_bigbench = LightevalTaskConfig( - name="goal_step_wikihow", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="goal_step_wikihow", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -gpqa_lighteval = LightevalTaskConfig( - name="gpqa:mc", - suite=["lighteval"], - prompt_function=prompt.gpqa, - hf_repo="Idavidrein/gpqa", - hf_subset="gpqa_main", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -gpqa_diamond_instruct_lighteval = LightevalTaskConfig( - name="gpqa:diamond", - suite=["lighteval"], - prompt_function=prompt.gpqa_instruct, - hf_repo="Idavidrein/gpqa", - hf_subset="gpqa_diamond", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=32768, # needed for reasoning models like R1 - metrics=[Metrics.gpqa_instruct_pass_at_k(sample_params={"k": 1})], - stop_sequence=[], # no stop sequence, will use eos token - version=1, -) -gpqa_extended_instruct_lighteval = LightevalTaskConfig( - name="gpqa:extended", - suite=["lighteval"], - prompt_function=prompt.gpqa_instruct, - hf_repo="Idavidrein/gpqa", - hf_subset="gpqa_extended", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=32768, # needed for reasoning models like R1 - metrics=[Metrics.gpqa_instruct_metric], - stop_sequence=[], # no stop sequence, will use eos token - version=0, -) -gpqa_main_instruct_lighteval = LightevalTaskConfig( - name="gpqa:main", - suite=["lighteval"], - prompt_function=prompt.gpqa_instruct, - hf_repo="Idavidrein/gpqa", - hf_subset="gpqa_main", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=32768, # needed for reasoning models like R1 - metrics=[Metrics.gpqa_instruct_metric], - stop_sequence=[], # no stop sequence, will use eos token - version=0, -) -gre_reading_comprehension_bigbench = LightevalTaskConfig( - name="gre_reading_comprehension", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="gre_reading_comprehension", - hf_avail_splits=["default", 
"train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -gsm_plus = LightevalTaskConfig( - name="gsm_plus", - suite=["lighteval"], - prompt_function=prompt.gsm_plus, - hf_repo="qintongli/GSM-Plus", - hf_subset="default", - hf_avail_splits=["test", "testmini"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.expr_gold_metric], - stop_sequence=None, - version=0, -) -gsm8k_leaderboard = LightevalTaskConfig( - name="gsm8k", - suite=["leaderboard"], - prompt_function=prompt.gsm8k, - hf_repo="gsm8k", - hf_subset="main", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling_from_train", - generation_size=256, - metrics=[ - Metrics.exact_match(sample_params={"normalize_gold": gsm8k_normalizer, "normalize_pred": gsm8k_normalizer}) - ], - stop_sequence=[], - version=0, -) -gsm8k_lighteval = LightevalTaskConfig( - name="gsm8k", - suite=["lighteval"], - prompt_function=prompt.gsm8k, - hf_repo="openai/gsm8k", - hf_subset="main", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling_from_train", - generation_size=256, - metrics=[ - Metrics.expr_gold_metric, - ], - stop_sequence=["Question:"], - version=0, -) -headqa_en_lighteval = LightevalTaskConfig( - name="headqa:en", - suite=["lighteval", "headqa"], - prompt_function=prompt.headqa, - hf_repo="lighteval/headqa_harness", - hf_subset="en", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=["\n"], - version=0, -) -headqa_es_lighteval = LightevalTaskConfig( - name="headqa:es", - suite=["lighteval", "headqa"], - prompt_function=prompt.headqa, - hf_repo="lighteval/headqa_harness", - hf_subset="es", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=["\n"], - version=0, -) -hellaswag_leaderboard = LightevalTaskConfig( - name="hellaswag", - suite=["leaderboard"], - prompt_function=prompt.hellaswag_harness, - hf_repo="hellaswag", - hf_subset="default", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select="random_sampling_from_train", - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -hellaswag_generative = LightevalTaskConfig( - name="hellaswag", - suite=["helm", "helm_general"], - prompt_function=prompt.hellaswag_generative, - hf_repo="hellaswag", - hf_subset="default", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, 
"normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -hhh_alignment_bigbench = LightevalTaskConfig( - name="hhh_alignment", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="hhh_alignment", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -hindi_question_answering_bigbench = LightevalTaskConfig( - name="hindi_question_answering", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="hindi_question_answering", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -hindu_knowledge_bigbench_lite = LightevalTaskConfig( - name="hindu_knowledge", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="hindu_knowledge", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -hinglish_toxicity_bigbench = LightevalTaskConfig( - name="hinglish_toxicity", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="hinglish_toxicity", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -human_organs_senses_bigbench = LightevalTaskConfig( - name="human_organs_senses", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="human_organs_senses", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -hyperbaton_bigbench = LightevalTaskConfig( - name="hyperbaton", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="hyperbaton", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -identify_math_theorems_bigbench = LightevalTaskConfig( - name="identify_math_theorems", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="identify_math_theorems", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - 
metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -identify_odd_metaphor_bigbench = LightevalTaskConfig( - name="identify_odd_metaphor", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="identify_odd_metaphor", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -imdb_helm = LightevalTaskConfig( - name="imdb", - suite=["helm", "helm_general"], - prompt_function=prompt.imdb, - hf_repo="lighteval/IMDB_helm", - hf_subset="default", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -imdb_contrastset_helm = LightevalTaskConfig( - name="imdb:contrastset", - suite=["helm"], - prompt_function=prompt.imdb_contrastset, - hf_repo="lighteval/IMDB_helm", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -implicatures_bigbench = LightevalTaskConfig( - name="implicatures", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="implicatures", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -implicit_relations_bigbench = LightevalTaskConfig( - name="implicit_relations", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="implicit_relations", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -intent_recognition_bigbench = LightevalTaskConfig( - name="intent_recognition", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="intent_recognition", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -interactive_qa_mmlu_abstract_algebra_helm = LightevalTaskConfig( - 
name="interactive_qa_mmlu:abstract_algebra", - suite=["helm", "interactive_qa_mmlu_scenario"], - prompt_function=prompt.mmlu_qa_abstract_algebra, - hf_repo="lighteval/mmlu", - hf_subset="abstract_algebra", - hf_avail_splits=["dev", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -interactive_qa_mmlu_college_chemistry_helm = LightevalTaskConfig( - name="interactive_qa_mmlu:college_chemistry", - suite=["helm", "interactive_qa_mmlu_scenario"], - prompt_function=prompt.mmlu_qa_college_chemistry, - hf_repo="lighteval/mmlu", - hf_subset="college_chemistry", - hf_avail_splits=["dev", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -interactive_qa_mmlu_global_facts_helm = LightevalTaskConfig( - name="interactive_qa_mmlu:global_facts", - suite=["helm", "interactive_qa_mmlu_scenario"], - prompt_function=prompt.mmlu_qa_global_facts, - hf_repo="lighteval/mmlu", - hf_subset="global_facts", - hf_avail_splits=["dev", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -interactive_qa_mmlu_miscellaneous_helm = LightevalTaskConfig( - name="interactive_qa_mmlu:miscellaneous", - suite=["helm", "interactive_qa_mmlu_scenario"], - prompt_function=prompt.mmlu_qa_miscellaneous, - hf_repo="lighteval/mmlu", - hf_subset="miscellaneous", - hf_avail_splits=["dev", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -interactive_qa_mmlu_nutrition_helm = LightevalTaskConfig( - name="interactive_qa_mmlu:nutrition", - suite=["helm", "interactive_qa_mmlu_scenario"], - prompt_function=prompt.mmlu_qa_nutrition, - hf_repo="lighteval/mmlu", - hf_subset="nutrition", - hf_avail_splits=["dev", "test"], - evaluation_splits=["test"], - few_shots_split=None, - 
few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -interactive_qa_mmlu_us_foreign_policy_helm = LightevalTaskConfig( - name="interactive_qa_mmlu:us_foreign_policy", - suite=["helm", "interactive_qa_mmlu_scenario"], - prompt_function=prompt.mmlu_qa_us_foreign_policy, - hf_repo="lighteval/mmlu", - hf_subset="us_foreign_policy", - hf_avail_splits=["dev", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -international_phonetic_alphabet_nli_bigbench = LightevalTaskConfig( - name="international_phonetic_alphabet_nli", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="international_phonetic_alphabet_nli", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -international_phonetic_alphabet_transliterate_bigbench = LightevalTaskConfig( - name="international_phonetic_alphabet_transliterate", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="international_phonetic_alphabet_transliterate", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -intersect_geometry_bigbench = LightevalTaskConfig( - name="intersect_geometry", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="intersect_geometry", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -irony_identification_bigbench = LightevalTaskConfig( - name="irony_identification", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="irony_identification", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -iwslt17_ar_en_lighteval = LightevalTaskConfig( - name="iwslt17:ar-en", - suite=["lighteval", "harness_selection"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="iwslt17_ar-en", - 
hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -iwslt17_de_en_lighteval = LightevalTaskConfig( - name="iwslt17:de-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="iwslt17_de-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -iwslt17_en_ar_lighteval = LightevalTaskConfig( - name="iwslt17:en-ar", - suite=["lighteval", "harness_selection"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="iwslt17_ar-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -iwslt17_en_de_lighteval = LightevalTaskConfig( - name="iwslt17:en-de", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="iwslt17_en-de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -iwslt17_en_fr_lighteval = LightevalTaskConfig( - name="iwslt17:en-fr", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="iwslt17_en-fr", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -iwslt17_en_ja_lighteval = LightevalTaskConfig( - name="iwslt17:en-ja", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="iwslt17_en-ja", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -iwslt17_en_ko_lighteval = LightevalTaskConfig( - name="iwslt17:en-ko", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="iwslt17_en-ko", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -iwslt17_en_zh_lighteval = LightevalTaskConfig( - name="iwslt17:en-zh", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="iwslt17_en-zh", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -iwslt17_fr_en_lighteval = LightevalTaskConfig( - name="iwslt17:fr-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - 
hf_subset="iwslt17_fr-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -iwslt17_ja_en_lighteval = LightevalTaskConfig( - name="iwslt17:ja-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="iwslt17_ja-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -iwslt17_ko_en_lighteval = LightevalTaskConfig( - name="iwslt17:ko-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="iwslt17_ko-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -iwslt17_zh_en_lighteval = LightevalTaskConfig( - name="iwslt17:zh-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="iwslt17_zh-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -jeopardy = LightevalTaskConfig( - name="jeopardy", - prompt_function=get_qa_prompt_function( - Language.ENGLISH, - lambda line: { - "question": line["question"], - "choices": [line["answer"]], - }, - ), - suite=("lighteval",), - hf_repo="openaccess-ai-collective/jeopardy", - hf_subset="default", - evaluation_splits=("train",), - few_shots_split="train", - generation_size=250, - stop_sequence=["\n", "Question:", "question:"], - metrics=( - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ), -) -kanji_ascii_bigbench = LightevalTaskConfig( - name="kanji_ascii", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="kanji_ascii", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -kannada_bigbench = LightevalTaskConfig( - name="kannada", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="kannada", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -key_value_maps_bigbench = LightevalTaskConfig( - name="key_value_maps", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="key_value_maps", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - 
metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -known_unknowns_bigbench_lite = LightevalTaskConfig( - name="known_unknowns", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="known_unknowns", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -lambada_standard_lighteval = LightevalTaskConfig( - name="lambada:standard", - suite=["lighteval", "lambada"], - prompt_function=prompt.lambada, - hf_repo="lambada", - hf_subset="plain_text", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=10, - metrics=[Metrics.target_perplexity], - stop_sequence=["\n"], - version=0, -) -lambada_standard_cloze_lighteval = LightevalTaskConfig( - name="lambada:standard_cloze", - suite=["lighteval", "lambada"], - prompt_function=prompt.lambada_cloze, - hf_repo="lambada", - hf_subset="plain_text", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=10, - metrics=[Metrics.target_perplexity], - stop_sequence=["\n"], - version=0, -) -lambada_openai_lighteval = LightevalTaskConfig( - name="lambada:openai", - suite=["lighteval", "lambada"], - prompt_function=prompt.lambada, - hf_repo="EleutherAI/lambada_openai", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=10, - metrics=[Metrics.target_perplexity], - stop_sequence=["\n"], - version=0, -) -lambada_openai_de_lighteval = LightevalTaskConfig( - name="lambada:openai:de", - suite=["lighteval", "lambada"], - prompt_function=prompt.lambada, - hf_repo="EleutherAI/lambada_openai", - hf_subset="de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=10, - metrics=[Metrics.target_perplexity], - stop_sequence=["\n"], - version=0, -) -lambada_openai_en_lighteval = LightevalTaskConfig( - name="lambada:openai:en", - suite=["lighteval", "lambada"], - prompt_function=prompt.lambada, - hf_repo="EleutherAI/lambada_openai", - hf_subset="en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=10, - metrics=[Metrics.target_perplexity], - stop_sequence=["\n"], - version=0, -) -lambada_openai_es_lighteval = LightevalTaskConfig( - name="lambada:openai:es", - suite=["lighteval", "lambada"], - prompt_function=prompt.lambada, - hf_repo="EleutherAI/lambada_openai", - hf_subset="es", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=10, - metrics=[Metrics.target_perplexity], - stop_sequence=["\n"], - version=0, -) -lambada_openai_fr_lighteval = LightevalTaskConfig( - name="lambada:openai:fr", - suite=["lighteval", "lambada"], - prompt_function=prompt.lambada, - hf_repo="EleutherAI/lambada_openai", - hf_subset="fr", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=10, - metrics=[Metrics.target_perplexity], - stop_sequence=["\n"], - version=0, -) -lambada_openai_it_lighteval = 
LightevalTaskConfig( - name="lambada:openai:it", - suite=["lighteval", "lambada"], - prompt_function=prompt.lambada, - hf_repo="EleutherAI/lambada_openai", - hf_subset="it", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=10, - metrics=[Metrics.target_perplexity], - stop_sequence=["\n"], - version=0, -) -lambada_openai_cloze_lighteval = LightevalTaskConfig( - name="lambada:openai_cloze", - suite=["lighteval", "lambada"], - prompt_function=prompt.lambada_cloze, - hf_repo="EleutherAI/lambada_openai", - hf_subset="en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=10, - metrics=[Metrics.target_perplexity], - stop_sequence=["\n"], - version=0, -) -language_games_bigbench = LightevalTaskConfig( - name="language_games", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="language_games", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -language_identification_bigbench_lite = LightevalTaskConfig( - name="language_identification", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="language_identification", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -legal_summarization_billsum_helm = LightevalTaskConfig( - name="legal_summarization:billsum", - suite=["helm"], - prompt_function=prompt.legal_summarization, - hf_repo="lighteval/legal_summarization", - hf_subset="BillSum", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=1024, - metrics=[ - Metrics.rouge1, - Metrics.rouge2, - Metrics.rougeL, - Metrics.faithfulness, - Metrics.extractiveness, - Metrics.bert_score, - ], - stop_sequence=["\n"], - version=0, -) -legal_summarization_eurlexsum_helm = LightevalTaskConfig( - name="legal_summarization:eurlexsum", - suite=["helm"], - prompt_function=prompt.legal_summarization, - hf_repo="lighteval/legal_summarization", - hf_subset="EurLexSum", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=2048, - metrics=[ - Metrics.rouge1, - Metrics.rouge2, - Metrics.rougeL, - Metrics.faithfulness, - Metrics.extractiveness, - Metrics.bert_score, - ], - stop_sequence=["\n"], - version=0, -) -legal_summarization_multilexsum_helm = LightevalTaskConfig( - name="legal_summarization:multilexsum", - suite=["helm"], - prompt_function=prompt.multilexsum, - hf_repo="lighteval/legal_summarization", - hf_subset="MultiLexSum", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=256, - metrics=[ - Metrics.rouge1, - Metrics.rouge2, - Metrics.rougeL, - Metrics.faithfulness, - Metrics.extractiveness, - Metrics.bert_score, - ], - stop_sequence=["\n"], - version=0, -) -legalsupport_helm = 
LightevalTaskConfig( - name="legalsupport", - suite=["helm"], - prompt_function=prompt.legal_support, - hf_repo="lighteval/LegalSupport", - hf_subset="default", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -lexglue_case_hold_helm = LightevalTaskConfig( - name="lexglue:case_hold", - suite=["helm", "lex_glue_scenario"], - prompt_function=prompt.lex_glue_case_hold, - hf_repo="lighteval/lexglue", - hf_subset="case_hold", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lexglue_ecthr_a_helm = LightevalTaskConfig( - name="lexglue:ecthr_a", - suite=["helm", "lex_glue_scenario"], - prompt_function=prompt.lex_glue_ecthr_a, - hf_repo="lighteval/lexglue", - hf_subset="ecthr_a", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lexglue_ecthr_b_helm = LightevalTaskConfig( - name="lexglue:ecthr_b", - suite=["helm", "lex_glue_scenario"], - prompt_function=prompt.lex_glue_ecthr_b, - hf_repo="lighteval/lexglue", - hf_subset="ecthr_b", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lexglue_eurlex_helm = LightevalTaskConfig( - name="lexglue:eurlex", - suite=["helm", "lex_glue_scenario"], - prompt_function=prompt.lex_glue_eurlex, - hf_repo="lighteval/lexglue", - hf_subset="eurlex", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lexglue_ledgar_helm = LightevalTaskConfig( - name="lexglue:ledgar", - suite=["helm", "lex_glue_scenario"], - prompt_function=prompt.lex_glue_ledgar, - hf_repo="lighteval/lexglue", - hf_subset="ledgar", - hf_avail_splits=["train", "test", 
"validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lexglue_scotus_helm = LightevalTaskConfig( - name="lexglue:scotus", - suite=["helm", "lex_glue_scenario"], - prompt_function=prompt.lex_glue_scotus, - hf_repo="lighteval/lexglue", - hf_subset="scotus", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lexglue_unfair_tos_helm = LightevalTaskConfig( - name="lexglue:unfair_tos", - suite=["helm", "lex_glue_scenario"], - prompt_function=prompt.lex_glue_unfair_tos, - hf_repo="lighteval/lexglue", - hf_subset="unfair_tos", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_brazilian_court_decisions_judgment_helm = LightevalTaskConfig( - name="lextreme:brazilian_court_decisions_judgment", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_brazilian_court_decisions_judgment, - hf_repo="lighteval/lextreme", - hf_subset="brazilian_court_decisions_judgment", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_brazilian_court_decisions_unanimity_helm = LightevalTaskConfig( - name="lextreme:brazilian_court_decisions_unanimity", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_brazilian_court_decisions_unanimity, - hf_repo="lighteval/lextreme", - hf_subset="brazilian_court_decisions_unanimity", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_covid19_emergency_event_helm = LightevalTaskConfig( - name="lextreme:covid19_emergency_event", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_covid19_emergency_event, - hf_repo="lighteval/lextreme", - hf_subset="covid19_emergency_event", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - 
few_shots_select=None, - generation_size=10, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_german_argument_mining_helm = LightevalTaskConfig( - name="lextreme:german_argument_mining", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_german_argument_mining, - hf_repo="lighteval/lextreme", - hf_subset="german_argument_mining", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_greek_legal_code_chapter_helm = LightevalTaskConfig( - name="lextreme:greek_legal_code_chapter", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_greek_legal_code_chapter, - hf_repo="lighteval/lextreme", - hf_subset="greek_legal_code_chapter", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_greek_legal_code_subject_helm = LightevalTaskConfig( - name="lextreme:greek_legal_code_subject", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_greek_legal_code_subject, - hf_repo="lighteval/lextreme", - hf_subset="greek_legal_code_subject", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_greek_legal_code_volume_helm = LightevalTaskConfig( - name="lextreme:greek_legal_code_volume", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_greek_legal_code_volume, - hf_repo="lighteval/lextreme", - hf_subset="greek_legal_code_volume", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_greek_legal_ner_helm = LightevalTaskConfig( - name="lextreme:greek_legal_ner", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_greek_legal_ner, - hf_repo="lighteval/lextreme", - hf_subset="greek_legal_ner", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=430, - metrics=[ - Metrics.exact_match, - 
Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_legalnero_helm = LightevalTaskConfig( - name="lextreme:legalnero", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_legalnero, - hf_repo="lighteval/lextreme", - hf_subset="legalnero", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=788, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_lener_br_helm = LightevalTaskConfig( - name="lextreme:lener_br", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_lener_br, - hf_repo="lighteval/lextreme", - hf_subset="lener_br", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=338, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_mapa_coarse_helm = LightevalTaskConfig( - name="lextreme:mapa_coarse", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_mapa_coarse, - hf_repo="lighteval/lextreme", - hf_subset="mapa_coarse", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=274, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_mapa_fine_helm = LightevalTaskConfig( - name="lextreme:mapa_fine", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_mapa_fine, - hf_repo="lighteval/lextreme", - hf_subset="mapa_fine", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=274, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_multi_eurlex_level_1_helm = LightevalTaskConfig( - name="lextreme:multi_eurlex_level_1", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_multi_eurlex_level_1, - hf_repo="lighteval/lextreme", - hf_subset="multi_eurlex_level_1", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=10, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_multi_eurlex_level_2_helm = 
LightevalTaskConfig( - name="lextreme:multi_eurlex_level_2", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_multi_eurlex_level_2, - hf_repo="lighteval/lextreme", - hf_subset="multi_eurlex_level_2", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=10, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_multi_eurlex_level_3_helm = LightevalTaskConfig( - name="lextreme:multi_eurlex_level_3", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_multi_eurlex_level_3, - hf_repo="lighteval/lextreme", - hf_subset="multi_eurlex_level_3", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=10, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_online_terms_of_service_clause_topics_helm = LightevalTaskConfig( - name="lextreme:online_terms_of_service_clause_topics", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_online_terms_of_service_clause_topics, - hf_repo="lighteval/lextreme", - hf_subset="online_terms_of_service_clause_topics", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=10, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_online_terms_of_service_unfairness_levels_helm = LightevalTaskConfig( - name="lextreme:online_terms_of_service_unfairness_levels", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_online_terms_of_service_unfairness_levels, - hf_repo="lighteval/lextreme", - hf_subset="online_terms_of_service_unfairness_levels", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=10, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -lextreme_swiss_judgment_prediction_helm = LightevalTaskConfig( - name="lextreme:swiss_judgment_prediction", - suite=["helm", "lextreme_scenario"], - prompt_function=prompt.lextreme_swiss_judgment_prediction, - hf_repo="lighteval/lextreme", - hf_subset="swiss_judgment_prediction", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - 
stop_sequence=["\n"], - version=0, -) -linguistic_mappings_bigbench = LightevalTaskConfig( - name="linguistic_mappings", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="linguistic_mappings", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -linguistics_puzzles_bigbench_lite = LightevalTaskConfig( - name="linguistics_puzzles", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="linguistics_puzzles", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=None, - version=0, -) -logic_grid_puzzle_bigbench_lite = LightevalTaskConfig( - name="logic_grid_puzzle", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="logic_grid_puzzle", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -logical_args_bigbench = LightevalTaskConfig( - name="logical_args", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="logical_args", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -logical_deduction_bigbench_lite = LightevalTaskConfig( - name="logical_deduction", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="logical_deduction", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -logical_fallacy_detection_bigbench = LightevalTaskConfig( - name="logical_fallacy_detection", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="logical_fallacy_detection", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -logical_sequence_bigbench = LightevalTaskConfig( - name="logical_sequence", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="logical_sequence", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -logiqa_lighteval = LightevalTaskConfig( - name="logiqa", - suite=["lighteval"], - 
prompt_function=prompt.logiqa, - hf_repo="lighteval/logiqa_harness", - hf_subset="logiqa", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=["\n"], - version=0, -) -lsat_qa_helm = LightevalTaskConfig( - name="lsat_qa", - suite=["helm", "lsat_qa_scenario"], - prompt_function=prompt.lsat_qa, - hf_repo="lighteval/lsat_qa", - hf_subset="all", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -lsat_qa_assignment_helm = LightevalTaskConfig( - name="lsat_qa:assignment", - suite=["helm", "lsat_qa_scenario"], - prompt_function=prompt.lsat_qa, - hf_repo="lighteval/lsat_qa", - hf_subset="assignment", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -lsat_qa_grouping_helm = LightevalTaskConfig( - name="lsat_qa:grouping", - suite=["helm", "lsat_qa_scenario"], - prompt_function=prompt.lsat_qa, - hf_repo="lighteval/lsat_qa", - hf_subset="grouping", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -lsat_qa_miscellaneous_helm = LightevalTaskConfig( - name="lsat_qa:miscellaneous", - suite=["helm", "lsat_qa_scenario"], - prompt_function=prompt.lsat_qa, - hf_repo="lighteval/lsat_qa", - hf_subset="miscellaneous", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) 
-lsat_qa_ordering_helm = LightevalTaskConfig(
-    name="lsat_qa:ordering",
-    suite=["helm", "lsat_qa_scenario"],
-    prompt_function=prompt.lsat_qa,
-    hf_repo="lighteval/lsat_qa",
-    hf_subset="ordering",
-    hf_avail_splits=["train", "test", "validation"],
-    evaluation_splits=["validation", "test"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=5,
-    metrics=[
-        Metrics.exact_match,
-        Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}),
-        Metrics.exact_match(sample_params={"type_exact_match": "prefix"}),
-        Metrics.exact_match(
-            sample_params={
-                "normalize_gold": helm_normalizer,
-                "normalize_pred": helm_normalizer,
-                "type_exact_match": "prefix",
-            }
-        ),
-    ],
-    stop_sequence=["\n"],
-    version=0,
-)
-math_500 = LightevalTaskConfig(
-    name="math_500",
-    suite=["lighteval"],
-    prompt_function=prompt.math_500,
-    hf_repo="HuggingFaceH4/MATH-500",
-    hf_subset="default",
-    hf_avail_splits=["test"],
-    evaluation_splits=["test"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=32768,
-    metrics=[
-        Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1}),
-    ],
-    version=2,
-)
-math_500_gpassk = LightevalTaskConfig(
-    name="math_500_gpassk",
-    suite=["lighteval"],
-    prompt_function=prompt.math_500,
-    hf_repo="HuggingFaceH4/MATH-500",
-    hf_subset="default",
-    hf_avail_splits=["test"],
-    evaluation_splits=["test"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=8192,
-    metrics=[Metrics.g_pass_at_k_latex(sample_params={"k": 16, "n": 48})],
-    version=1,
-)
-math_algebra_lighteval = LightevalTaskConfig(
-    name="math:algebra",
-    suite=["lighteval", "math"],
-    prompt_function=prompt.math,
-    hf_repo="DigitalLearningGmbH/MATH-lighteval",
-    hf_subset="algebra",
-    hf_avail_splits=["train", "test"],
-    evaluation_splits=["test"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=2048,
-    metrics=[
-        Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
-        Metrics.maj_at_n(
-            sample_params={
-                "n": 4,
-                "strip_strings": True,
-                "normalize_pred": math_normalizer,
-                "normalize_gold": math_normalizer,
-            }
-        ),
-    ],
-    stop_sequence=["\n"],
-    version=1,
-)
-math_counting_and_probability_lighteval = LightevalTaskConfig(
-    name="math:counting_and_probability",
-    suite=["lighteval", "math"],
-    prompt_function=prompt.math,
-    hf_repo="DigitalLearningGmbH/MATH-lighteval",
-    hf_subset="counting_and_probability",
-    hf_avail_splits=["train", "test"],
-    evaluation_splits=["test"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=2048,
-    metrics=[
-        Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
-        Metrics.maj_at_n(
-            sample_params={
-                "n": 4,
-                "strip_strings": True,
-                "normalize_pred": math_normalizer,
-                "normalize_gold": math_normalizer,
-            }
-        ),
-    ],
-    stop_sequence=["\n"],
-    version=1,
-)
-math_geometry_lighteval = LightevalTaskConfig(
-    name="math:geometry",
-    suite=["lighteval", "math"],
-    prompt_function=prompt.math,
-    hf_repo="DigitalLearningGmbH/MATH-lighteval",
-    hf_subset="geometry",
-    hf_avail_splits=["train", "test"],
-    evaluation_splits=["test"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=2048,
-    metrics=[
-        Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
-        Metrics.maj_at_n(
-            sample_params={
-                "n": 4,
-                "strip_strings": True,
-                "normalize_pred": math_normalizer,
-                "normalize_gold": math_normalizer,
-            }
-        ),
-    ],
-    stop_sequence=["\n"],
-    version=1,
-)
-math_intermediate_algebra_lighteval = LightevalTaskConfig(
-    name="math:intermediate_algebra",
-    suite=["lighteval", "math"],
-    prompt_function=prompt.math,
-    hf_repo="DigitalLearningGmbH/MATH-lighteval",
-    hf_subset="intermediate_algebra",
-    hf_avail_splits=["train", "test"],
-    evaluation_splits=["test"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=2048,
-    metrics=[
-        Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
-        Metrics.maj_at_n(
-            sample_params={
-                "n": 4,
-                "strip_strings": True,
-                "normalize_pred": math_normalizer,
-                "normalize_gold": math_normalizer,
-            }
-        ),
-    ],
-    stop_sequence=["\n"],
-    version=1,
-)
-math_number_theory_lighteval = LightevalTaskConfig(
-    name="math:number_theory",
-    suite=["lighteval", "math"],
-    prompt_function=prompt.math,
-    hf_repo="DigitalLearningGmbH/MATH-lighteval",
-    hf_subset="number_theory",
-    hf_avail_splits=["train", "test"],
-    evaluation_splits=["test"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=2048,
-    metrics=[
-        Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
-        Metrics.maj_at_n(
-            sample_params={
-                "n": 4,
-                "strip_strings": True,
-                "normalize_pred": math_normalizer,
-                "normalize_gold": math_normalizer,
-            }
-        ),
-    ],
-    stop_sequence=["\n"],
-    version=1,
-)
-math_prealgebra_lighteval = LightevalTaskConfig(
-    name="math:prealgebra",
-    suite=["lighteval", "math"],
-    prompt_function=prompt.math,
-    hf_repo="DigitalLearningGmbH/MATH-lighteval",
-    hf_subset="prealgebra",
-    hf_avail_splits=["train", "test"],
-    evaluation_splits=["test"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=2048,
-    metrics=[
-        Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
-        Metrics.maj_at_n(
-            sample_params={
-                "n": 4,
-                "strip_strings": True,
-                "normalize_pred": math_normalizer,
-                "normalize_gold": math_normalizer,
-            }
-        ),
-    ],
-    stop_sequence=["\n"],
-    version=1,
-)
-math_precalculus_lighteval = LightevalTaskConfig(
-    name="math:precalculus",
-    suite=["lighteval", "math"],
-    prompt_function=prompt.math,
-    hf_repo="DigitalLearningGmbH/MATH-lighteval",
-    hf_subset="precalculus",
-    hf_avail_splits=["train", "test"],
-    evaluation_splits=["test"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=2048,
-    metrics=[
-        Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
-        Metrics.maj_at_n(
-            sample_params={
-                "n": 4,
-                "strip_strings": True,
-                "normalize_pred": math_normalizer,
-                "normalize_gold": math_normalizer,
-            }
-        ),
-    ],
-    stop_sequence=["\n"],
-    version=1,
-)
-math_cot_algebra_lighteval = LightevalTaskConfig(
-    name="math_cot:algebra",
-    suite=["lighteval", "math"],
-    prompt_function=prompt.math_cot,
-    hf_repo="DigitalLearningGmbH/MATH-lighteval",
-    hf_subset="algebra",
-    hf_avail_splits=["train", "test"],
-    evaluation_splits=["test"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=2048,
-    metrics=[
-        Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}),
-        Metrics.maj_at_n(
-            sample_params={
-                "n": 4,
-                "strip_strings": True,
-                "normalize_pred": math_normalizer,
-                "normalize_gold": math_normalizer,
-            }
-        ),
-    ],
-    stop_sequence=["\n"],
-    version=0,
-)
-math_cot_counting_and_probability_lighteval = LightevalTaskConfig(
- name="math_cot:counting_and_probability", - suite=["lighteval", "math"], - prompt_function=prompt.math_cot, - hf_repo="DigitalLearningGmbH/MATH-lighteval", - hf_subset="counting_and_probability", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=2048, - metrics=[ - Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_n( - sample_params={ - "n": 4, - "strip_strings": True, - "normalize_pred": math_normalizer, - "normalize_gold": math_normalizer, - } - ), - ], - stop_sequence=["\n"], - version=0, -) -math_cot_geometry_lighteval = LightevalTaskConfig( - name="math_cot:geometry", - suite=["lighteval", "math"], - prompt_function=prompt.math_cot, - hf_repo="DigitalLearningGmbH/MATH-lighteval", - hf_subset="geometry", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=2048, - metrics=[ - Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_n( - sample_params={ - "n": 4, - "strip_strings": True, - "normalize_pred": math_normalizer, - "normalize_gold": math_normalizer, - } - ), - ], - stop_sequence=["\n"], - version=0, -) -math_cot_intermediate_algebra_lighteval = LightevalTaskConfig( - name="math_cot:intermediate_algebra", - suite=["lighteval", "math"], - prompt_function=prompt.math_cot, - hf_repo="DigitalLearningGmbH/MATH-lighteval", - hf_subset="intermediate_algebra", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=2048, - metrics=[ - Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_n( - sample_params={ - "n": 4, - "strip_strings": True, - "normalize_pred": math_normalizer, - "normalize_gold": math_normalizer, - } - ), - ], - stop_sequence=["\n"], - version=0, -) -math_cot_number_theory_lighteval = LightevalTaskConfig( - name="math_cot:number_theory", - suite=["lighteval", "math"], - prompt_function=prompt.math_cot, - hf_repo="DigitalLearningGmbH/MATH-lighteval", - hf_subset="number_theory", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=2048, - metrics=[ - Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_n( - sample_params={ - "n": 4, - "strip_strings": True, - "normalize_pred": math_normalizer, - "normalize_gold": math_normalizer, - } - ), - ], - stop_sequence=["\n"], - version=0, -) -math_cot_prealgebra_lighteval = LightevalTaskConfig( - name="math_cot:prealgebra", - suite=["lighteval", "math"], - prompt_function=prompt.math_cot, - hf_repo="DigitalLearningGmbH/MATH-lighteval", - hf_subset="prealgebra", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=2048, - metrics=[ - Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_n( - sample_params={ - "n": 4, - "strip_strings": True, - "normalize_pred": math_normalizer, - "normalize_gold": math_normalizer, - } - ), - ], - stop_sequence=["\n"], - version=0, -) -math_cot_precalculus_lighteval = LightevalTaskConfig( - name="math_cot:precalculus", - suite=["lighteval", "math"], 
- prompt_function=prompt.math_cot, - hf_repo="DigitalLearningGmbH/MATH-lighteval", - hf_subset="precalculus", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=2048, - metrics=[ - Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_n( - sample_params={ - "n": 4, - "strip_strings": True, - "normalize_pred": math_normalizer, - "normalize_gold": math_normalizer, - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mathematical_induction_bigbench = LightevalTaskConfig( - name="mathematical_induction", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="mathematical_induction", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mathqa_lighteval = LightevalTaskConfig( - name="mathqa", - suite=["lighteval"], - prompt_function=prompt.mathqa, - hf_repo="allenai/math_qa", - hf_subset="default", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -matrixshapes_bigbench = LightevalTaskConfig( - name="matrixshapes", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="matrixshapes", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -me_q_sum_helm = LightevalTaskConfig( - name="me_q_sum", - suite=["helm"], - prompt_function=prompt.me_q_sum, - hf_repo="lighteval/me_q_sum", - hf_subset="default", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.rougeL, - Metrics.bleu_1, - Metrics.bleu_4, - ], - stop_sequence=["\n"], - version=0, -) -med_dialog_healthcaremagic_helm = LightevalTaskConfig( - name="med_dialog:healthcaremagic", - suite=["helm"], - prompt_function=prompt.med_dialog, - hf_repo="lighteval/med_dialog", - hf_subset="healthcaremagic", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.rougeL, - Metrics.bleu_1, - Metrics.bleu_4, - ], - stop_sequence=["\n"], - version=0, -) -med_dialog_icliniq_helm = LightevalTaskConfig( - name="med_dialog:icliniq", - suite=["helm"], - prompt_function=prompt.med_dialog, - hf_repo="lighteval/med_dialog", - hf_subset="icliniq", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, 
- few_shots_select=None, - generation_size=128, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.rougeL, - Metrics.bleu_1, - Metrics.bleu_4, - ], - stop_sequence=["\n"], - version=0, -) -med_mcqa_helm = LightevalTaskConfig( - name="med_mcqa", - suite=["helm"], - prompt_function=prompt.med_mcqa, - hf_repo="lighteval/med_mcqa", - hf_subset="default", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -med_paragraph_simplification_helm = LightevalTaskConfig( - name="med_paragraph_simplification", - suite=["helm"], - prompt_function=prompt.med_paragraph_simplification, - hf_repo="lighteval/med_paragraph_simplification", - hf_subset="default", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=512, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.rougeL, - Metrics.bleu_1, - Metrics.bleu_4, - ], - stop_sequence=["\n"], - version=0, -) -med_qa_helm = LightevalTaskConfig( - name="med_qa", - suite=["helm"], - prompt_function=prompt.med_qa, - hf_repo="bigbio/med_qa", - hf_subset="med_qa_en_source", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -metaphor_boolean_bigbench = LightevalTaskConfig( - name="metaphor_boolean", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="metaphor_boolean", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -metaphor_understanding_bigbench = LightevalTaskConfig( - name="metaphor_understanding", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="metaphor_understanding", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mgsm_en_lighteval = LightevalTaskConfig( - name="mgsm:en", - suite=["lighteval"], - 
prompt_function=prompt.mgsm_en, - hf_repo="juletxara/mgsm", - hf_subset="en", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n", "=", "Question="], - version=0, -) -mgsm_es_lighteval = LightevalTaskConfig( - name="mgsm:es", - suite=["lighteval"], - prompt_function=prompt.mgsm_es, - hf_repo="juletxara/mgsm", - hf_subset="es", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n", "=", "Pregunta="], - version=0, -) -mgsm_fr_lighteval = LightevalTaskConfig( - name="mgsm:fr", - suite=["lighteval"], - prompt_function=prompt.mgsm_fr, - hf_repo="juletxara/mgsm", - hf_subset="fr", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n", "=", "Question="], - version=0, -) -mgsm_de_lighteval = LightevalTaskConfig( - name="mgsm:de", - suite=["lighteval"], - prompt_function=prompt.mgsm_de, - hf_repo="juletxara/mgsm", - hf_subset="de", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n", "=", "Frage="], - version=0, -) -mgsm_ru_lighteval = LightevalTaskConfig( - name="mgsm:ru", - suite=["lighteval"], - prompt_function=prompt.mgsm_ru, - hf_repo="juletxara/mgsm", - hf_subset="ru", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n", "=", "\u0417\u0430\u0434\u0430\u0447\u0430="], - version=0, -) -mgsm_zh_lighteval = LightevalTaskConfig( - name="mgsm:zh", - suite=["lighteval"], - prompt_function=prompt.mgsm_zh, - hf_repo="juletxara/mgsm", - hf_subset="zh", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n", "=", "\u95ee\u9898="], - version=0, -) -mgsm_ja_lighteval = LightevalTaskConfig( - name="mgsm:ja", - suite=["lighteval"], - prompt_function=prompt.mgsm_ja, - hf_repo="juletxara/mgsm", - hf_subset="ja", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n", "=", "\u554f\u984c="], - version=0, -) -mgsm_th_lighteval = 
LightevalTaskConfig( - name="mgsm:th", - suite=["lighteval"], - prompt_function=prompt.mgsm_th, - hf_repo="juletxara/mgsm", - hf_subset="th", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n", "=", "\u0e42\u0e08\u0e17\u0e22\u0e4c="], - version=0, -) -mgsm_sw_lighteval = LightevalTaskConfig( - name="mgsm:sw", - suite=["lighteval"], - prompt_function=prompt.mgsm_sw, - hf_repo="juletxara/mgsm", - hf_subset="sw", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n", "=", "Swali="], - version=0, -) -mgsm_bn_lighteval = LightevalTaskConfig( - name="mgsm:bn", - suite=["lighteval"], - prompt_function=prompt.mgsm_bn, - hf_repo="juletxara/mgsm", - hf_subset="bn", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n", "=", "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8="], - version=0, -) -mgsm_te_lighteval = LightevalTaskConfig( - name="mgsm:te", - suite=["lighteval"], - prompt_function=prompt.mgsm_te, - hf_repo="juletxara/mgsm", - hf_subset="te", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n", "=", "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28="], - version=0, -) -minute_mysteries_qa_bigbench = LightevalTaskConfig( - name="minute_mysteries_qa", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="minute_mysteries_qa", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.rouge_t5], - stop_sequence=["\n"], - version=0, -) -misconceptions_bigbench = LightevalTaskConfig( - name="misconceptions", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="misconceptions", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -misconceptions_russian_bigbench_lite = LightevalTaskConfig( - name="misconceptions_russian", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="misconceptions_russian", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_abstract_algebra_original = 
LightevalTaskConfig( - name="mmlu:abstract_algebra", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_abstract_algebra, - hf_repo="cais/mmlu", - hf_subset="abstract_algebra", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_abstract_algebra_leaderboard = LightevalTaskConfig( - name="mmlu:abstract_algebra", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="abstract_algebra", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_abstract_algebra_helm = LightevalTaskConfig( - name="mmlu:abstract_algebra", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="abstract_algebra", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_anatomy_original = LightevalTaskConfig( - name="mmlu:anatomy", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_anatomy, - hf_repo="cais/mmlu", - hf_subset="anatomy", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_anatomy_leaderboard = LightevalTaskConfig( - name="mmlu:anatomy", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="anatomy", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_anatomy_helm = LightevalTaskConfig( - name="mmlu:anatomy", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="anatomy", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_astronomy_original = LightevalTaskConfig( - name="mmlu:astronomy", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_astronomy, - hf_repo="cais/mmlu", - 
hf_subset="astronomy", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_astronomy_leaderboard = LightevalTaskConfig( - name="mmlu:astronomy", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="astronomy", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_astronomy_helm = LightevalTaskConfig( - name="mmlu:astronomy", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="astronomy", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_business_ethics_original = LightevalTaskConfig( - name="mmlu:business_ethics", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_business_ethics, - hf_repo="cais/mmlu", - hf_subset="business_ethics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_business_ethics_leaderboard = LightevalTaskConfig( - name="mmlu:business_ethics", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="business_ethics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_business_ethics_helm = LightevalTaskConfig( - name="mmlu:business_ethics", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="business_ethics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_clinical_knowledge_original = LightevalTaskConfig( - name="mmlu:clinical_knowledge", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_clinical_knowledge, - hf_repo="cais/mmlu", - hf_subset="clinical_knowledge", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - 
evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_clinical_knowledge_leaderboard = LightevalTaskConfig( - name="mmlu:clinical_knowledge", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="clinical_knowledge", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_clinical_knowledge_helm = LightevalTaskConfig( - name="mmlu:clinical_knowledge", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="clinical_knowledge", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_college_biology_original = LightevalTaskConfig( - name="mmlu:college_biology", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_college_biology, - hf_repo="cais/mmlu", - hf_subset="college_biology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_college_biology_leaderboard = LightevalTaskConfig( - name="mmlu:college_biology", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="college_biology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_college_biology_helm = LightevalTaskConfig( - name="mmlu:college_biology", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="college_biology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_college_chemistry_original = LightevalTaskConfig( - name="mmlu:college_chemistry", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_college_chemistry, - hf_repo="cais/mmlu", - hf_subset="college_chemistry", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - 
few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_college_chemistry_leaderboard = LightevalTaskConfig( - name="mmlu:college_chemistry", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="college_chemistry", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_college_chemistry_helm = LightevalTaskConfig( - name="mmlu:college_chemistry", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="college_chemistry", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_college_computer_science_original = LightevalTaskConfig( - name="mmlu:college_computer_science", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_college_computer_science, - hf_repo="cais/mmlu", - hf_subset="college_computer_science", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_college_computer_science_leaderboard = LightevalTaskConfig( - name="mmlu:college_computer_science", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="college_computer_science", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_college_computer_science_helm = LightevalTaskConfig( - name="mmlu:college_computer_science", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="college_computer_science", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_college_mathematics_original = LightevalTaskConfig( - name="mmlu:college_mathematics", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_college_mathematics, - hf_repo="cais/mmlu", - hf_subset="college_mathematics", - hf_avail_splits=["auxiliary_train", "test", 
"validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_college_mathematics_leaderboard = LightevalTaskConfig( - name="mmlu:college_mathematics", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="college_mathematics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_college_mathematics_helm = LightevalTaskConfig( - name="mmlu:college_mathematics", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="college_mathematics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_college_medicine_original = LightevalTaskConfig( - name="mmlu:college_medicine", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_college_medicine, - hf_repo="cais/mmlu", - hf_subset="college_medicine", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_college_medicine_leaderboard = LightevalTaskConfig( - name="mmlu:college_medicine", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="college_medicine", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_college_medicine_helm = LightevalTaskConfig( - name="mmlu:college_medicine", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="college_medicine", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_college_physics_original = LightevalTaskConfig( - name="mmlu:college_physics", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_college_physics, - hf_repo="cais/mmlu", - hf_subset="college_physics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - 
evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_college_physics_leaderboard = LightevalTaskConfig( - name="mmlu:college_physics", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="college_physics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_college_physics_helm = LightevalTaskConfig( - name="mmlu:college_physics", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="college_physics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_computer_security_original = LightevalTaskConfig( - name="mmlu:computer_security", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_computer_security, - hf_repo="cais/mmlu", - hf_subset="computer_security", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_computer_security_leaderboard = LightevalTaskConfig( - name="mmlu:computer_security", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="computer_security", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_computer_security_helm = LightevalTaskConfig( - name="mmlu:computer_security", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="computer_security", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_conceptual_physics_original = LightevalTaskConfig( - name="mmlu:conceptual_physics", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_conceptual_physics, - hf_repo="cais/mmlu", - hf_subset="conceptual_physics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - 
few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_conceptual_physics_leaderboard = LightevalTaskConfig( - name="mmlu:conceptual_physics", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="conceptual_physics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_conceptual_physics_helm = LightevalTaskConfig( - name="mmlu:conceptual_physics", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="conceptual_physics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_econometrics_original = LightevalTaskConfig( - name="mmlu:econometrics", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_econometrics, - hf_repo="cais/mmlu", - hf_subset="econometrics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_econometrics_leaderboard = LightevalTaskConfig( - name="mmlu:econometrics", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="econometrics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_econometrics_helm = LightevalTaskConfig( - name="mmlu:econometrics", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="econometrics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_electrical_engineering_original = LightevalTaskConfig( - name="mmlu:electrical_engineering", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_electrical_engineering, - hf_repo="cais/mmlu", - hf_subset="electrical_engineering", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - 
generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_electrical_engineering_leaderboard = LightevalTaskConfig( - name="mmlu:electrical_engineering", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="electrical_engineering", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_electrical_engineering_helm = LightevalTaskConfig( - name="mmlu:electrical_engineering", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="electrical_engineering", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_elementary_mathematics_original = LightevalTaskConfig( - name="mmlu:elementary_mathematics", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_elementary_mathematics, - hf_repo="cais/mmlu", - hf_subset="elementary_mathematics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_elementary_mathematics_leaderboard = LightevalTaskConfig( - name="mmlu:elementary_mathematics", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="elementary_mathematics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_elementary_mathematics_helm = LightevalTaskConfig( - name="mmlu:elementary_mathematics", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="elementary_mathematics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_formal_logic_original = LightevalTaskConfig( - name="mmlu:formal_logic", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_formal_logic, - hf_repo="cais/mmlu", - hf_subset="formal_logic", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - 
few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_formal_logic_leaderboard = LightevalTaskConfig( - name="mmlu:formal_logic", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="formal_logic", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_formal_logic_helm = LightevalTaskConfig( - name="mmlu:formal_logic", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="formal_logic", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_global_facts_original = LightevalTaskConfig( - name="mmlu:global_facts", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_global_facts, - hf_repo="cais/mmlu", - hf_subset="global_facts", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_global_facts_leaderboard = LightevalTaskConfig( - name="mmlu:global_facts", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="global_facts", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_global_facts_helm = LightevalTaskConfig( - name="mmlu:global_facts", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="global_facts", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_biology_original = LightevalTaskConfig( - name="mmlu:high_school_biology", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_high_school_biology, - hf_repo="cais/mmlu", - hf_subset="high_school_biology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - 
stop_sequence=["\n"], - version=0, -) -mmlu_high_school_biology_leaderboard = LightevalTaskConfig( - name="mmlu:high_school_biology", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="high_school_biology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_biology_helm = LightevalTaskConfig( - name="mmlu:high_school_biology", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="high_school_biology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_chemistry_original = LightevalTaskConfig( - name="mmlu:high_school_chemistry", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_high_school_chemistry, - hf_repo="cais/mmlu", - hf_subset="high_school_chemistry", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_chemistry_leaderboard = LightevalTaskConfig( - name="mmlu:high_school_chemistry", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="high_school_chemistry", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_chemistry_helm = LightevalTaskConfig( - name="mmlu:high_school_chemistry", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="high_school_chemistry", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_computer_science_original = LightevalTaskConfig( - name="mmlu:high_school_computer_science", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_high_school_computer_science, - hf_repo="cais/mmlu", - hf_subset="high_school_computer_science", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", 
- generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_computer_science_leaderboard = LightevalTaskConfig( - name="mmlu:high_school_computer_science", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="high_school_computer_science", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_computer_science_helm = LightevalTaskConfig( - name="mmlu:high_school_computer_science", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="high_school_computer_science", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_european_history_original = LightevalTaskConfig( - name="mmlu:high_school_european_history", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_high_school_european_history, - hf_repo="cais/mmlu", - hf_subset="high_school_european_history", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_european_history_leaderboard = LightevalTaskConfig( - name="mmlu:high_school_european_history", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="high_school_european_history", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_european_history_helm = LightevalTaskConfig( - name="mmlu:high_school_european_history", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="high_school_european_history", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_geography_original = LightevalTaskConfig( - name="mmlu:high_school_geography", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_high_school_geography, - hf_repo="cais/mmlu", - 
hf_subset="high_school_geography", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_geography_leaderboard = LightevalTaskConfig( - name="mmlu:high_school_geography", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="high_school_geography", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_geography_helm = LightevalTaskConfig( - name="mmlu:high_school_geography", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="high_school_geography", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_government_and_politics_original = LightevalTaskConfig( - name="mmlu:high_school_government_and_politics", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_high_school_government_and_politics, - hf_repo="cais/mmlu", - hf_subset="high_school_government_and_politics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_government_and_politics_leaderboard = LightevalTaskConfig( - name="mmlu:high_school_government_and_politics", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="high_school_government_and_politics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_government_and_politics_helm = LightevalTaskConfig( - name="mmlu:high_school_government_and_politics", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="high_school_government_and_politics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) 
-mmlu_high_school_macroeconomics_original = LightevalTaskConfig( - name="mmlu:high_school_macroeconomics", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_high_school_macroeconomics, - hf_repo="cais/mmlu", - hf_subset="high_school_macroeconomics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_macroeconomics_leaderboard = LightevalTaskConfig( - name="mmlu:high_school_macroeconomics", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="high_school_macroeconomics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_macroeconomics_helm = LightevalTaskConfig( - name="mmlu:high_school_macroeconomics", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="high_school_macroeconomics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_mathematics_original = LightevalTaskConfig( - name="mmlu:high_school_mathematics", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_high_school_mathematics, - hf_repo="cais/mmlu", - hf_subset="high_school_mathematics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_mathematics_leaderboard = LightevalTaskConfig( - name="mmlu:high_school_mathematics", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="high_school_mathematics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_mathematics_helm = LightevalTaskConfig( - name="mmlu:high_school_mathematics", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="high_school_mathematics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - 
"normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_microeconomics_original = LightevalTaskConfig( - name="mmlu:high_school_microeconomics", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_high_school_microeconomics, - hf_repo="cais/mmlu", - hf_subset="high_school_microeconomics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_microeconomics_leaderboard = LightevalTaskConfig( - name="mmlu:high_school_microeconomics", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="high_school_microeconomics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_microeconomics_helm = LightevalTaskConfig( - name="mmlu:high_school_microeconomics", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="high_school_microeconomics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_physics_original = LightevalTaskConfig( - name="mmlu:high_school_physics", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_high_school_physics, - hf_repo="cais/mmlu", - hf_subset="high_school_physics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_physics_leaderboard = LightevalTaskConfig( - name="mmlu:high_school_physics", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="high_school_physics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_physics_helm = LightevalTaskConfig( - name="mmlu:high_school_physics", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="high_school_physics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - 
Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_psychology_original = LightevalTaskConfig( - name="mmlu:high_school_psychology", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_high_school_psychology, - hf_repo="cais/mmlu", - hf_subset="high_school_psychology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_psychology_leaderboard = LightevalTaskConfig( - name="mmlu:high_school_psychology", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="high_school_psychology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_psychology_helm = LightevalTaskConfig( - name="mmlu:high_school_psychology", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="high_school_psychology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_statistics_original = LightevalTaskConfig( - name="mmlu:high_school_statistics", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_high_school_statistics, - hf_repo="cais/mmlu", - hf_subset="high_school_statistics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_statistics_leaderboard = LightevalTaskConfig( - name="mmlu:high_school_statistics", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="high_school_statistics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_statistics_helm = LightevalTaskConfig( - name="mmlu:high_school_statistics", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="high_school_statistics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - 
Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_us_history_original = LightevalTaskConfig( - name="mmlu:high_school_us_history", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_high_school_us_history, - hf_repo="cais/mmlu", - hf_subset="high_school_us_history", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_us_history_leaderboard = LightevalTaskConfig( - name="mmlu:high_school_us_history", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="high_school_us_history", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_us_history_helm = LightevalTaskConfig( - name="mmlu:high_school_us_history", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="high_school_us_history", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_world_history_original = LightevalTaskConfig( - name="mmlu:high_school_world_history", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_high_school_world_history, - hf_repo="cais/mmlu", - hf_subset="high_school_world_history", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_world_history_leaderboard = LightevalTaskConfig( - name="mmlu:high_school_world_history", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="high_school_world_history", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_high_school_world_history_helm = LightevalTaskConfig( - name="mmlu:high_school_world_history", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="high_school_world_history", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - 
Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_human_aging_original = LightevalTaskConfig( - name="mmlu:human_aging", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_human_aging, - hf_repo="cais/mmlu", - hf_subset="human_aging", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_human_aging_leaderboard = LightevalTaskConfig( - name="mmlu:human_aging", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="human_aging", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_human_aging_helm = LightevalTaskConfig( - name="mmlu:human_aging", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="human_aging", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_human_sexuality_original = LightevalTaskConfig( - name="mmlu:human_sexuality", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_human_sexuality, - hf_repo="cais/mmlu", - hf_subset="human_sexuality", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_human_sexuality_leaderboard = LightevalTaskConfig( - name="mmlu:human_sexuality", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="human_sexuality", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_human_sexuality_helm = LightevalTaskConfig( - name="mmlu:human_sexuality", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="human_sexuality", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - 
Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_international_law_original = LightevalTaskConfig( - name="mmlu:international_law", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_international_law, - hf_repo="cais/mmlu", - hf_subset="international_law", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_international_law_leaderboard = LightevalTaskConfig( - name="mmlu:international_law", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="international_law", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_international_law_helm = LightevalTaskConfig( - name="mmlu:international_law", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="international_law", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_jurisprudence_original = LightevalTaskConfig( - name="mmlu:jurisprudence", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_jurisprudence, - hf_repo="cais/mmlu", - hf_subset="jurisprudence", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_jurisprudence_leaderboard = LightevalTaskConfig( - name="mmlu:jurisprudence", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="jurisprudence", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_jurisprudence_helm = LightevalTaskConfig( - name="mmlu:jurisprudence", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="jurisprudence", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - 
Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_logical_fallacies_original = LightevalTaskConfig( - name="mmlu:logical_fallacies", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_logical_fallacies, - hf_repo="cais/mmlu", - hf_subset="logical_fallacies", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_logical_fallacies_leaderboard = LightevalTaskConfig( - name="mmlu:logical_fallacies", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="logical_fallacies", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_logical_fallacies_helm = LightevalTaskConfig( - name="mmlu:logical_fallacies", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="logical_fallacies", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_machine_learning_original = LightevalTaskConfig( - name="mmlu:machine_learning", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_machine_learning, - hf_repo="cais/mmlu", - hf_subset="machine_learning", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_machine_learning_leaderboard = LightevalTaskConfig( - name="mmlu:machine_learning", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="machine_learning", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_machine_learning_helm = LightevalTaskConfig( - name="mmlu:machine_learning", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="machine_learning", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - 
"normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_management_original = LightevalTaskConfig( - name="mmlu:management", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_management, - hf_repo="cais/mmlu", - hf_subset="management", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_management_leaderboard = LightevalTaskConfig( - name="mmlu:management", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="management", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_management_helm = LightevalTaskConfig( - name="mmlu:management", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="management", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_marketing_original = LightevalTaskConfig( - name="mmlu:marketing", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_marketing, - hf_repo="cais/mmlu", - hf_subset="marketing", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_marketing_leaderboard = LightevalTaskConfig( - name="mmlu:marketing", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="marketing", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_marketing_helm = LightevalTaskConfig( - name="mmlu:marketing", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="marketing", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) 
-mmlu_medical_genetics_original = LightevalTaskConfig( - name="mmlu:medical_genetics", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_medical_genetics, - hf_repo="cais/mmlu", - hf_subset="medical_genetics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_medical_genetics_leaderboard = LightevalTaskConfig( - name="mmlu:medical_genetics", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="medical_genetics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_medical_genetics_helm = LightevalTaskConfig( - name="mmlu:medical_genetics", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="medical_genetics", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_miscellaneous_original = LightevalTaskConfig( - name="mmlu:miscellaneous", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_miscellaneous, - hf_repo="cais/mmlu", - hf_subset="miscellaneous", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_miscellaneous_leaderboard = LightevalTaskConfig( - name="mmlu:miscellaneous", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="miscellaneous", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_miscellaneous_helm = LightevalTaskConfig( - name="mmlu:miscellaneous", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="miscellaneous", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_moral_disputes_original = LightevalTaskConfig( - name="mmlu:moral_disputes", - 
suite=["original", "mmlu"], - prompt_function=prompt.mmlu_moral_disputes, - hf_repo="cais/mmlu", - hf_subset="moral_disputes", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_moral_disputes_leaderboard = LightevalTaskConfig( - name="mmlu:moral_disputes", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="moral_disputes", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_moral_disputes_helm = LightevalTaskConfig( - name="mmlu:moral_disputes", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="moral_disputes", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_moral_scenarios_original = LightevalTaskConfig( - name="mmlu:moral_scenarios", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_moral_scenarios, - hf_repo="cais/mmlu", - hf_subset="moral_scenarios", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_moral_scenarios_leaderboard = LightevalTaskConfig( - name="mmlu:moral_scenarios", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="moral_scenarios", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_moral_scenarios_helm = LightevalTaskConfig( - name="mmlu:moral_scenarios", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="moral_scenarios", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_nutrition_original = LightevalTaskConfig( - name="mmlu:nutrition", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_nutrition, - hf_repo="cais/mmlu", - 
hf_subset="nutrition", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_nutrition_leaderboard = LightevalTaskConfig( - name="mmlu:nutrition", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="nutrition", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_nutrition_helm = LightevalTaskConfig( - name="mmlu:nutrition", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="nutrition", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_philosophy_original = LightevalTaskConfig( - name="mmlu:philosophy", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_philosophy, - hf_repo="cais/mmlu", - hf_subset="philosophy", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_philosophy_leaderboard = LightevalTaskConfig( - name="mmlu:philosophy", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="philosophy", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_philosophy_helm = LightevalTaskConfig( - name="mmlu:philosophy", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="philosophy", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_prehistory_original = LightevalTaskConfig( - name="mmlu:prehistory", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_prehistory, - hf_repo="cais/mmlu", - hf_subset="prehistory", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - 
generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_prehistory_leaderboard = LightevalTaskConfig( - name="mmlu:prehistory", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="prehistory", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_prehistory_helm = LightevalTaskConfig( - name="mmlu:prehistory", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="prehistory", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_professional_accounting_original = LightevalTaskConfig( - name="mmlu:professional_accounting", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_professional_accounting, - hf_repo="cais/mmlu", - hf_subset="professional_accounting", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_professional_accounting_leaderboard = LightevalTaskConfig( - name="mmlu:professional_accounting", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="professional_accounting", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_professional_accounting_helm = LightevalTaskConfig( - name="mmlu:professional_accounting", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="professional_accounting", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_professional_law_original = LightevalTaskConfig( - name="mmlu:professional_law", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_professional_law, - hf_repo="cais/mmlu", - hf_subset="professional_law", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - 
metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_professional_law_leaderboard = LightevalTaskConfig( - name="mmlu:professional_law", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="professional_law", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_professional_law_helm = LightevalTaskConfig( - name="mmlu:professional_law", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="professional_law", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_professional_medicine_original = LightevalTaskConfig( - name="mmlu:professional_medicine", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_professional_medicine, - hf_repo="cais/mmlu", - hf_subset="professional_medicine", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_professional_medicine_leaderboard = LightevalTaskConfig( - name="mmlu:professional_medicine", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="professional_medicine", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_professional_medicine_helm = LightevalTaskConfig( - name="mmlu:professional_medicine", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="professional_medicine", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_professional_psychology_original = LightevalTaskConfig( - name="mmlu:professional_psychology", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_professional_psychology, - hf_repo="cais/mmlu", - hf_subset="professional_psychology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", 
- generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_professional_psychology_leaderboard = LightevalTaskConfig( - name="mmlu:professional_psychology", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="professional_psychology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_professional_psychology_helm = LightevalTaskConfig( - name="mmlu:professional_psychology", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="professional_psychology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_public_relations_original = LightevalTaskConfig( - name="mmlu:public_relations", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_public_relations, - hf_repo="cais/mmlu", - hf_subset="public_relations", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_public_relations_leaderboard = LightevalTaskConfig( - name="mmlu:public_relations", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="public_relations", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_public_relations_helm = LightevalTaskConfig( - name="mmlu:public_relations", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="public_relations", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_security_studies_original = LightevalTaskConfig( - name="mmlu:security_studies", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_security_studies, - hf_repo="cais/mmlu", - hf_subset="security_studies", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - 
generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_security_studies_leaderboard = LightevalTaskConfig( - name="mmlu:security_studies", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="security_studies", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_security_studies_helm = LightevalTaskConfig( - name="mmlu:security_studies", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="security_studies", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_sociology_original = LightevalTaskConfig( - name="mmlu:sociology", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_sociology, - hf_repo="cais/mmlu", - hf_subset="sociology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_sociology_leaderboard = LightevalTaskConfig( - name="mmlu:sociology", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="sociology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_sociology_helm = LightevalTaskConfig( - name="mmlu:sociology", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="sociology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_us_foreign_policy_original = LightevalTaskConfig( - name="mmlu:us_foreign_policy", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_us_foreign_policy, - hf_repo="cais/mmlu", - hf_subset="us_foreign_policy", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) 
-mmlu_us_foreign_policy_leaderboard = LightevalTaskConfig( - name="mmlu:us_foreign_policy", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="us_foreign_policy", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_us_foreign_policy_helm = LightevalTaskConfig( - name="mmlu:us_foreign_policy", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="us_foreign_policy", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_virology_original = LightevalTaskConfig( - name="mmlu:virology", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_virology, - hf_repo="cais/mmlu", - hf_subset="virology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_virology_leaderboard = LightevalTaskConfig( - name="mmlu:virology", - suite=["leaderboard", "mmlu"], - prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="virology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_virology_helm = LightevalTaskConfig( - name="mmlu:virology", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="virology", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mmlu_world_religions_original = LightevalTaskConfig( - name="mmlu:world_religions", - suite=["original", "mmlu"], - prompt_function=prompt.mmlu_world_religions, - hf_repo="cais/mmlu", - hf_subset="world_religions", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_world_religions_leaderboard = LightevalTaskConfig( - name="mmlu:world_religions", - suite=["leaderboard", "mmlu"], - 
prompt_function=prompt.mmlu_harness, - hf_repo="lighteval/mmlu", - hf_subset="world_religions", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select="sequential", - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mmlu_world_religions_helm = LightevalTaskConfig( - name="mmlu:world_religions", - suite=["helm", "helm_general"], - prompt_function=prompt.mmlu_helm, - hf_repo="lighteval/mmlu", - hf_subset="world_religions", - hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], - evaluation_splits=["test"], - few_shots_split="dev", - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -mnist_ascii_bigbench = LightevalTaskConfig( - name="mnist_ascii", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="mnist_ascii", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -modified_arithmetic_bigbench = LightevalTaskConfig( - name="modified_arithmetic", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="modified_arithmetic", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -moral_permissibility_bigbench = LightevalTaskConfig( - name="moral_permissibility", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="moral_permissibility", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -movie_dialog_same_or_different_bigbench = LightevalTaskConfig( - name="movie_dialog_same_or_different", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="movie_dialog_same_or_different", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -movie_recommendation_bigbench = LightevalTaskConfig( - name="movie_recommendation", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="movie_recommendation", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mtnt2019_en_fr_lighteval = LightevalTaskConfig( - 
name="mtnt2019:en-fr", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="mtnt2019_en-fr", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=200, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -mtnt2019_en_ja_lighteval = LightevalTaskConfig( - name="mtnt2019:en-ja", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="mtnt2019_en-ja", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=200, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -mtnt2019_fr_en_lighteval = LightevalTaskConfig( - name="mtnt2019:fr-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="mtnt2019_fr-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=200, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -mtnt2019_ja_en_lighteval = LightevalTaskConfig( - name="mtnt2019:ja-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="mtnt2019_ja-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=200, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -mult_data_wrangling_bigbench = LightevalTaskConfig( - name="mult_data_wrangling", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="mult_data_wrangling", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -multiemo_bigbench = LightevalTaskConfig( - name="multiemo", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="multiemo", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -musr_murder_mysteries = LightevalTaskConfig( - name="musr:murder_mysteries", - suite=["lighteval"], - prompt_function=prompt.musr, - hf_repo="TAUR-Lab/MuSR", - hf_subset="default", - hf_avail_splits=["murder_mysteries"], - evaluation_splits=["murder_mysteries"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -musr_object_placements = LightevalTaskConfig( - name="musr:object_placements", - suite=["lighteval"], - prompt_function=prompt.musr, - hf_repo="TAUR-Lab/MuSR", - hf_subset="default", - hf_avail_splits=["object_placements"], - evaluation_splits=["object_placements"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -musr_team_allocation = LightevalTaskConfig( - 
name="musr:team_allocation", - suite=["lighteval"], - prompt_function=prompt.musr, - hf_repo="TAUR-Lab/MuSR", - hf_subset="default", - hf_avail_splits=["team_allocation"], - evaluation_splits=["team_allocation"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -mutual_lighteval = LightevalTaskConfig( - name="mutual", - suite=["lighteval"], - prompt_function=prompt.mutual, - hf_repo="lighteval/mutual_harness", - hf_subset="mutual", - hf_avail_splits=["train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.recall_at_k, Metrics.recall_at_k(sample_params={"k": 2}), Metrics.mrr], - stop_sequence=["\n"], - version=0, -) -mutual_plus_lighteval = LightevalTaskConfig( - name="mutual_plus", - suite=["lighteval"], - prompt_function=prompt.mutual, - hf_repo="lighteval/mutual_harness", - hf_subset="mutual_plus", - hf_avail_splits=["train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.recall_at_k, Metrics.recall_at_k(sample_params={"k": 2}), Metrics.mrr], - stop_sequence=["\n"], - version=0, -) -narrativeqa_helm = LightevalTaskConfig( - name="narrativeqa", - suite=["helm", "helm_general"], - prompt_function=prompt.narrativeqa, - hf_repo="lighteval/narrative_qa_helm", - hf_subset="default", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - Metrics.rougeL, - Metrics.bleu_1, - Metrics.bleu_4, - ], - stop_sequence=["\n"], - version=0, -) -natural_instructions_bigbench = LightevalTaskConfig( - name="natural_instructions", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="natural_instructions", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5], - stop_sequence=["\n"], - version=0, -) -natural_questions = LightevalTaskConfig( - name="natural_questions", - prompt_function=get_qa_prompt_function( - Language.ENGLISH, - lambda line: {"question": line["question"], "choices": [line["answer"]]}, - ), - suite=("lighteval",), - hf_repo="lighteval/small_natural_questions", - hf_subset="default", - evaluation_splits=("test",), - few_shots_split="few_shot", - generation_size=250, - stop_sequence=["\n", "Question:", "question:"], - metrics=( - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ), -) -navigate_bigbench = LightevalTaskConfig( - name="navigate", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="navigate", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -nonsense_words_grammar_bigbench 
= LightevalTaskConfig( - name="nonsense_words_grammar", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="nonsense_words_grammar", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -novel_concepts_bigbench_lite = LightevalTaskConfig( - name="novel_concepts", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="novel_concepts", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -numeracy_linear_example_helm = LightevalTaskConfig( - name="numeracy:linear_example", - suite=["helm"], - prompt_function=prompt.numeracy, - hf_repo="lighteval/numeracy", - hf_subset="linear_example", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n"], - version=0, -) -numeracy_linear_standard_helm = LightevalTaskConfig( - name="numeracy:linear_standard", - suite=["helm"], - prompt_function=prompt.numeracy, - hf_repo="lighteval/numeracy", - hf_subset="linear_standard", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n"], - version=0, -) -numeracy_parabola_example_helm = LightevalTaskConfig( - name="numeracy:parabola_example", - suite=["helm"], - prompt_function=prompt.numeracy, - hf_repo="lighteval/numeracy", - hf_subset="parabola_example", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n"], - version=0, -) -numeracy_parabola_standard_helm = LightevalTaskConfig( - name="numeracy:parabola_standard", - suite=["helm"], - prompt_function=prompt.numeracy, - hf_repo="lighteval/numeracy", - hf_subset="parabola_standard", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n"], - version=0, -) -numeracy_paraboloid_example_helm = LightevalTaskConfig( - name="numeracy:paraboloid_example", - suite=["helm"], - prompt_function=prompt.numeracy, - hf_repo="lighteval/numeracy", - hf_subset="paraboloid_example", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - 
stop_sequence=["\n"], - version=0, -) -numeracy_paraboloid_standard_helm = LightevalTaskConfig( - name="numeracy:paraboloid_standard", - suite=["helm"], - prompt_function=prompt.numeracy, - hf_repo="lighteval/numeracy", - hf_subset="paraboloid_standard", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n"], - version=0, -) -numeracy_plane_example_helm = LightevalTaskConfig( - name="numeracy:plane_example", - suite=["helm"], - prompt_function=prompt.numeracy, - hf_repo="lighteval/numeracy", - hf_subset="plane_example", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n"], - version=0, -) -numeracy_plane_standard_helm = LightevalTaskConfig( - name="numeracy:plane_standard", - suite=["helm"], - prompt_function=prompt.numeracy, - hf_repo="lighteval/numeracy", - hf_subset="plane_standard", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ], - stop_sequence=["\n"], - version=0, -) -object_counting_bigbench = LightevalTaskConfig( - name="object_counting", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="object_counting", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -odd_one_out_bigbench = LightevalTaskConfig( - name="odd_one_out", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="odd_one_out", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -openbookqa_helm = LightevalTaskConfig( - name="openbookqa", - suite=["helm", "commonsense_scenario", "helm_general"], - prompt_function=prompt.openbookqa_helm, - hf_repo="openbookqa", - hf_subset="main", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -openbookqa_lighteval = LightevalTaskConfig( - name="openbookqa", - suite=["lighteval"], - prompt_function=prompt.openbookqa, - hf_repo="openbookqa", - hf_subset="main", - hf_avail_splits=["train", "test", 
"validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=["\n"], - version=0, -) -operators_bigbench_lite = LightevalTaskConfig( - name="operators", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="operators", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -paragraph_segmentation_bigbench = LightevalTaskConfig( - name="paragraph_segmentation", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="paragraph_segmentation", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -parsinlu_qa_bigbench = LightevalTaskConfig( - name="parsinlu_qa", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="parsinlu_qa", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -parsinlu_reading_comprehension_bigbench_lite = LightevalTaskConfig( - name="parsinlu_reading_comprehension", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="parsinlu_reading_comprehension", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=None, - version=0, -) -penguins_in_a_table_bigbench = LightevalTaskConfig( - name="penguins_in_a_table", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="penguins_in_a_table", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -periodic_elements_bigbench = LightevalTaskConfig( - name="periodic_elements", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="periodic_elements", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -persian_idioms_bigbench = LightevalTaskConfig( - name="persian_idioms", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - 
hf_subset="persian_idioms", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -phrase_relatedness_bigbench = LightevalTaskConfig( - name="phrase_relatedness", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="phrase_relatedness", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -physical_intuition_bigbench = LightevalTaskConfig( - name="physical_intuition", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="physical_intuition", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -physics_bigbench = LightevalTaskConfig( - name="physics", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="physics", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -physics_questions_bigbench = LightevalTaskConfig( - name="physics_questions", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="physics_questions", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -piqa_lighteval = LightevalTaskConfig( - name="piqa", - suite=["lighteval"], - prompt_function=prompt.piqa_harness, - hf_repo="ybisk/piqa", - hf_subset="plain_text", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=["\n"], - version=0, -) -piqa_helm = LightevalTaskConfig( - name="piqa", - suite=["helm", "commonsense_scenario"], - prompt_function=prompt.piqa_helm, - hf_repo="ybisk/piqa", - hf_subset="plain_text", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -play_dialog_same_or_different_bigbench_lite = LightevalTaskConfig( - name="play_dialog_same_or_different", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - 
prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="play_dialog_same_or_different", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -polish_sequence_labeling_bigbench = LightevalTaskConfig( - name="polish_sequence_labeling", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="polish_sequence_labeling", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.f1_score], - stop_sequence=["\n"], - version=0, -) -presuppositions_as_nli_bigbench = LightevalTaskConfig( - name="presuppositions_as_nli", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="presuppositions_as_nli", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -prost_lighteval = LightevalTaskConfig( - name="prost", - suite=["lighteval"], - prompt_function=prompt.prost, - hf_repo="lighteval/prost", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=["\n"], - version=0, -) -pubmedqa_lighteval = LightevalTaskConfig( - name="pubmedqa", - suite=["lighteval"], - prompt_function=prompt.pubmed_qa, - hf_repo="pubmed_qa", - hf_subset="pqa_labeled", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -pubmedqa_helm = LightevalTaskConfig( - name="pubmedqa", - suite=["helm"], - prompt_function=prompt.pubmed_qa_helm, - hf_repo="pubmed_qa", - hf_subset="pqa_labeled", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -qa4mre_2011_lighteval = LightevalTaskConfig( - name="qa4mre:2011", - suite=["lighteval"], - prompt_function=prompt.qa4mre, - hf_repo="qa4mre", - hf_subset="2011.main.EN", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=["\n"], - version=0, -) -qa4mre_2012_lighteval = LightevalTaskConfig( - name="qa4mre:2012", - suite=["lighteval"], - prompt_function=prompt.qa4mre, - hf_repo="qa4mre", - 
hf_subset="2012.main.EN", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=["\n"], - version=0, -) -qa4mre_2013_lighteval = LightevalTaskConfig( - name="qa4mre:2013", - suite=["lighteval"], - prompt_function=prompt.qa4mre, - hf_repo="qa4mre", - hf_subset="2013.main.EN", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=["\n"], - version=0, -) -qa_wikidata_bigbench = LightevalTaskConfig( - name="qa_wikidata", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="qa_wikidata", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.bleurt, - Metrics.bleu, - Metrics.rouge_t5, - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["\n"], - version=0, -) -qasper_lighteval = LightevalTaskConfig( - name="qasper", - suite=["lighteval"], - prompt_function=prompt.qasper, - hf_repo="allenai/qasper", - hf_subset="qasper", - hf_avail_splits=["train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer})], - stop_sequence=["\n"], - version=0, -) -qasper_ll_lighteval = LightevalTaskConfig( - name="qasper_ll", - suite=["lighteval"], - prompt_function=prompt.qasper_ll, - hf_repo="allenai/qasper", - hf_subset="qasper", - hf_avail_splits=["train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -quac_helm = LightevalTaskConfig( - name="quac", - suite=["helm"], - prompt_function=prompt.quac, - hf_repo="lighteval/quac_helm", - hf_subset="default", - hf_avail_splits=["train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.f1_score, - ], - stop_sequence=["\n"], - version=0, -) -question_selection_bigbench = LightevalTaskConfig( - name="question_selection", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="question_selection", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -race_high_lighteval = LightevalTaskConfig( - name="race:high", - suite=["lighteval", "race"], - prompt_function=prompt.race, - hf_repo="EleutherAI/race", - hf_subset="high", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - 
metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -raft_ade_corpus_v2_helm = LightevalTaskConfig( - name="raft:ade_corpus_v2", - suite=["helm", "helm_general"], - prompt_function=prompt.raft_ade_corpus_v2, - hf_repo="ought/raft", - hf_subset="ade_corpus_v2", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -raft_banking_77_helm = LightevalTaskConfig( - name="raft:banking_77", - suite=["helm", "helm_general"], - prompt_function=prompt.raft_banking_77, - hf_repo="ought/raft", - hf_subset="banking_77", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -raft_neurips_impact_statement_risks_helm = LightevalTaskConfig( - name="raft:neurips_impact_statement_risks", - suite=["helm", "helm_general"], - prompt_function=prompt.raft_neurips_impact_statement_risks, - hf_repo="ought/raft", - hf_subset="neurips_impact_statement_risks", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -raft_one_stop_english_helm = LightevalTaskConfig( - name="raft:one_stop_english", - suite=["helm", "helm_general"], - prompt_function=prompt.raft_one_stop_english, - hf_repo="ought/raft", - hf_subset="one_stop_english", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -raft_overruling_helm = LightevalTaskConfig( - name="raft:overruling", - suite=["helm", "helm_general"], - prompt_function=prompt.raft_overruling, - 
hf_repo="ought/raft", - hf_subset="overruling", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -raft_semiconductor_org_types_helm = LightevalTaskConfig( - name="raft:semiconductor_org_types", - suite=["helm", "helm_general"], - prompt_function=prompt.raft_semiconductor_org_types, - hf_repo="ought/raft", - hf_subset="semiconductor_org_types", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -raft_systematic_review_inclusion_helm = LightevalTaskConfig( - name="raft:systematic_review_inclusion", - suite=["helm", "helm_general"], - prompt_function=prompt.raft_systematic_review_inclusion, - hf_repo="ought/raft", - hf_subset="systematic_review_inclusion", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -raft_tai_safety_research_helm = LightevalTaskConfig( - name="raft:tai_safety_research", - suite=["helm", "helm_general"], - prompt_function=prompt.raft_tai_safety_research, - hf_repo="ought/raft", - hf_subset="tai_safety_research", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -raft_terms_of_service_helm = LightevalTaskConfig( - name="raft:terms_of_service", - suite=["helm", "helm_general"], - prompt_function=prompt.raft_terms_of_service, - hf_repo="ought/raft", - hf_subset="terms_of_service", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - 
generation_size=30, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -raft_tweet_eval_hate_helm = LightevalTaskConfig( - name="raft:tweet_eval_hate", - suite=["helm", "helm_general"], - prompt_function=prompt.raft_tweet_eval_hate, - hf_repo="ought/raft", - hf_subset="tweet_eval_hate", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -raft_twitter_complaints_helm = LightevalTaskConfig( - name="raft:twitter_complaints", - suite=["helm", "helm_general"], - prompt_function=prompt.raft_twitter_complaints, - hf_repo="ought/raft", - hf_subset="twitter_complaints", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=30, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score_macro, - Metrics.f1_score_micro, - ], - stop_sequence=["\n"], - version=0, -) -real_or_fake_text_bigbench = LightevalTaskConfig( - name="real_or_fake_text", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="real_or_fake_text", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -real_toxicity_prompts_helm = LightevalTaskConfig( - name="real_toxicity_prompts", - suite=["helm"], - prompt_function=prompt.real_toxicity_prompts, - hf_repo="allenai/real-toxicity-prompts", - hf_subset="default", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[Metrics.prediction_perplexity], - stop_sequence=["\n"], - version=0, -) -reasoning_about_colored_objects_bigbench = LightevalTaskConfig( - name="reasoning_about_colored_objects", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="reasoning_about_colored_objects", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -repeat_copy_logic_bigbench_lite 
= LightevalTaskConfig( - name="repeat_copy_logic", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="repeat_copy_logic", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -rephrase_bigbench = LightevalTaskConfig( - name="rephrase", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="rephrase", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.rouge_t5, - Metrics.bleu, - Metrics.loglikelihood_acc, - Metrics.exact_match(sample_params={"strip_strings": False}), - ], - stop_sequence=["\n"], - version=0, -) -rhyming_bigbench = LightevalTaskConfig( - name="rhyming", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="rhyming", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -riddle_sense_bigbench = LightevalTaskConfig( - name="riddle_sense", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="riddle_sense", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -ruin_names_bigbench = LightevalTaskConfig( - name="ruin_names", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="ruin_names", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -salient_translation_error_detection_bigbench = LightevalTaskConfig( - name="salient_translation_error_detection", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="salient_translation_error_detection", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -scientific_press_release_bigbench = LightevalTaskConfig( - name="scientific_press_release", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="scientific_press_release", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -sciq_lighteval = LightevalTaskConfig( - name="sciq", - suite=["lighteval"], - prompt_function=prompt.sciq, - hf_repo="sciq", - hf_subset="default", - hf_avail_splits=["train", 
"validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=["\n"], - version=0, -) -semantic_parsing_in_context_sparc_bigbench = LightevalTaskConfig( - name="semantic_parsing_in_context_sparc", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="semantic_parsing_in_context_sparc", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -semantic_parsing_spider_bigbench = LightevalTaskConfig( - name="semantic_parsing_spider", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="semantic_parsing_spider", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -sentence_ambiguity_bigbench = LightevalTaskConfig( - name="sentence_ambiguity", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="sentence_ambiguity", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -similarities_abstraction_bigbench = LightevalTaskConfig( - name="similarities_abstraction", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="similarities_abstraction", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -simp_turing_concept_bigbench = LightevalTaskConfig( - name="simp_turing_concept", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="simp_turing_concept", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -simpleqa = LightevalTaskConfig( - name="simpleqa", - suite=["lighteval"], - prompt_function=prompt.simpleqa, - hf_repo="lighteval/SimpleQA", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split="few_shot", - few_shots_select=None, - generation_size=2048, - metrics=[Metrics.simpleqa_judge], - stop_sequence=["\n"], - version=0, -) -simple_arithmetic_json_bigbench = LightevalTaskConfig( - name="simple_arithmetic_json", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="simple_arithmetic_json", - hf_avail_splits=["default", "train", 
"validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -simple_arithmetic_json_multiple_choice_bigbench = LightevalTaskConfig( - name="simple_arithmetic_json_multiple_choice", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="simple_arithmetic_json_multiple_choice", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -simple_arithmetic_json_subtasks_bigbench = LightevalTaskConfig( - name="simple_arithmetic_json_subtasks", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="simple_arithmetic_json_subtasks", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -simple_arithmetic_multiple_targets_json_bigbench = LightevalTaskConfig( - name="simple_arithmetic_multiple_targets_json", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="simple_arithmetic_multiple_targets_json", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -simple_ethical_questions_bigbench = LightevalTaskConfig( - name="simple_ethical_questions", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="simple_ethical_questions", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -simple_text_editing_bigbench = LightevalTaskConfig( - name="simple_text_editing", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="simple_text_editing", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -siqa_helm = LightevalTaskConfig( - name="siqa", - suite=["helm", "commonsense_scenario"], - prompt_function=prompt.siqa, - hf_repo="allenai/social_i_qa", - hf_subset="default", - hf_avail_splits=["train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - 
version=0, -) -snarks_bigbench = LightevalTaskConfig( - name="snarks", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="snarks", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -social_iqa_bigbench = LightevalTaskConfig( - name="social_iqa", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="social_iqa", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -social_support_bigbench = LightevalTaskConfig( - name="social_support", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="social_support", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.f1_score_macro], - stop_sequence=["\n"], - version=0, -) -sports_understanding_bigbench = LightevalTaskConfig( - name="sports_understanding", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="sports_understanding", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -squad_v2 = LightevalTaskConfig( - name="squad_v2", - prompt_function=get_qa_prompt_function( - Language.ENGLISH, - lambda line: { - "question": line["question"], - "context": line["context"], - "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], - }, - ), - suite=("lighteval",), - hf_repo="rajpurkar/squad_v2", - hf_subset="squad_v2", - hf_filter=lambda line: any(ans for ans in line["answers"]["text"] if len(ans) > 0), - evaluation_splits=("validation",), - few_shots_split="train", - stop_sequence=["\n", "Question:", "question:"], - generation_size=200, - metrics=( - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - Metrics.f1_score(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - ), -) -storycloze_2016_lighteval = LightevalTaskConfig( - name="storycloze:2016", - suite=["lighteval", "storycloze"], - prompt_function=prompt.storycloze, - hf_repo="MoE-UNC/story_cloze", - hf_subset="2016", - hf_avail_splits=["validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -storycloze_2018_lighteval = LightevalTaskConfig( - name="storycloze:2018", - suite=["lighteval", "storycloze"], - prompt_function=prompt.storycloze, - hf_repo="MoE-UNC/story_cloze", - hf_subset="2018", - hf_avail_splits=["validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -strange_stories_bigbench_lite = LightevalTaskConfig( - 
name="strange_stories", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="strange_stories", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -strategyqa_bigbench_lite = LightevalTaskConfig( - name="strategyqa", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="strategyqa", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -sufficient_information_bigbench = LightevalTaskConfig( - name="sufficient_information", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="sufficient_information", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -suicide_risk_bigbench = LightevalTaskConfig( - name="suicide_risk", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="suicide_risk", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -summarization_cnn_dm_helm = LightevalTaskConfig( - name="summarization:cnn-dm", - suite=["helm", "helm_general"], - prompt_function=prompt.cnn_dm, - hf_repo="lighteval/summarization", - hf_subset="cnn-dm", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metrics=[ - Metrics.rouge1, - Metrics.rouge2, - Metrics.rougeL, - Metrics.faithfulness, - Metrics.extractiveness, - Metrics.bert_score, - ], - stop_sequence=["\n"], - version=0, -) -summarization_xsum_helm = LightevalTaskConfig( - name="summarization:xsum", - suite=["helm", "helm_general"], - prompt_function=prompt.xsum, - hf_repo="lighteval/summarization", - hf_subset="xsum", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=64, - metrics=[ - Metrics.rouge1, - Metrics.rouge2, - Metrics.rougeL, - Metrics.faithfulness, - Metrics.extractiveness, - Metrics.bert_score, - ], - stop_sequence=["\n"], - version=0, -) -summarization_xsum_sampled_helm = LightevalTaskConfig( - name="summarization:xsum-sampled", - suite=["helm"], - prompt_function=prompt.xsum, - hf_repo="lighteval/summarization", - hf_subset="xsum-sampled", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=64, - metrics=[ - Metrics.rouge1, - Metrics.rouge2, - Metrics.rougeL, - Metrics.faithfulness, - Metrics.extractiveness, - Metrics.bert_score, - ], - stop_sequence=["\n"], 
- version=0, -) -super_glue_boolq_lighteval = LightevalTaskConfig( - name="super_glue:boolq", - suite=["lighteval", "superglue"], - prompt_function=prompt.boolq_harness, - hf_repo="super_glue", - hf_subset="boolq", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -super_glue_cb_lighteval = LightevalTaskConfig( - name="super_glue:cb", - suite=["lighteval", "superglue"], - prompt_function=prompt.cb, - hf_repo="super_glue", - hf_subset="cb", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc, Metrics.multi_f1_numeric], - stop_sequence=["\n"], - version=0, -) -super_glue_copa_lighteval = LightevalTaskConfig( - name="super_glue:copa", - suite=["lighteval", "superglue"], - prompt_function=prompt.copa, - hf_repo="super_glue", - hf_subset="copa", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -super_glue_rte_lighteval = LightevalTaskConfig( - name="super_glue:rte", - suite=["lighteval", "superglue"], - prompt_function=prompt.rte, - hf_repo="super_glue", - hf_subset="rte", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -super_glue_multirc_lighteval = LightevalTaskConfig( - name="super_glue:multirc", - suite=["lighteval", "superglue"], - prompt_function=prompt.multirc, - hf_repo="super_glue", - hf_subset="multirc", - hf_avail_splits=["train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -super_glue_wic_lighteval = LightevalTaskConfig( - name="super_glue:wic", - suite=["lighteval", "superglue"], - prompt_function=prompt.wic, - hf_repo="super_glue", - hf_subset="wic", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -super_glue_wsc_lighteval = LightevalTaskConfig( - name="super_glue:wsc", - suite=["lighteval", "superglue"], - prompt_function=prompt.wsc, - hf_repo="super_glue", - hf_subset="wsc", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -swahili_english_proverbs_bigbench = LightevalTaskConfig( - name="swahili_english_proverbs", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="swahili_english_proverbs", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -swag_lighteval = LightevalTaskConfig( - name="swag", - 
suite=["lighteval"], - prompt_function=prompt.swag, - hf_repo="swag", - hf_subset="regular", - hf_avail_splits=["train", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm(ignore_first_space=True)}), - ], - stop_sequence=["\n"], - version=0, -) -swedish_to_german_proverbs_bigbench = LightevalTaskConfig( - name="swedish_to_german_proverbs", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="swedish_to_german_proverbs", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -symbol_interpretation_bigbench_lite = LightevalTaskConfig( - name="symbol_interpretation", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="symbol_interpretation", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -synthetic_reasoning_induction_helm = LightevalTaskConfig( - name="synthetic_reasoning:induction", - suite=["helm"], - prompt_function=prompt.synthetic_reasoning, - hf_repo="lighteval/synthetic_reasoning", - hf_subset="induction", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=50, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -synthetic_reasoning_natural_easy_helm = LightevalTaskConfig( - name="synthetic_reasoning:natural_easy", - suite=["helm"], - prompt_function=prompt.synthetic_reasoning_natural, - hf_repo="lighteval/synthetic_reasoning_natural", - hf_subset="easy", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[Metrics.exact_match, Metrics.f1_score], - stop_sequence=["\n"], - version=0, -) -synthetic_reasoning_natural_hard_helm = LightevalTaskConfig( - name="synthetic_reasoning:natural_hard", - suite=["helm"], - prompt_function=prompt.synthetic_reasoning_natural, - hf_repo="lighteval/synthetic_reasoning_natural", - hf_subset="hard", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[Metrics.exact_match, Metrics.f1_score], - stop_sequence=["\n"], - version=0, -) -synthetic_reasoning_pattern_match_helm = LightevalTaskConfig( - name="synthetic_reasoning:pattern_match", - suite=["helm"], - prompt_function=prompt.synthetic_reasoning, - hf_repo="lighteval/synthetic_reasoning", - hf_subset="pattern_match", - hf_avail_splits=["train", "test", 
"validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=50, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -synthetic_reasoning_variable_substitution_helm = LightevalTaskConfig( - name="synthetic_reasoning:variable_substitution", - suite=["helm"], - prompt_function=prompt.synthetic_reasoning, - hf_repo="lighteval/synthetic_reasoning", - hf_subset="variable_substitution", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=50, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -tellmewhy_bigbench = LightevalTaskConfig( - name="tellmewhy", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="tellmewhy", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5], - stop_sequence=["\n"], - version=0, -) -temporal_sequences_bigbench = LightevalTaskConfig( - name="temporal_sequences", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="temporal_sequences", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -tense_bigbench = LightevalTaskConfig( - name="tense", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="tense", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -the_pile_arxiv_helm = LightevalTaskConfig( - name="the_pile:arxiv", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="arxiv", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -the_pile_bibliotik_helm = LightevalTaskConfig( - name="the_pile:bibliotik", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="bibliotik", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, 
Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -the_pile_commoncrawl_helm = LightevalTaskConfig( - name="the_pile:commoncrawl", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="commoncrawl", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -the_pile_dm_mathematics_helm = LightevalTaskConfig( - name="the_pile:dm-mathematics", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="dm-mathematics", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -the_pile_enron_helm = LightevalTaskConfig( - name="the_pile:enron", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="enron", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -the_pile_europarl_helm = LightevalTaskConfig( - name="the_pile:europarl", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="europarl", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -the_pile_freelaw_helm = LightevalTaskConfig( - name="the_pile:freelaw", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="freelaw", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -the_pile_github_helm = LightevalTaskConfig( - name="the_pile:github", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="github", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -the_pile_gutenberg_helm = LightevalTaskConfig( - name="the_pile:gutenberg", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="gutenberg", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -the_pile_hackernews_helm = LightevalTaskConfig( - name="the_pile:hackernews", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="hackernews", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], 
- stop_sequence=["\n"], - version=0, -) -the_pile_nih_exporter_helm = LightevalTaskConfig( - name="the_pile:nih-exporter", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="nih-exporter", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -the_pile_opensubtitles_helm = LightevalTaskConfig( - name="the_pile:opensubtitles", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="opensubtitles", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -the_pile_openwebtext2_helm = LightevalTaskConfig( - name="the_pile:openwebtext2", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="openwebtext2", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -the_pile_pubmed_abstracts_helm = LightevalTaskConfig( - name="the_pile:pubmed-abstracts", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="pubmed-abstracts", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -the_pile_pubmed_central_helm = LightevalTaskConfig( - name="the_pile:pubmed-central", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="pubmed-central", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -the_pile_stackexchange_helm = LightevalTaskConfig( - name="the_pile:stackexchange", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="stackexchange", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -the_pile_upsto_helm = LightevalTaskConfig( - name="the_pile:upsto", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="uspto", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -the_pile_wikipedia_helm = LightevalTaskConfig( - name="the_pile:wikipedia", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="wikipedia", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, 
Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -the_pile_youtubesubtitles_helm = LightevalTaskConfig( - name="the_pile:youtubesubtitles", - suite=["helm"], - prompt_function=prompt.the_pile, - hf_repo="lighteval/pile_helm", - hf_subset="youtubesubtitles", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -timedial_bigbench = LightevalTaskConfig( - name="timedial", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="timedial", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -toxigen_lighteval = LightevalTaskConfig( - name="toxigen", - suite=["lighteval"], - prompt_function=prompt.toxigen, - hf_repo="skg/toxigen-data", - hf_subset="annotated", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()}), - ], - stop_sequence=["\n"], - version=0, -) -topical_chat_bigbench = LightevalTaskConfig( - name="topical_chat", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="topical_chat", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.loglikelihood_acc, Metrics.bleurt], - stop_sequence=["\n"], - version=0, -) -tracking_shuffled_objects_bigbench = LightevalTaskConfig( - name="tracking_shuffled_objects", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="tracking_shuffled_objects", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -triviaqa_lighteval = LightevalTaskConfig( - name="triviaqa", - suite=["lighteval"], - prompt_function=prompt.triviaqa, - hf_repo="trivia_qa", - hf_subset="rc.nocontext", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=20, - metrics=[ - Metrics.exact_match(sample_params={"strip_strings": True, "normalize_pred": harness_triviaqa_normalizer}) - ], - stop_sequence=["\n", ".", ","], - version=0, -) -truthfulqa_gen_lighteval = LightevalTaskConfig( - name="truthfulqa:gen", - suite=["lighteval"], - prompt_function=prompt.truthful_qa_generative, - hf_repo="truthful_qa", - hf_subset="generation", - hf_avail_splits=["validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=200, - metrics=[Metrics.bleu, Metrics.rouge_t5], - stop_sequence=["\n"], - version=0, -) -truthfulqa_mc_leaderboard = LightevalTaskConfig( - name="truthfulqa:mc", - suite=["leaderboard"], - prompt_function=prompt.truthful_qa_multiple_choice, - hf_repo="truthful_qa", - 
hf_subset="multiple_choice", - hf_avail_splits=["validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.truthfulqa_mc_metrics], - stop_sequence=["\n"], - version=0, -) -truthfulqa_helm = LightevalTaskConfig( - name="truthfulqa", - suite=["helm", "helm_general"], - prompt_function=prompt.truthful_qa_helm, - hf_repo="lighteval/truthfulqa_helm", - hf_subset="default", - hf_avail_splits=["train", "valid"], - evaluation_splits=["valid"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -twitterAAE_aa_helm = LightevalTaskConfig( - name="twitterAAE:aa", - suite=["helm"], - prompt_function=prompt.twitter_aae, - hf_repo="lighteval/twitterAAE", - hf_subset="aa", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -twitterAAE_white_helm = LightevalTaskConfig( - name="twitterAAE:white", - suite=["helm"], - prompt_function=prompt.twitter_aae, - hf_repo="lighteval/twitterAAE", - hf_subset="white", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -understanding_fables_bigbench = LightevalTaskConfig( - name="understanding_fables", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="understanding_fables", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -undo_permutation_bigbench = LightevalTaskConfig( - name="undo_permutation", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="undo_permutation", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -unit_conversion_bigbench = LightevalTaskConfig( - name="unit_conversion", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="unit_conversion", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -unit_interpretation_bigbench = LightevalTaskConfig( - name="unit_interpretation", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="unit_interpretation", - hf_avail_splits=["default", "train", "validation"], - 
evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -unnatural_in_context_learning_bigbench = LightevalTaskConfig( - name="unnatural_in_context_learning", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="unnatural_in_context_learning", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -unscramble_anagrams1_lighteval = LightevalTaskConfig( - name="unscramble:anagrams1", - suite=["lighteval", "unscramble"], - prompt_function=prompt.unscramble, - hf_repo="lighteval/GPT3_unscramble", - hf_subset="default", - hf_avail_splits=["mid_word_1_anagrams"], - evaluation_splits=["mid_word_1_anagrams"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -unscramble_anagrams2_lighteval = LightevalTaskConfig( - name="unscramble:anagrams2", - suite=["lighteval", "unscramble"], - prompt_function=prompt.unscramble, - hf_repo="lighteval/GPT3_unscramble", - hf_subset="default", - hf_avail_splits=["mid_word_2_anagrams"], - evaluation_splits=["mid_word_2_anagrams"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -unscramble_cycle_letters_lighteval = LightevalTaskConfig( - name="unscramble:cycle_letters", - suite=["lighteval", "unscramble"], - prompt_function=prompt.unscramble, - hf_repo="lighteval/GPT3_unscramble", - hf_subset="default", - hf_avail_splits=["cycle_letters_in_word"], - evaluation_splits=["cycle_letters_in_word"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -unscramble_random_insertion_lighteval = LightevalTaskConfig( - name="unscramble:random_insertion", - suite=["lighteval", "unscramble"], - prompt_function=prompt.unscramble, - hf_repo="lighteval/GPT3_unscramble", - hf_subset="default", - hf_avail_splits=["random_insertion_in_word"], - evaluation_splits=["random_insertion_in_word"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -unscramble_reversed_words_lighteval = LightevalTaskConfig( - name="unscramble:reversed_words", - suite=["lighteval", "unscramble"], - prompt_function=prompt.unscramble, - hf_repo="lighteval/GPT3_unscramble", - hf_subset="default", - hf_avail_splits=["reversed_words"], - evaluation_splits=["reversed_words"], - few_shots_split=None, - few_shots_select=None, - generation_size=5, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -vitaminc_fact_verification_bigbench_lite = LightevalTaskConfig( - name="vitaminc_fact_verification", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="vitaminc_fact_verification", - hf_avail_splits=["default", "train", "validation"], - 
evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -webqs_lighteval = LightevalTaskConfig( - name="webqs", - suite=["lighteval"], - prompt_function=prompt.webqs, - hf_repo="web_questions", - hf_subset="default", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.acc_golds_likelihood], - stop_sequence=["\n"], - version=0, -) -what_is_the_tao_bigbench = LightevalTaskConfig( - name="what_is_the_tao", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="what_is_the_tao", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -which_wiki_edit_bigbench = LightevalTaskConfig( - name="which_wiki_edit", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="which_wiki_edit", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -wikifact_applies_to_jurisdiction_helm = LightevalTaskConfig( - name="wikifact:applies_to_jurisdiction", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="applies_to_jurisdiction", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_atomic_number_helm = LightevalTaskConfig( - name="wikifact:atomic_number", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="atomic_number", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_author_helm = LightevalTaskConfig( - name="wikifact:author", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="author", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - 
sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_award_received_helm = LightevalTaskConfig( - name="wikifact:award_received", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="award_received", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_basic_form_of_government_helm = LightevalTaskConfig( - name="wikifact:basic_form_of_government", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="basic_form_of_government", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_capital_helm = LightevalTaskConfig( - name="wikifact:capital", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="capital", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_capital_of_helm = LightevalTaskConfig( - name="wikifact:capital_of", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="capital_of", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_central_bank_helm = LightevalTaskConfig( - name="wikifact:central_bank", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="central_bank", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - 
Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_composer_helm = LightevalTaskConfig( - name="wikifact:composer", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="composer", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_continent_helm = LightevalTaskConfig( - name="wikifact:continent", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="continent", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_country_helm = LightevalTaskConfig( - name="wikifact:country", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="country", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_country_of_citizenship_helm = LightevalTaskConfig( - name="wikifact:country_of_citizenship", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="country_of_citizenship", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_country_of_origin_helm = LightevalTaskConfig( - name="wikifact:country_of_origin", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - 
hf_subset="country_of_origin", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_creator_helm = LightevalTaskConfig( - name="wikifact:creator", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="creator", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_currency_helm = LightevalTaskConfig( - name="wikifact:currency", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="currency", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_defendant_helm = LightevalTaskConfig( - name="wikifact:defendant", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="defendant", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_developer_helm = LightevalTaskConfig( - name="wikifact:developer", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="developer", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) 
-wikifact_diplomatic_relation_helm = LightevalTaskConfig( - name="wikifact:diplomatic_relation", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="diplomatic_relation", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_director_helm = LightevalTaskConfig( - name="wikifact:director", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="director", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_discoverer_or_inventor_helm = LightevalTaskConfig( - name="wikifact:discoverer_or_inventor", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="discoverer_or_inventor", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_drug_or_therapy_used_for_treatment_helm = LightevalTaskConfig( - name="wikifact:drug_or_therapy_used_for_treatment", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="drug_or_therapy_used_for_treatment", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_educated_at_helm = LightevalTaskConfig( - name="wikifact:educated_at", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="educated_at", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": 
helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_electron_configuration_helm = LightevalTaskConfig( - name="wikifact:electron_configuration", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="electron_configuration", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_employer_helm = LightevalTaskConfig( - name="wikifact:employer", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="employer", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_field_of_work_helm = LightevalTaskConfig( - name="wikifact:field_of_work", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="field_of_work", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_file_extension_helm = LightevalTaskConfig( - name="wikifact:file_extension", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="file_extension", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_genetic_association_helm = LightevalTaskConfig( - name="wikifact:genetic_association", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="genetic_association", - hf_avail_splits=["train", 
"test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_genre_helm = LightevalTaskConfig( - name="wikifact:genre", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="genre", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_has_part_helm = LightevalTaskConfig( - name="wikifact:has_part", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="has_part", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_head_of_government_helm = LightevalTaskConfig( - name="wikifact:head_of_government", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="head_of_government", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_head_of_state_helm = LightevalTaskConfig( - name="wikifact:head_of_state", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="head_of_state", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_headquarters_location_helm = 
LightevalTaskConfig( - name="wikifact:headquarters_location", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="headquarters_location", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_industry_helm = LightevalTaskConfig( - name="wikifact:industry", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="industry", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_influenced_by_helm = LightevalTaskConfig( - name="wikifact:influenced_by", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="influenced_by", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_instance_of_helm = LightevalTaskConfig( - name="wikifact:instance_of", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="instance_of", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_instrument_helm = LightevalTaskConfig( - name="wikifact:instrument", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="instrument", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - 
"normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_language_of_work_or_name_helm = LightevalTaskConfig( - name="wikifact:language_of_work_or_name", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="language_of_work_or_name", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_languages_spoken_written_or_signed_helm = LightevalTaskConfig( - name="wikifact:languages_spoken_written_or_signed", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="languages_spoken_written_or_signed", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_laws_applied_helm = LightevalTaskConfig( - name="wikifact:laws_applied", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="laws_applied", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_located_in_the_administrative_territorial_entity_helm = LightevalTaskConfig( - name="wikifact:located_in_the_administrative_territorial_entity", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="located_in_the_administrative_territorial_entity", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_location_helm = LightevalTaskConfig( - name="wikifact:location", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="location", - 
hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_location_of_discovery_helm = LightevalTaskConfig( - name="wikifact:location_of_discovery", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="location_of_discovery", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_location_of_formation_helm = LightevalTaskConfig( - name="wikifact:location_of_formation", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="location_of_formation", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_majority_opinion_by_helm = LightevalTaskConfig( - name="wikifact:majority_opinion_by", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="majority_opinion_by", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_manufacturer_helm = LightevalTaskConfig( - name="wikifact:manufacturer", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="manufacturer", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": 
"prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_measured_physical_quantity_helm = LightevalTaskConfig( - name="wikifact:measured_physical_quantity", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="measured_physical_quantity", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_medical_condition_treated_helm = LightevalTaskConfig( - name="wikifact:medical_condition_treated", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="medical_condition_treated", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_member_of_helm = LightevalTaskConfig( - name="wikifact:member_of", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="member_of", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_member_of_political_party_helm = LightevalTaskConfig( - name="wikifact:member_of_political_party", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="member_of_political_party", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_member_of_sports_team_helm = LightevalTaskConfig( - name="wikifact:member_of_sports_team", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="member_of_sports_team", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - 
Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_movement_helm = LightevalTaskConfig( - name="wikifact:movement", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="movement", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_named_after_helm = LightevalTaskConfig( - name="wikifact:named_after", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="named_after", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_native_language_helm = LightevalTaskConfig( - name="wikifact:native_language", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="native_language", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_number_of_processor_cores_helm = LightevalTaskConfig( - name="wikifact:number_of_processor_cores", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="number_of_processor_cores", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_occupation_helm = LightevalTaskConfig( - name="wikifact:occupation", - suite=["helm"], - prompt_function=prompt.wikifact, - 
hf_repo="lighteval/wikifact", - hf_subset="occupation", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_office_held_by_head_of_government_helm = LightevalTaskConfig( - name="wikifact:office_held_by_head_of_government", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="office_held_by_head_of_government", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_office_held_by_head_of_state_helm = LightevalTaskConfig( - name="wikifact:office_held_by_head_of_state", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="office_held_by_head_of_state", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_official_language_helm = LightevalTaskConfig( - name="wikifact:official_language", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="official_language", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_operating_system_helm = LightevalTaskConfig( - name="wikifact:operating_system", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="operating_system", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( 
- sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_original_language_of_film_or_TV_show_helm = LightevalTaskConfig( - name="wikifact:original_language_of_film_or_TV_show", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="original_language_of_film_or_TV_show", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_original_network_helm = LightevalTaskConfig( - name="wikifact:original_network", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="original_network", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_overrules_helm = LightevalTaskConfig( - name="wikifact:overrules", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="overrules", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_owned_by_helm = LightevalTaskConfig( - name="wikifact:owned_by", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="owned_by", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_part_of_helm = LightevalTaskConfig( - name="wikifact:part_of", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="part_of", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - 
metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_participating_team_helm = LightevalTaskConfig( - name="wikifact:participating_team", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="participating_team", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_place_of_birth_helm = LightevalTaskConfig( - name="wikifact:place_of_birth", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="place_of_birth", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_place_of_death_helm = LightevalTaskConfig( - name="wikifact:place_of_death", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="place_of_death", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_plaintiff_helm = LightevalTaskConfig( - name="wikifact:plaintiff", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="plaintiff", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_position_held_helm = LightevalTaskConfig( - name="wikifact:position_held", - suite=["helm"], - 
prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="position_held", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_position_played_on_team_helm = LightevalTaskConfig( - name="wikifact:position_played_on_team", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="position_played_on_team", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_programming_language_helm = LightevalTaskConfig( - name="wikifact:programming_language", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="programming_language", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_recommended_unit_of_measurement_helm = LightevalTaskConfig( - name="wikifact:recommended_unit_of_measurement", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="recommended_unit_of_measurement", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_record_label_helm = LightevalTaskConfig( - name="wikifact:record_label", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="record_label", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - 
Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_religion_helm = LightevalTaskConfig( - name="wikifact:religion", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="religion", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_repealed_by_helm = LightevalTaskConfig( - name="wikifact:repealed_by", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="repealed_by", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_shares_border_with_helm = LightevalTaskConfig( - name="wikifact:shares_border_with", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="shares_border_with", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_solved_by_helm = LightevalTaskConfig( - name="wikifact:solved_by", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="solved_by", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_statement_describes_helm = LightevalTaskConfig( - name="wikifact:statement_describes", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="statement_describes", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - 
Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_stock_exchange_helm = LightevalTaskConfig( - name="wikifact:stock_exchange", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="stock_exchange", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_subclass_of_helm = LightevalTaskConfig( - name="wikifact:subclass_of", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="subclass_of", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_subsidiary_helm = LightevalTaskConfig( - name="wikifact:subsidiary", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="subsidiary", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_symptoms_and_signs_helm = LightevalTaskConfig( - name="wikifact:symptoms_and_signs", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="symptoms_and_signs", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_therapeutic_area_helm = LightevalTaskConfig( - name="wikifact:therapeutic_area", - suite=["helm"], - prompt_function=prompt.wikifact, - 
hf_repo="lighteval/wikifact", - hf_subset="therapeutic_area", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_time_of_discovery_or_invention_helm = LightevalTaskConfig( - name="wikifact:time_of_discovery_or_invention", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="time_of_discovery_or_invention", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_twinned_administrative_body_helm = LightevalTaskConfig( - name="wikifact:twinned_administrative_body", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="twinned_administrative_body", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikifact_work_location_helm = LightevalTaskConfig( - name="wikifact:work_location", - suite=["helm"], - prompt_function=prompt.wikifact, - hf_repo="lighteval/wikifact", - hf_subset="work_location", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=8, - metrics=[ - Metrics.exact_match, - Metrics.exact_match(sample_params={"normalize_gold": helm_normalizer, "normalize_pred": helm_normalizer}), - Metrics.exact_match(sample_params={"type_exact_match": "prefix"}), - Metrics.exact_match( - sample_params={ - "normalize_gold": helm_normalizer, - "normalize_pred": helm_normalizer, - "type_exact_match": "prefix", - } - ), - ], - stop_sequence=["\n"], - version=0, -) -wikitext_2_lighteval = LightevalTaskConfig( - name="wikitext:2", - suite=["lighteval"], - prompt_function=prompt.wikitext, - hf_repo="wikitext", - hf_subset="wikitext-2-raw-v1", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -wikitext_103_document_level_harness = LightevalTaskConfig( - name="wikitext:103:document_level", - suite=["harness"], - 
prompt_function=prompt.wikitext_harness, - hf_repo="EleutherAI/wikitext_document_level", - hf_subset="wikitext-103-raw-v1", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -wikitext_103_document_level_helm = LightevalTaskConfig( - name="wikitext:103:document_level", - suite=["helm"], - prompt_function=prompt.wikitext_helm, - hf_repo="EleutherAI/wikitext_document_level", - hf_subset="wikitext-103-raw-v1", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], - stop_sequence=["\n"], - version=0, -) -wino_x_german_bigbench = LightevalTaskConfig( - name="wino_x_german", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="wino_x_german", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -winogrande_leaderboard = LightevalTaskConfig( - name="winogrande", - suite=["leaderboard"], - prompt_function=prompt.winogrande, - hf_repo="winogrande", - hf_subset="winogrande_xl", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation"], - few_shots_split=None, - few_shots_select="random_sampling", - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -winowhy_bigbench_lite = LightevalTaskConfig( - name="winowhy", - suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function=prompt.bigbench_whitespace_after_query, - hf_repo="tasksource/bigbench", - hf_subset="winowhy", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -wmt08_cs_en_lighteval = LightevalTaskConfig( - name="wmt08:cs-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt08_cs-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt08_de_en_lighteval = LightevalTaskConfig( - name="wmt08:de-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt08_de-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt08_en_cs_lighteval = LightevalTaskConfig( - name="wmt08:en-cs", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt08_en-cs", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) 
-wmt08_en_de_lighteval = LightevalTaskConfig( - name="wmt08:en-de", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt08_en-de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt08_en_es_lighteval = LightevalTaskConfig( - name="wmt08:en-es", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt08_en-es", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt08_en_fr_lighteval = LightevalTaskConfig( - name="wmt08:en-fr", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt08_en-fr", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt08_en_hu_lighteval = LightevalTaskConfig( - name="wmt08:en-hu", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt08_en-hu", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt08_es_en_lighteval = LightevalTaskConfig( - name="wmt08:es-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt08_es-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt08_fr_en_lighteval = LightevalTaskConfig( - name="wmt08:fr-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt08_fr-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt08_hu_en_lighteval = LightevalTaskConfig( - name="wmt08:hu-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt08_hu-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt09_cs_en_lighteval = LightevalTaskConfig( - name="wmt09:cs-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt09_cs-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) 
-wmt09_de_en_lighteval = LightevalTaskConfig( - name="wmt09:de-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt09_de-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt09_en_cs_lighteval = LightevalTaskConfig( - name="wmt09:en-cs", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt09_en-cs", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt09_en_de_lighteval = LightevalTaskConfig( - name="wmt09:en-de", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt09_en-de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt09_en_es_lighteval = LightevalTaskConfig( - name="wmt09:en-es", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt09_en-es", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt09_en_fr_lighteval = LightevalTaskConfig( - name="wmt09:en-fr", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt09_en-fr", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt09_en_hu_lighteval = LightevalTaskConfig( - name="wmt09:en-hu", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt09_en-hu", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt09_en_it_lighteval = LightevalTaskConfig( - name="wmt09:en-it", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt09_en-it", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt09_es_en_lighteval = LightevalTaskConfig( - name="wmt09:es-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt09_es-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt09_fr_en_lighteval = 
LightevalTaskConfig( - name="wmt09:fr-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt09_fr-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt09_hu_en_lighteval = LightevalTaskConfig( - name="wmt09:hu-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt09_hu-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt09_it_en_lighteval = LightevalTaskConfig( - name="wmt09:it-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt09_it-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt10_cs_en_lighteval = LightevalTaskConfig( - name="wmt10:cs-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt10_cs-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt10_de_en_lighteval = LightevalTaskConfig( - name="wmt10:de-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt10_de-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt10_en_cs_lighteval = LightevalTaskConfig( - name="wmt10:en-cs", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt10_en-cs", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt10_en_de_lighteval = LightevalTaskConfig( - name="wmt10:en-de", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt10_en-de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt10_en_es_lighteval = LightevalTaskConfig( - name="wmt10:en-es", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt10_en-es", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt10_en_fr_lighteval = 
LightevalTaskConfig( - name="wmt10:en-fr", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt10_en-fr", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt10_es_en_lighteval = LightevalTaskConfig( - name="wmt10:es-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt10_es-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt10_fr_en_lighteval = LightevalTaskConfig( - name="wmt10:fr-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt10_fr-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt11_cs_en_lighteval = LightevalTaskConfig( - name="wmt11:cs-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt11_cs-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt11_de_en_lighteval = LightevalTaskConfig( - name="wmt11:de-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt11_de-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt11_en_cs_lighteval = LightevalTaskConfig( - name="wmt11:en-cs", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt11_en-cs", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt11_en_de_lighteval = LightevalTaskConfig( - name="wmt11:en-de", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt11_en-de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt11_en_es_lighteval = LightevalTaskConfig( - name="wmt11:en-es", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt11_en-es", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt11_en_fr_lighteval = 
LightevalTaskConfig( - name="wmt11:en-fr", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt11_en-fr", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt11_es_en_lighteval = LightevalTaskConfig( - name="wmt11:es-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt11_es-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt11_fr_en_lighteval = LightevalTaskConfig( - name="wmt11:fr-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt11_fr-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt12_cs_en_lighteval = LightevalTaskConfig( - name="wmt12:cs-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt12_cs-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt12_de_en_lighteval = LightevalTaskConfig( - name="wmt12:de-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt12_de-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt12_en_cs_lighteval = LightevalTaskConfig( - name="wmt12:en-cs", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt12_en-cs", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt12_en_de_lighteval = LightevalTaskConfig( - name="wmt12:en-de", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt12_en-de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt12_en_es_lighteval = LightevalTaskConfig( - name="wmt12:en-es", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt12_en-es", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt12_en_fr_lighteval = 
LightevalTaskConfig( - name="wmt12:en-fr", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt12_en-fr", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt12_es_en_lighteval = LightevalTaskConfig( - name="wmt12:es-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt12_es-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt12_fr_en_lighteval = LightevalTaskConfig( - name="wmt12:fr-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt12_fr-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt13_cs_en_lighteval = LightevalTaskConfig( - name="wmt13:cs-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt13_cs-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt13_de_en_lighteval = LightevalTaskConfig( - name="wmt13:de-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt13_de-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt13_en_cs_lighteval = LightevalTaskConfig( - name="wmt13:en-cs", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt13_en-cs", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt13_en_de_lighteval = LightevalTaskConfig( - name="wmt13:en-de", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt13_en-de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt13_en_es_lighteval = LightevalTaskConfig( - name="wmt13:en-es", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt13_en-es", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt13_en_fr_lighteval = 
LightevalTaskConfig( - name="wmt13:en-fr", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt13_en-fr", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt13_en_ru_lighteval = LightevalTaskConfig( - name="wmt13:en-ru", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt13_en-ru", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt13_es_en_lighteval = LightevalTaskConfig( - name="wmt13:es-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt13_es-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt13_fr_en_lighteval = LightevalTaskConfig( - name="wmt13:fr-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt13_fr-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt13_ru_en_lighteval = LightevalTaskConfig( - name="wmt13:ru-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt13_ru-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt14_cs_en_lighteval = LightevalTaskConfig( - name="wmt14:cs-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt14_cs-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt14_de_en_lighteval = LightevalTaskConfig( - name="wmt14:de-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt14_de-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt14_en_cs_lighteval = LightevalTaskConfig( - name="wmt14:en-cs", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt14_en-cs", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt14_en_de_lighteval = 
LightevalTaskConfig( - name="wmt14:en-de", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt14_en-de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt14_en_fr_lighteval = LightevalTaskConfig( - name="wmt14:en-fr", - suite=["lighteval", "gpt3_benchmarks"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="wmt14", - hf_subset="fr-en", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt14_en_fr_lighteval = LightevalTaskConfig( - name="wmt14:en-fr", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt14_en-fr", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt14_en_hi_lighteval = LightevalTaskConfig( - name="wmt14:en-hi", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt14_en-hi", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt14_en_ru_lighteval = LightevalTaskConfig( - name="wmt14:en-ru", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt14_en-ru", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt14_fr_en_lighteval = LightevalTaskConfig( - name="wmt14:fr-en", - suite=["lighteval", "gpt3_benchmarks"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="wmt14", - hf_subset="fr-en", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt14_fr_en_lighteval = LightevalTaskConfig( - name="wmt14:fr-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt14_fr-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt14_hi_en_lighteval = LightevalTaskConfig( - name="wmt14:hi-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt14_hi-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt14_ru_en_lighteval = 
LightevalTaskConfig( - name="wmt14:ru-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt14_ru-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt14_cs_en_helm = LightevalTaskConfig( - name="wmt14:cs-en", - suite=["helm"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/wmt_14", - hf_subset="cs-en", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.bleu], - stop_sequence=["\n"], - version=0, -) -wmt14_de_en_helm = LightevalTaskConfig( - name="wmt14:de-en", - suite=["helm"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/wmt_14", - hf_subset="de-en", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.bleu], - stop_sequence=["\n"], - version=0, -) -wmt14_fr_en_helm = LightevalTaskConfig( - name="wmt14:fr-en", - suite=["helm"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/wmt_14", - hf_subset="fr-en", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.bleu], - stop_sequence=["\n"], - version=0, -) -wmt14_hi_en_helm = LightevalTaskConfig( - name="wmt14:hi-en", - suite=["helm"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/wmt_14", - hf_subset="hi-en", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.bleu], - stop_sequence=["\n"], - version=0, -) -wmt14_ru_en_helm = LightevalTaskConfig( - name="wmt14:ru-en", - suite=["helm"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/wmt_14", - hf_subset="ru-en", - hf_avail_splits=["train", "test", "validation"], - evaluation_splits=["validation", "test"], - few_shots_split=None, - few_shots_select=None, - generation_size=100, - metrics=[Metrics.bleu], - stop_sequence=["\n"], - version=0, -) -wmt15_cs_en_lighteval = LightevalTaskConfig( - name="wmt15:cs-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt15_cs-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt15_de_en_lighteval = LightevalTaskConfig( - name="wmt15:de-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt15_de-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt15_en_cs_lighteval = LightevalTaskConfig( - name="wmt15:en-cs", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - 
hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt15_en-cs", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt15_en_de_lighteval = LightevalTaskConfig( - name="wmt15:en-de", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt15_en-de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt15_en_fi_lighteval = LightevalTaskConfig( - name="wmt15:en-fi", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt15_en-fi", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt15_en_fr_lighteval = LightevalTaskConfig( - name="wmt15:en-fr", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt15_en-fr", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt15_en_ru_lighteval = LightevalTaskConfig( - name="wmt15:en-ru", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt15_en-ru", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt15_fi_en_lighteval = LightevalTaskConfig( - name="wmt15:fi-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt15_fi-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt15_fr_en_lighteval = LightevalTaskConfig( - name="wmt15:fr-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt15_fr-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt15_ru_en_lighteval = LightevalTaskConfig( - name="wmt15:ru-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt15_ru-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt16_cs_en_lighteval = LightevalTaskConfig( - name="wmt16:cs-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - 
hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt16_cs-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt16_de_en_lighteval = LightevalTaskConfig( - name="wmt16:de-en", - suite=["lighteval", "gpt3_benchmarks"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="wmt16", - hf_subset="de-en", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt16_de_en_lighteval = LightevalTaskConfig( - name="wmt16:de-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt16_de-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt16_en_cs_lighteval = LightevalTaskConfig( - name="wmt16:en-cs", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt16_en-cs", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt16_en_de_lighteval = LightevalTaskConfig( - name="wmt16:en-de", - suite=["lighteval", "gpt3_benchmarks"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="wmt16", - hf_subset="de-en", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt16_en_de_lighteval = LightevalTaskConfig( - name="wmt16:en-de", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt16_en-de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt16_en_fi_lighteval = LightevalTaskConfig( - name="wmt16:en-fi", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt16_en-fi", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt16_en_ro_lighteval = LightevalTaskConfig( - name="wmt16:en-ro", - suite=["lighteval", "gpt3_benchmarks"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="wmt16", - hf_subset="ro-en", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt16_en_ro_lighteval = LightevalTaskConfig( - name="wmt16:en-ro", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - 
hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt16_en-ro", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt16_en_ru_lighteval = LightevalTaskConfig( - name="wmt16:en-ru", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt16_en-ru", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt16_en_tr_lighteval = LightevalTaskConfig( - name="wmt16:en-tr", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt16_en-tr", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt16_fi_en_lighteval = LightevalTaskConfig( - name="wmt16:fi-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt16_fi-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt16_ro_en_lighteval = LightevalTaskConfig( - name="wmt16:ro-en", - suite=["lighteval", "gpt3_benchmarks"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="wmt16", - hf_subset="ro-en", - hf_avail_splits=["train", "validation", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt16_ro_en_lighteval = LightevalTaskConfig( - name="wmt16:ro-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt16_ro-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt16_ru_en_lighteval = LightevalTaskConfig( - name="wmt16:ru-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt16_ru-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt16_tr_en_lighteval = LightevalTaskConfig( - name="wmt16:tr-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt16_tr-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt17_cs_en_lighteval = LightevalTaskConfig( - name="wmt17:cs-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - 
hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt17_cs-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt17_de_en_lighteval = LightevalTaskConfig( - name="wmt17:de-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt17_de-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt17_en_cs_lighteval = LightevalTaskConfig( - name="wmt17:en-cs", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt17_en-cs", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt17_en_de_lighteval = LightevalTaskConfig( - name="wmt17:en-de", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt17_en-de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt17_en_fi_lighteval = LightevalTaskConfig( - name="wmt17:en-fi", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt17_en-fi", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt17_en_lv_lighteval = LightevalTaskConfig( - name="wmt17:en-lv", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt17_en-lv", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt17_en_ru_lighteval = LightevalTaskConfig( - name="wmt17:en-ru", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt17_en-ru", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt17_en_tr_lighteval = LightevalTaskConfig( - name="wmt17:en-tr", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt17_en-tr", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt17_en_zh_lighteval = LightevalTaskConfig( - name="wmt17:en-zh", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - 
hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt17_en-zh", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt17_fi_en_lighteval = LightevalTaskConfig( - name="wmt17:fi-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt17_fi-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt17_lv_en_lighteval = LightevalTaskConfig( - name="wmt17:lv-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt17_lv-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt17_ru_en_lighteval = LightevalTaskConfig( - name="wmt17:ru-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt17_ru-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt17_tr_en_lighteval = LightevalTaskConfig( - name="wmt17:tr-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt17_tr-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt17_zh_en_lighteval = LightevalTaskConfig( - name="wmt17:zh-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt17_zh-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt18_cs_en_lighteval = LightevalTaskConfig( - name="wmt18:cs-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt18_cs-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt18_de_en_lighteval = LightevalTaskConfig( - name="wmt18:de-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt18_de-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt18_en_cs_lighteval = LightevalTaskConfig( - name="wmt18:en-cs", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - 
hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt18_en-cs", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt18_en_de_lighteval = LightevalTaskConfig( - name="wmt18:en-de", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt18_en-de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt18_en_et_lighteval = LightevalTaskConfig( - name="wmt18:en-et", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt18_en-et", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt18_en_fi_lighteval = LightevalTaskConfig( - name="wmt18:en-fi", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt18_en-fi", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt18_en_ru_lighteval = LightevalTaskConfig( - name="wmt18:en-ru", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt18_en-ru", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt18_en_tr_lighteval = LightevalTaskConfig( - name="wmt18:en-tr", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt18_en-tr", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt18_en_zh_lighteval = LightevalTaskConfig( - name="wmt18:en-zh", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt18_en-zh", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt18_et_en_lighteval = LightevalTaskConfig( - name="wmt18:et-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt18_et-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt18_fi_en_lighteval = LightevalTaskConfig( - name="wmt18:fi-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - 
hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt18_fi-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt18_ru_en_lighteval = LightevalTaskConfig( - name="wmt18:ru-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt18_ru-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt18_tr_en_lighteval = LightevalTaskConfig( - name="wmt18:tr-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt18_tr-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt18_zh_en_lighteval = LightevalTaskConfig( - name="wmt18:zh-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt18_zh-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_cs_de_lighteval = LightevalTaskConfig( - name="wmt19:cs-de", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_cs-de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_de_cs_lighteval = LightevalTaskConfig( - name="wmt19:de-cs", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_de-cs", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_de_en_lighteval = LightevalTaskConfig( - name="wmt19:de-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_de-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_de_fr_lighteval = LightevalTaskConfig( - name="wmt19:de-fr", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_de-fr", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_en_cs_lighteval = LightevalTaskConfig( - name="wmt19:en-cs", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - 
hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_en-cs", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_en_de_lighteval = LightevalTaskConfig( - name="wmt19:en-de", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_en-de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_en_fi_lighteval = LightevalTaskConfig( - name="wmt19:en-fi", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_en-fi", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_en_gu_lighteval = LightevalTaskConfig( - name="wmt19:en-gu", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_en-gu", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_en_kk_lighteval = LightevalTaskConfig( - name="wmt19:en-kk", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_en-kk", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_en_lt_lighteval = LightevalTaskConfig( - name="wmt19:en-lt", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_en-lt", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_en_ru_lighteval = LightevalTaskConfig( - name="wmt19:en-ru", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_en-ru", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_en_zh_lighteval = LightevalTaskConfig( - name="wmt19:en-zh", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_en-zh", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_fi_en_lighteval = LightevalTaskConfig( - name="wmt19:fi-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - 
hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_fi-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_fr_de_lighteval = LightevalTaskConfig( - name="wmt19:fr-de", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_fr-de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_gu_en_lighteval = LightevalTaskConfig( - name="wmt19:gu-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_gu-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_kk_en_lighteval = LightevalTaskConfig( - name="wmt19:kk-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_kk-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_lt_en_lighteval = LightevalTaskConfig( - name="wmt19:lt-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_lt-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_ru_en_lighteval = LightevalTaskConfig( - name="wmt19:ru-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_ru-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt19_zh_en_lighteval = LightevalTaskConfig( - name="wmt19:zh-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt19_zh-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_cs_en_lighteval = LightevalTaskConfig( - name="wmt20:cs-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_cs-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_de_en_lighteval = LightevalTaskConfig( - name="wmt20:de-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - 
hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_de-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_de_fr_lighteval = LightevalTaskConfig( - name="wmt20:de-fr", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_de-fr", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_en_cs_lighteval = LightevalTaskConfig( - name="wmt20:en-cs", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_en-cs", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_en_de_lighteval = LightevalTaskConfig( - name="wmt20:en-de", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_en-de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_en_iu_lighteval = LightevalTaskConfig( - name="wmt20:en-iu", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_en-iu", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_en_ja_lighteval = LightevalTaskConfig( - name="wmt20:en-ja", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_en-ja", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_en_km_lighteval = LightevalTaskConfig( - name="wmt20:en-km", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_en-km", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_en_pl_lighteval = LightevalTaskConfig( - name="wmt20:en-pl", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_en-pl", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_en_ps_lighteval = LightevalTaskConfig( - name="wmt20:en-ps", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - 
hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_en-ps", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_en_ru_lighteval = LightevalTaskConfig( - name="wmt20:en-ru", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_en-ru", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_en_ta_lighteval = LightevalTaskConfig( - name="wmt20:en-ta", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_en-ta", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_en_zh_lighteval = LightevalTaskConfig( - name="wmt20:en-zh", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_en-zh", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_fr_de_lighteval = LightevalTaskConfig( - name="wmt20:fr-de", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_fr-de", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_iu_en_lighteval = LightevalTaskConfig( - name="wmt20:iu-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_iu-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_ja_en_lighteval = LightevalTaskConfig( - name="wmt20:ja-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_ja-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_km_en_lighteval = LightevalTaskConfig( - name="wmt20:km-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_km-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_pl_en_lighteval = LightevalTaskConfig( - name="wmt20:pl-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - 
hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_pl-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_ps_en_lighteval = LightevalTaskConfig( - name="wmt20:ps-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_ps-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_ru_en_lighteval = LightevalTaskConfig( - name="wmt20:ru-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_ru-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_ta_en_lighteval = LightevalTaskConfig( - name="wmt20:ta-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_ta-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -wmt20_zh_en_lighteval = LightevalTaskConfig( - name="wmt20:zh-en", - suite=["lighteval", "sacrebleu"], - prompt_function=prompt.wmt_reverse_alphabetical, - hf_repo="lighteval/sacrebleu_manual", - hf_subset="wmt20_zh-en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=None, - metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], - stop_sequence=["\n"], - version=0, -) -word_sorting_bigbench = LightevalTaskConfig( - name="word_sorting", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="word_sorting", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -word_unscrambling_bigbench = LightevalTaskConfig( - name="word_unscrambling", - suite=["bigbench", "bigbench_json"], - prompt_function=prompt.bigbench, - hf_repo="tasksource/bigbench", - hf_subset="word_unscrambling", - hf_avail_splits=["default", "train", "validation"], - evaluation_splits=["default"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], - stop_sequence=["\n"], - version=0, -) -wsc273_lighteval = LightevalTaskConfig( - name="wsc273", - suite=["lighteval"], - prompt_function=prompt.wsc273, - hf_repo="lighteval/winograd_wsc", - hf_subset="wsc273", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xcopa_en_lighteval = LightevalTaskConfig( - name="xcopa:en", - suite=["lighteval"], - prompt_function=prompt.xcopa_en, - 
hf_repo="cambridgeltl/xcopa", - hf_subset="default", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xcopa_et_lighteval = LightevalTaskConfig( - name="xcopa:et", - suite=["lighteval"], - prompt_function=prompt.xcopa_et, - hf_repo="cambridgeltl/xcopa", - hf_subset="et", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xcopa_ht_lighteval = LightevalTaskConfig( - name="xcopa:ht", - suite=["lighteval"], - prompt_function=prompt.xcopa_ht, - hf_repo="cambridgeltl/xcopa", - hf_subset="ht", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xcopa_it_lighteval = LightevalTaskConfig( - name="xcopa:it", - suite=["lighteval"], - prompt_function=prompt.xcopa_it, - hf_repo="cambridgeltl/xcopa", - hf_subset="it", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xcopa_id_lighteval = LightevalTaskConfig( - name="xcopa:id", - suite=["lighteval"], - prompt_function=prompt.xcopa_id, - hf_repo="cambridgeltl/xcopa", - hf_subset="id", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xcopa_qu_lighteval = LightevalTaskConfig( - name="xcopa:qu", - suite=["lighteval"], - prompt_function=prompt.xcopa_qu, - hf_repo="cambridgeltl/xcopa", - hf_subset="qu", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xcopa_sw_lighteval = LightevalTaskConfig( - name="xcopa:sw", - suite=["lighteval"], - prompt_function=prompt.xcopa_sw, - hf_repo="cambridgeltl/xcopa", - hf_subset="sw", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xcopa_zh_lighteval = LightevalTaskConfig( - name="xcopa:zh", - suite=["lighteval"], - prompt_function=prompt.xcopa_zh, - hf_repo="cambridgeltl/xcopa", - hf_subset="zh", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xcopa_ta_lighteval = LightevalTaskConfig( - name="xcopa:ta", - suite=["lighteval"], - prompt_function=prompt.xcopa_ta, - hf_repo="cambridgeltl/xcopa", - hf_subset="ta", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xcopa_th_lighteval = LightevalTaskConfig( - name="xcopa:th", 
- suite=["lighteval"], - prompt_function=prompt.xcopa_th, - hf_repo="cambridgeltl/xcopa", - hf_subset="th", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xcopa_tr_lighteval = LightevalTaskConfig( - name="xcopa:tr", - suite=["lighteval"], - prompt_function=prompt.xcopa_tr, - hf_repo="cambridgeltl/xcopa", - hf_subset="tr", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xcopa_vi_lighteval = LightevalTaskConfig( - name="xcopa:vi", - suite=["lighteval"], - prompt_function=prompt.xcopa_vi, - hf_repo="cambridgeltl/xcopa", - hf_subset="vi", - hf_avail_splits=["test", "train", "validation"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xstory_cloze_en_lighteval = LightevalTaskConfig( - name="xstory_cloze:en", - suite=["lighteval"], - prompt_function=prompt.storycloze, - hf_repo="juletxara/xstory_cloze", - hf_subset="en", - hf_avail_splits=["training", "eval"], - evaluation_splits=["eval"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xstory_cloze_ru_lighteval = LightevalTaskConfig( - name="xstory_cloze:ru", - suite=["lighteval"], - prompt_function=prompt.storycloze, - hf_repo="juletxara/xstory_cloze", - hf_subset="ru", - hf_avail_splits=["training", "eval"], - evaluation_splits=["eval"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xstory_cloze_zh_lighteval = LightevalTaskConfig( - name="xstory_cloze:zh", - suite=["lighteval"], - prompt_function=prompt.storycloze, - hf_repo="juletxara/xstory_cloze", - hf_subset="zh", - hf_avail_splits=["training", "eval"], - evaluation_splits=["eval"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xstory_cloze_es_lighteval = LightevalTaskConfig( - name="xstory_cloze:es", - suite=["lighteval"], - prompt_function=prompt.storycloze, - hf_repo="juletxara/xstory_cloze", - hf_subset="es", - hf_avail_splits=["training", "eval"], - evaluation_splits=["eval"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xstory_cloze_ar_lighteval = LightevalTaskConfig( - name="xstory_cloze:ar", - suite=["lighteval"], - prompt_function=prompt.storycloze, - hf_repo="juletxara/xstory_cloze", - hf_subset="ar", - hf_avail_splits=["training", "eval"], - evaluation_splits=["eval"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xstory_cloze_hi_lighteval = LightevalTaskConfig( - name="xstory_cloze:hi", - suite=["lighteval"], - prompt_function=prompt.storycloze, - hf_repo="juletxara/xstory_cloze", - hf_subset="hi", - hf_avail_splits=["training", "eval"], - evaluation_splits=["eval"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - 
metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xstory_cloze_id_lighteval = LightevalTaskConfig( - name="xstory_cloze:id", - suite=["lighteval"], - prompt_function=prompt.storycloze, - hf_repo="juletxara/xstory_cloze", - hf_subset="id", - hf_avail_splits=["training", "eval"], - evaluation_splits=["eval"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xstory_cloze_te_lighteval = LightevalTaskConfig( - name="xstory_cloze:te", - suite=["lighteval"], - prompt_function=prompt.storycloze, - hf_repo="juletxara/xstory_cloze", - hf_subset="te", - hf_avail_splits=["training", "eval"], - evaluation_splits=["eval"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xstory_cloze_sw_lighteval = LightevalTaskConfig( - name="xstory_cloze:sw", - suite=["lighteval"], - prompt_function=prompt.storycloze, - hf_repo="juletxara/xstory_cloze", - hf_subset="sw", - hf_avail_splits=["training", "eval"], - evaluation_splits=["eval"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xstory_cloze_eu_lighteval = LightevalTaskConfig( - name="xstory_cloze:eu", - suite=["lighteval"], - prompt_function=prompt.storycloze, - hf_repo="juletxara/xstory_cloze", - hf_subset="eu", - hf_avail_splits=["training", "eval"], - evaluation_splits=["eval"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xstory_cloze_my_lighteval = LightevalTaskConfig( - name="xstory_cloze:my", - suite=["lighteval"], - prompt_function=prompt.storycloze, - hf_repo="juletxara/xstory_cloze", - hf_subset="my", - hf_avail_splits=["training", "eval"], - evaluation_splits=["eval"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xwinograd_en_lighteval = LightevalTaskConfig( - name="xwinograd:en", - suite=["lighteval"], - prompt_function=prompt.winogrande, - hf_repo="Muennighoff/xwinograd", - hf_subset="en", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xwinograd_fr_lighteval = LightevalTaskConfig( - name="xwinograd:fr", - suite=["lighteval"], - prompt_function=prompt.winogrande, - hf_repo="Muennighoff/xwinograd", - hf_subset="fr", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xwinograd_jp_lighteval = LightevalTaskConfig( - name="xwinograd:jp", - suite=["lighteval"], - prompt_function=prompt.winogrande, - hf_repo="Muennighoff/xwinograd", - hf_subset="jp", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xwinograd_pt_lighteval = LightevalTaskConfig( - name="xwinograd:pt", - suite=["lighteval"], - prompt_function=prompt.winogrande, - hf_repo="Muennighoff/xwinograd", - hf_subset="pt", - hf_avail_splits=["test"], - evaluation_splits=["test"], - 
few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xwinograd_ru_lighteval = LightevalTaskConfig( - name="xwinograd:ru", - suite=["lighteval"], - prompt_function=prompt.winogrande, - hf_repo="Muennighoff/xwinograd", - hf_subset="ru", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) -xwinograd_zh_lighteval = LightevalTaskConfig( - name="xwinograd:zh", - suite=["lighteval"], - prompt_function=prompt.winogrande, - hf_repo="Muennighoff/xwinograd", - hf_subset="zh", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=-1, - metrics=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - version=0, -) - -# MMLU-Redux-2 Tasks -_MMLU_REDUX_2_SUBSETS = [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions", -] - - -_mmlu_redux_2_tasks = { - subset: LightevalTaskConfig( - name=f"mmlu_redux_2:{subset}", - suite=["lighteval"], - prompt_function=lambda line, task_name=None, s=subset: prompt.mmlu_redux_2(line, s, task_name), - hf_repo="edinburgh-dawg/mmlu-redux-2.0", - hf_subset=subset, - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=1, - metrics=[ - Metrics.loglikelihood_acc, - Metrics.pass_at_k_letters(sample_params={"k": 1}), - ], - stop_sequence=["\n"], - version=0, - ) - for subset in _MMLU_REDUX_2_SUBSETS -} - -mmlu_redux_2_abstract_algebra = _mmlu_redux_2_tasks["abstract_algebra"] -mmlu_redux_2_anatomy = _mmlu_redux_2_tasks["anatomy"] -mmlu_redux_2_astronomy = _mmlu_redux_2_tasks["astronomy"] -mmlu_redux_2_business_ethics = _mmlu_redux_2_tasks["business_ethics"] -mmlu_redux_2_clinical_knowledge = _mmlu_redux_2_tasks["clinical_knowledge"] -mmlu_redux_2_college_biology = _mmlu_redux_2_tasks["college_biology"] -mmlu_redux_2_college_chemistry = _mmlu_redux_2_tasks["college_chemistry"] -mmlu_redux_2_college_computer_science = _mmlu_redux_2_tasks["college_computer_science"] -mmlu_redux_2_college_mathematics = _mmlu_redux_2_tasks["college_mathematics"] 
-mmlu_redux_2_college_medicine = _mmlu_redux_2_tasks["college_medicine"] -mmlu_redux_2_college_physics = _mmlu_redux_2_tasks["college_physics"] -mmlu_redux_2_computer_security = _mmlu_redux_2_tasks["computer_security"] -mmlu_redux_2_conceptual_physics = _mmlu_redux_2_tasks["conceptual_physics"] -mmlu_redux_2_econometrics = _mmlu_redux_2_tasks["econometrics"] -mmlu_redux_2_electrical_engineering = _mmlu_redux_2_tasks["electrical_engineering"] -mmlu_redux_2_elementary_mathematics = _mmlu_redux_2_tasks["elementary_mathematics"] -mmlu_redux_2_formal_logic = _mmlu_redux_2_tasks["formal_logic"] -mmlu_redux_2_global_facts = _mmlu_redux_2_tasks["global_facts"] -mmlu_redux_2_high_school_biology = _mmlu_redux_2_tasks["high_school_biology"] -mmlu_redux_2_high_school_chemistry = _mmlu_redux_2_tasks["high_school_chemistry"] -mmlu_redux_2_high_school_computer_science = _mmlu_redux_2_tasks["high_school_computer_science"] -mmlu_redux_2_high_school_european_history = _mmlu_redux_2_tasks["high_school_european_history"] -mmlu_redux_2_high_school_geography = _mmlu_redux_2_tasks["high_school_geography"] -mmlu_redux_2_high_school_government_and_politics = _mmlu_redux_2_tasks["high_school_government_and_politics"] -mmlu_redux_2_high_school_macroeconomics = _mmlu_redux_2_tasks["high_school_macroeconomics"] -mmlu_redux_2_high_school_mathematics = _mmlu_redux_2_tasks["high_school_mathematics"] -mmlu_redux_2_high_school_microeconomics = _mmlu_redux_2_tasks["high_school_microeconomics"] -mmlu_redux_2_high_school_physics = _mmlu_redux_2_tasks["high_school_physics"] -mmlu_redux_2_high_school_psychology = _mmlu_redux_2_tasks["high_school_psychology"] -mmlu_redux_2_high_school_statistics = _mmlu_redux_2_tasks["high_school_statistics"] -mmlu_redux_2_high_school_us_history = _mmlu_redux_2_tasks["high_school_us_history"] -mmlu_redux_2_high_school_world_history = _mmlu_redux_2_tasks["high_school_world_history"] -mmlu_redux_2_human_aging = _mmlu_redux_2_tasks["human_aging"] -mmlu_redux_2_human_sexuality = _mmlu_redux_2_tasks["human_sexuality"] -mmlu_redux_2_international_law = _mmlu_redux_2_tasks["international_law"] -mmlu_redux_2_jurisprudence = _mmlu_redux_2_tasks["jurisprudence"] -mmlu_redux_2_logical_fallacies = _mmlu_redux_2_tasks["logical_fallacies"] -mmlu_redux_2_machine_learning = _mmlu_redux_2_tasks["machine_learning"] -mmlu_redux_2_management = _mmlu_redux_2_tasks["management"] -mmlu_redux_2_marketing = _mmlu_redux_2_tasks["marketing"] -mmlu_redux_2_medical_genetics = _mmlu_redux_2_tasks["medical_genetics"] -mmlu_redux_2_miscellaneous = _mmlu_redux_2_tasks["miscellaneous"] -mmlu_redux_2_moral_disputes = _mmlu_redux_2_tasks["moral_disputes"] -mmlu_redux_2_moral_scenarios = _mmlu_redux_2_tasks["moral_scenarios"] -mmlu_redux_2_nutrition = _mmlu_redux_2_tasks["nutrition"] -mmlu_redux_2_philosophy = _mmlu_redux_2_tasks["philosophy"] -mmlu_redux_2_prehistory = _mmlu_redux_2_tasks["prehistory"] -mmlu_redux_2_professional_accounting = _mmlu_redux_2_tasks["professional_accounting"] -mmlu_redux_2_professional_law = _mmlu_redux_2_tasks["professional_law"] -mmlu_redux_2_professional_medicine = _mmlu_redux_2_tasks["professional_medicine"] -mmlu_redux_2_professional_psychology = _mmlu_redux_2_tasks["professional_psychology"] -mmlu_redux_2_public_relations = _mmlu_redux_2_tasks["public_relations"] -mmlu_redux_2_security_studies = _mmlu_redux_2_tasks["security_studies"] -mmlu_redux_2_sociology = _mmlu_redux_2_tasks["sociology"] -mmlu_redux_2_us_foreign_policy = _mmlu_redux_2_tasks["us_foreign_policy"] -mmlu_redux_2_virology = 
_mmlu_redux_2_tasks["virology"] -mmlu_redux_2_world_religions = _mmlu_redux_2_tasks["world_religions"] diff --git a/src/lighteval/tasks/extended/__init__.py b/src/lighteval/tasks/extended/__init__.py deleted file mode 100644 index 247a0c3a2..000000000 --- a/src/lighteval/tasks/extended/__init__.py +++ /dev/null @@ -1,34 +0,0 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - - -import lighteval.tasks.extended.hle.main as hle -import lighteval.tasks.extended.ifbench.main as ifbench -import lighteval.tasks.extended.ifeval.main as ifeval -import lighteval.tasks.extended.lcb.main as lcb -import lighteval.tasks.extended.mix_eval.main as mix_eval -import lighteval.tasks.extended.mt_bench.main as mt_bench -import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench -import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks - - -AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, ifbench, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb] diff --git a/src/lighteval/tasks/multilingual/tasks.py b/src/lighteval/tasks/multilingual/tasks.py deleted file mode 100644 index 5d6c107bc..000000000 --- a/src/lighteval/tasks/multilingual/tasks.py +++ /dev/null @@ -1,4368 +0,0 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
- -from functools import partial -from itertools import permutations - -from langcodes import Language as LangCodeLanguage -from langcodes import standardize_tag - -from lighteval.metrics.dynamic_metrics import ( - LogLikelihoodAccMetric, - MultilingualQuasiExactMatchMetric, - MultilingualQuasiF1ScoreMetric, -) -from lighteval.metrics.metrics import Metrics -from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm -from lighteval.tasks.default_prompts import LETTER_INDICES -from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.multilingual.adapters import ( - agieval_adapter, - alghafa_adapter, - ceval_adapter, - enem_adapter, - get_m3exam_adapter, - get_mkqa_adapter, - sciqa_adapter, - thai_exams_adapter, - winogrand_adapter, - xcodah_adapter, -) -from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation, normalize_subset -from lighteval.tasks.templates.boolq import get_boolq_prompt_function -from lighteval.tasks.templates.continuation import get_continuation_prompt_function -from lighteval.tasks.templates.copa import get_copa_prompt_function -from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function -from lighteval.tasks.templates.multichoice import get_mcq_prompt_function -from lighteval.tasks.templates.nli import get_nli_prompt_function -from lighteval.tasks.templates.qa import get_qa_prompt_function -from lighteval.tasks.templates.translation import get_translation_prompt_function -from lighteval.tasks.templates.utils.formulation import ( - CFFormulation, - HybridFormulation, - MCFFormulation, -) -from lighteval.tasks.templates.utils.translation_literals import TRANSLATION_LITERALS -from lighteval.utils.language import Language, iso_639_3_ind_to_iso_639_3_macro, manage_duplicate_language_codes - - -TASKS_TABLE = [] -# ------------------------------- NLI Tasks ------------------------------- # -# NLI (Natural Language Inference) tasks involve determining the logical relationship -# between two given sentences: a premise and a hypothesis. The goal is to classify -# whether the hypothesis is entailed by, contradicts, or is neutral with respect to -# the premise. After our inspection we found the neutral label to be quite ambiguous -# and decided to exclude it. 
But you can easily add it by modifying the adapters - - -# The XNLI dataset is a multilingual variant of MultiNLI -# https://aclanthology.org/D18-1269/ -xnli_tasks = [ - LightevalTaskConfig( - name=f"xnli_{language.value}_{formulation.name.lower()}", - suite=["lighteval"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=None), - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - prompt_function=get_nli_prompt_function( - language=language, - adapter=lambda line: { - "premise": line["premise"], - "hypothesis": line["hypothesis"], - # Since we ignore the neutral label - "gold_idx": {0: 0, 2: 1}[line["label"]], - }, - relations=["entailment", "contradiction"], - formulation=formulation, - ), - hf_filter=lambda line: line["label"] in [0, 2], - hf_repo="facebook/xnli", - hf_subset=standardize_tag(language.value), - evaluation_splits=["validation"], - few_shots_split="train", - ) - for language in [ - Language.ARABIC, - Language.ENGLISH, - Language.FRENCH, - Language.SPANISH, - Language.BULGARIAN, - Language.GERMAN, - Language.GREEK, - Language.ENGLISH, - Language.FRENCH, - Language.HINDI, - Language.RUSSIAN, - Language.SWAHILI, - Language.THAI, - Language.TURKISH, - Language.URDU, - Language.VIETNAMESE, - Language.CHINESE, - ] - for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] -] - - -# Improvement on XNLI with better translation, from our experience models tend to -# perform better on XNLI2.0 than XNLI -# https://arxiv.org/abs/2301.06527 -xnli2_tasks = [ - LightevalTaskConfig( - name=f"xnli2.0_{language.value}_{formulation.name.lower()}", - suite=["lighteval"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=None), - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - prompt_function=get_nli_prompt_function( - language=language, - adapter=lambda line: { - "premise": line["premise"], - "hypothesis": line["hypothesis"], - # Since we ignore the neutral label - "gold_idx": {0: 0, 2: 1}[line["label"]], - }, - relations=["entailment", "contradiction"], - formulation=formulation, - ), - hf_filter=lambda line: line["label"] in [0, 2] - and line["premise"] is not None - and line["hypothesis"] is not None, - hf_repo=f"Harsit/xnli2.0_train_{LangCodeLanguage(standardize_tag(language.value)).language_name().lower()}", - hf_subset="default", - evaluation_splits=["train"], - hf_avail_splits=["train"], - ) - for language in [ - Language.ENGLISH, - Language.FRENCH, - Language.PUNJABI, - Language.GUJARATI, - Language.KANNADA, - Language.ASSAMESE, - Language.BENGALI, - Language.MARATHI, - Language.SANSKRIT, - Language.TAMIL, - Language.GERMAN, - Language.ENGLISH, - Language.URDU, - Language.VIETNAMESE, - Language.TURKISH, - Language.THAI, - Language.SWAHILI, - Language.SPANISH, - Language.RUSSIAN, - Language.HINDI, - Language.GREEK, - Language.CHINESE, - Language.BULGARIAN, - Language.ARABIC, - # Theoretically also: Bhojpuri, Gujarati, Odiya - ] - for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] -] - -# Another variant of XNLI, with emphasis on Indic languages -# https://arxiv.org/abs/2204.08776 -xnli_indic_tasks = [ - LightevalTaskConfig( - name=f"indicnxnli_{language.value}_{formulation.name.lower()}", - suite=["lighteval"], - prompt_function=get_nli_prompt_function( - language=language, - adapter=lambda 
line: { - "premise": line["premise"], - "hypothesis": line["hypothesis"], - # Since we ignore the neutral label - "gold_idx": {0: 0, 2: 1}[line["label"]], - }, - relations=["entailment", "contradiction"], - formulation=formulation, - ), - hf_repo="Divyanshu/indicxnli", - hf_subset=standardize_tag(language.value), - # Ignore neutral - hf_filter=lambda x: int(x["label"]) in [0, 2], - evaluation_splits=["validation"], - few_shots_split="train", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=None), - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for language in [ - Language.ASSAMESE, - Language.BENGALI, - Language.GUJARATI, - Language.HINDI, - Language.KANNADA, - Language.MALAYALAM, - Language.MARATHI, - Language.ORIYA, - Language.PUNJABI, - Language.TAMIL, - Language.TELUGU, - ] - for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] -] - -# African XNLI: African XNLI -# From https://arxiv.org/abs/2406.03368. Human translated MMLU. -afri_xnli_tasks = [ - LightevalTaskConfig( - name=f"afri_xnli_{language.value}_{formulation.name.lower()}", - suite=("lighteval",), - prompt_function=get_nli_prompt_function( - language=language, - adapter=lambda line: { - "premise": line["premise"], - "hypothesis": line["hypothesis"], - # Since we ignore the neutral label - "gold_idx": {0: 0, 2: 1}[line["label"]], - }, - relations=["entailment", "contradiction"], - formulation=formulation, - ), - hf_repo="masakhane/afrixnli", - hf_subset=language.value, - hf_filter=lambda x: int(x["label"]) in [0, 2], - evaluation_splits=("test",), - few_shots_split="validation", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=None), - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for language in [ - Language.AMHARIC, - # Language.EWE, - Language.FRENCH, - # Language.HAUSA, - # Language.IGBO, - # Language.KINYARWANDA, - # Language.LINGALA, - # Language.LUGANDA, - # Language.OROMO, - # Language.SHONA, - # Language.SOTHO, - Language.SWAHILI, - # Language.TWI, - # Language.WOLOF, - # Language.XHOSA, - Language.YORUBA, - # Language.ZULU, - ] - for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] -] - -# PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification -# This dataset contains paraphrase identification pairs in multiple languages. 
-# It's derived from PAWS (Paraphrase Adversaries from Word Scrambling) and -# We treat paraphrase as entailment and non-paraphrase as contradiction -# https://arxiv.org/abs/1908.11828 - -paws_x_tasks = [ - LightevalTaskConfig( - name=f"pawsx_{language.value}_{formulation.name.lower()}", - suite=("lighteval",), - prompt_function=get_nli_prompt_function( - language=language, - adapter=lambda line: { - "premise": line["sentence1"], - "hypothesis": line["sentence2"], - # Since we ignore the neutral label - "gold_idx": int(line["label"]), - }, - relations=["entailment", "contradiction"], - formulation=formulation, - ), - hf_repo="google-research-datasets/paws-x", - hf_subset=standardize_tag(language.value), - evaluation_splits=("test",), - few_shots_split="train", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=None), - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for language in [ - Language.GERMAN, - Language.ENGLISH, - Language.SPANISH, - Language.FRENCH, - Language.JAPANESE, - Language.KOREAN, - Language.CHINESE, - ] - for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] -] - -# Russian Commitment Bank (RCB) is a large-scale NLI dataset with Russian sentences, -# collected from the web and crowdsourcing. -# https://arxiv.org/abs/2401.04531 -rcb_tasks = [ - LightevalTaskConfig( - name=f"rcb_{Language.RUSSIAN.value}_{formulation.name.lower()}", - prompt_function=get_nli_prompt_function( - language=Language.RUSSIAN, - adapter=lambda line: { - "premise": line["inputs"]["premise"], - "hypothesis": line["inputs"]["hypothesis"], - # Since we ignore the neutral label - "gold_idx": int(line["outputs"]) - 1, - }, - relations=["entailment", "contradiction"], - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="ai-forever/MERA", - hf_subset="rcb", - # Ignore neutral label - hf_filter=lambda x: int(x["outputs"] or "0") in [1, 2], - evaluation_splits=("train",), - few_shots_split="validation", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=None), - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] -] - -# Native Chinese NLI dataset based. 
-# https://arxiv.org/pdf/2010.05444 -# We find this benchmark to have really good signal compared to other Chinese NLI -ocnli_tasks = [ - LightevalTaskConfig( - name=f"ocnli_{Language.CHINESE.value}_{formulation.name.lower()}", - prompt_function=get_nli_prompt_function( - language=Language.CHINESE, - adapter=lambda line: { - "premise": line["sentence1"], - "hypothesis": line["sentence2"], - # Since we ignore the neutral label - "gold_idx": {1: 0, 2: 1}[line["label"]], - }, - relations=["entailment", "contradiction"], - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="clue/clue", - hf_subset="ocnli", - # Only keep the positive and negative examples - hf_filter=lambda x: int(x["label"]) in [1, 2], - evaluation_splits=("validation",), - few_shots_split="train", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=None), - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] -] - -# https://arxiv.org/abs/2004.05986 -# Native Chinese NLI dataset based on MNLI approach (Machine Translated) -cmnli_tasks = [ - LightevalTaskConfig( - name=f"cmnli_{Language.CHINESE.value}_{formulation.name.lower()}", - prompt_function=get_nli_prompt_function( - language=Language.CHINESE, - adapter=lambda line: { - "premise": line["sentence1"], - "hypothesis": line["sentence2"], - # Since we ignore the neutral label - "gold_idx": {"entailment": 0, "contradiction": 1}[line["label"]], - }, - relations=["entailment", "contradiction"], - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="fenffef/cmnli", - hf_subset="default", - hf_filter=lambda x: x["label"] in ["entailment", "contradiction"], - # Only keep the positive and negative examples - evaluation_splits=("validation",), - few_shots_split="train", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=None), - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] -] - -TASKS_TABLE.extend( - [ - *xnli_tasks, - *xnli2_tasks, - *xnli_indic_tasks, - *paws_x_tasks, - *rcb_tasks, - *ocnli_tasks, - *cmnli_tasks, - *afri_xnli_tasks, - ] -) -# ------------------------------- Copa Tasks ------------------------------- # -# COPA (Choice of Plausible Alternatives) tasks involve determining the most plausible cause or effect -# for a given premise. These tasks test common sense reasoning and causal inference abilities. - -# XCOPA: Cross-lingual Choice of Plausible Alternatives -# Paper: https://aclanthology.org/2020.emnlp-main.185/ -# XCOPA extends the original English COPA task to 11 typologically diverse languages. 
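# A minimal, self-contained sketch of what the adapter passed to
# get_copa_prompt_function below does. The row fields (premise, question,
# choice1, choice2, label) mirror the keys used by the adapter; the concrete
# values are illustrative only, not taken from the dataset.
_example_row = {
    "premise": "The man turned on the faucet.",
    "question": "effect",
    "choice1": "The toilet filled with water.",
    "choice2": "Water flowed from the spout.",
    "label": 1,
}
_example_adapted = {
    "context": _example_row["premise"],
    "cause_effect": _example_row["question"],  # "cause" or "effect"
    "continuations": [_example_row["choice1"], _example_row["choice2"]],
    "gold_idx": int(_example_row["label"]),
}
assert _example_adapted["continuations"][_example_adapted["gold_idx"]] == _example_row["choice2"]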
-xcopa_tasks = [ - LightevalTaskConfig( - name=f"xcopa_{language.value}_{formulation.name.lower()}", - suite=["lighteval"], - prompt_function=get_copa_prompt_function( - language, - adapter=lambda line: { - "context": line["premise"], - "cause_effect": line["question"], - "continuations": [line["choice1"], line["choice2"]], - "gold_idx": int(line["label"]), - }, - formulation=formulation, - ), - hf_repo=("OALL/AlGhafa-Arabic-LLM-Benchmark-Translated" if language == Language.ARABIC else "xcopa"), - hf_subset=("copa_ext_ar" if language == Language.ARABIC else standardize_tag(language.value)), - evaluation_splits=["test"], - few_shots_split="validation", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for language in [ - Language.ARABIC, - Language.ESTONIAN, - Language.INDONESIAN, - Language.ITALIAN, - Language.SWAHILI, - Language.TAMIL, - Language.THAI, - Language.TURKISH, - Language.VIETNAMESE, - Language.CHINESE, - Language.HAITIAN, - Language.QUECHUA, - ] - for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] -] - -# IndicCOPA: COPA for Indic Languages -# Paper: https://arxiv.org/pdf/2212.05409 -# IndicCOPA extends COPA to 15 Indic languages, providing a valuable resource for -# evaluating common sense reasoning in these languages. -copa_indic_tasks = [ - LightevalTaskConfig( - name=f"indicxcopa_{language.value}_{formulation.name.lower()}", - suite=["lighteval"], - prompt_function=get_copa_prompt_function( - language, - adapter=lambda line: { - "context": line["premise"], - "cause_effect": line["question"], - "continuations": [line["choice1"], line["choice2"]], - "gold_idx": int(line["label"]), - }, - formulation=formulation, - ), - hf_repo="ai4bharat/IndicCOPA", - hf_subset=f"translation-{standardize_tag(language.value)}", - hf_revision="d356ef19a4eb287e88a51d07a56b73ba88c7f188", - evaluation_splits=["test"], - hf_avail_splits=["test"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for language in [ - Language.ASSAMESE, - Language.BENGALI, - Language.GUJARATI, - Language.HINDI, - Language.KANNADA, - Language.MALAYALAM, - Language.MARATHI, - Language.NEPALI, - Language.ORIYA, - Language.PUNJABI, - Language.SANSKRIT, - Language.SINDHI, - Language.TAMIL, - Language.TELUGU, - Language.URDU, - # Optionally: Maithili, Santali, Sindhi, Konkani - ] - for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] -] - -# PARus: Plausible Alternatives for Russian -# Paper: https://russiansuperglue.com/tasks/task_info/PARus -# PARus is the Russian adaptation of the COPA task, part of the Russian SuperGLUE benchmark. -# It evaluates common sense reasoning and causal inference abilities in Russian language models. 
-parus_tasks = [ - LightevalTaskConfig( - name=f"parus_{Language.RUSSIAN.value}_{formulation.name.lower()}", - suite=["lighteval"], - prompt_function=get_copa_prompt_function( - language=Language.RUSSIAN, - adapter=lambda line: { - "context": line["inputs"]["premise"], - "cause_effect": line["meta"]["task"], - "continuations": [line["inputs"]["choice1"], line["inputs"]["choice2"]], - "gold_idx": int(line["outputs"]) - 1, - }, - formulation=formulation, - ), - hf_repo="ai-forever/MERA", - hf_subset="parus", - evaluation_splits=["train"], - few_shots_split="validation", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] -] - - -TASKS_TABLE.extend([*xcopa_tasks, *copa_indic_tasks, *parus_tasks]) -# ------------------------------- Hellaswag Tasks ------------------------------- # -# Hellaswag is a commonsense reasoning task that requires models to complete a given scenario -# with the most plausible ending. It tests the model's ability to understand and reason about -# everyday situations and human behavior. - -# MLMM-Hellaswag: Multilingual adaptation of Hellaswag -# Paper: https://arxiv.org/abs/2306.07610 -# This is a multilingual version of Hellaswag, part of the MLMM (Massive Language Model Meta-Evaluation) benchmark. -# It evaluates commonsense reasoning abilities across multiple languages. -mlmm_hellaswag_tasks = [ - LightevalTaskConfig( - name=f"mlmm_hellaswag_{lang.value}_{formulation.name.lower()}", - suite=["lighteval"], - prompt_function=get_hellaswag_prompt_function( - language=lang, - adapter=lambda line: { - # We don't use activity_label as they are not available - "ctx_a": line["ctx_a"], - "ctx_b": line["ctx_b"], - "continuations": line["endings"], - "gold_idx": int(line["label"]), - }, - formulation=formulation, - ), - hf_repo="jon-tow/okapi_hellaswag", - hf_subset=standardize_tag(lang.value), - hf_revision="96ed8e0dfc6172dad1d3df338d7b8ba6c1ff9d83", - evaluation_splits=["validation"], - hf_avail_splits=["validation"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for lang in [ - Language.ARABIC, - Language.BENGALI, - Language.CATALAN, - Language.DANISH, - Language.GERMAN, - Language.SPANISH, - Language.BASQUE, - Language.FRENCH, - Language.GUJARATI, - Language.HINDI, - Language.CROATIAN, - Language.HUNGARIAN, - Language.ARMENIAN, - Language.INDONESIAN, - Language.ICELANDIC, - Language.ITALIAN, - Language.KANNADA, - Language.MALAYALAM, - Language.MARATHI, - Language.NORWEGIAN, - Language.NEPALI, - Language.DUTCH, - Language.PORTUGUESE, - Language.ROMANIAN, - Language.RUSSIAN, - Language.SLOVAK, - Language.SERBIAN, - Language.SWEDISH, - Language.TAMIL, - Language.TELUGU, - Language.UKRAINIAN, - Language.VIETNAMESE, - Language.CHINESE, - ] - for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] -] - -# Hellaswag Turkish -# This is a Turkish adaptation of the Hellaswag task. -# While there's no specific paper for this version, it has been found to work well for evaluating -# Turkish language models on commonsense reasoning tasks. - -# We don't handle them in single task as there is quite a lot of differences (dataset/subset, dot replacement, etc.) 
-# which would make it hard to read -hellaswag_tur_tasks = [ - LightevalTaskConfig( - name=f"community_hellaswag_{Language.TURKISH.value}_{formulation.name.lower()}", - suite=["lighteval"], - prompt_function=get_hellaswag_prompt_function( - language=Language.TURKISH, - adapter=lambda line: { - "ctx_a": line["ctx_a"], - "ctx_b": line["ctx_b"], - "continuations": line["endings"], - "gold_idx": int(line["label"]), - }, - formulation=formulation, - # https://github.com/malhajar17/lm-evaluation-harness_turkish/blob/main/lm_eval/tasks/hellaswag_tr-v0.2/utils.py - wikihow_artifacts=[" [title]", " [başlık]", " [adım]", " [header]"], - ), - hf_repo="malhajar/hellaswag_tr-v0.2", - hf_subset="default", - evaluation_splits=["validation"], - hf_avail_splits=["validation"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] -] - -# Hellaswag Thai -# This is a Thai adaptation of the Hellaswag task. -# Similar to the Turkish version, there's no specific paper, but it has been found to be effective -# for evaluating Thai language models on commonsense reasoning tasks. -hellaswag_tha_tasks = [ - LightevalTaskConfig( - name=f"community_hellaswag_{Language.THAI.value}_{formulation.name.lower()}", - suite=["lighteval"], - prompt_function=get_hellaswag_prompt_function( - language=Language.THAI, - adapter=lambda line: { - "ctx_a": line["ctx_a"], - "ctx_b": line["ctx_b"], - "continuations": line["endings"], - "gold_idx": int(line["label"]), - }, - formulation=formulation, - wikihow_artifacts=[" [ชื่อ]", " [ส่วนหัว]", " [ขั้นตอน]", " [header]", " [Header]"], - ), - hf_repo="lighteval/hellaswag_thai", - hf_subset="default", - evaluation_splits=["validation"], - few_shots_split="train", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] -] - -hellaswag_hin_tasks = [ - LightevalTaskConfig( - name=f"community_hellaswag_{Language.HINDI.value}_{formulation.name.lower()}", - suite=["lighteval"], - prompt_function=get_hellaswag_prompt_function( - language=Language.HINDI, - adapter=lambda line: { - "ctx_a": line["ctx_a"], - "continuations": line["endings"], - "gold_idx": int(line["label"]), - }, - formulation=formulation, - ), - hf_repo="ai4bharat/hellaswag-hi", - hf_filter=lambda line: all(len(choice.strip()) > 0 for choice in line["endings"]), - hf_subset="hi", - evaluation_splits=("validation",), - few_shots_split="validation", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - ], - ), - ) - for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] -] - -hellaswag_tel_tasks = [ - LightevalTaskConfig( - name=f"community_hellaswag_{Language.TELUGU.value}_{formulation.name.lower()}", - suite=["lighteval"], - prompt_function=get_hellaswag_prompt_function( - language=Language.TELUGU, - adapter=lambda line: { - "ctx_a": line["ctx_a"], - "continuations": line["endings"], - "gold_idx": int(line["label"]), - }, - formulation=formulation, - ), - hf_repo="LightFury9/hellaswag-telugu", - hf_subset="default", - evaluation_splits=("valid",), - 
few_shots_split="train", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] -] - -TASKS_TABLE.extend( - [ - *mlmm_hellaswag_tasks, - *hellaswag_tur_tasks, - *hellaswag_tha_tasks, - *hellaswag_hin_tasks, - *hellaswag_tel_tasks, - ] -) -# ------------------------------- RC Tasks ------------------------------- # -# Reading Comprehension (RC) tasks evaluate a model's ability to understand and extract information from text passages. -# These tasks typically involve answering questions based on given contexts, spanning multiple languages and formats. -# Add RC tasks supporting about 130 unique languages/scripts. - -# SQuAD - like - -# XQuAD: Cross-lingual Question Answering Dataset, extending SQuAD to 11 languages. -# https://arxiv.org/abs/1910.11856 -xquad_tasks = [ - LightevalTaskConfig( - name=f"xquad_{language.value}", - prompt_function=get_qa_prompt_function( - language, - lambda line: { - "question": line["question"], - "context": line["context"], - "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], - }, - ), - suite=("lighteval",), - hf_repo="google/xquad", - hf_subset=f"xquad.{standardize_tag(language.value)}", - evaluation_splits=("validation",), - few_shots_split="validation", - generation_size=400, - stop_sequence=("\n",), - metrics=( - MultilingualQuasiExactMatchMetric(language, "prefix"), - MultilingualQuasiF1ScoreMetric(language), - ), - ) - for language in [ - Language.ARABIC, - Language.GERMAN, - Language.GREEK, - Language.ENGLISH, - Language.SPANISH, - Language.HINDI, - Language.ROMANIAN, - Language.RUSSIAN, - Language.THAI, - Language.TURKISH, - Language.VIETNAMESE, - Language.CHINESE, - ] -] - -# GermanQuAD: High-quality German QA dataset with 13,722 questions -# https://arxiv.org/abs/2104.12741 -germanquad_tasks = [ - LightevalTaskConfig( - name=f"germanquad_{Language.GERMAN.value}", - prompt_function=get_qa_prompt_function( - Language.GERMAN, - lambda line: { - "question": line["question"], - "context": line["context"], - "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], - }, - ), - suite=("lighteval",), - hf_repo="deepset/germanquad", - hf_subset="plain_text", - hf_revision="fff05ceaf2ffbe5b65c7e0c57e678f7b7e1a0581", - hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), - evaluation_splits=("test",), - few_shots_split="train", - generation_size=400, - stop_sequence=("\n",), - metrics=( - MultilingualQuasiExactMatchMetric(Language.GERMAN, "prefix"), - MultilingualQuasiF1ScoreMetric(Language.GERMAN), - ), - ) -] - - -# SQuAD-it: Italian translation of the SQuAD dataset -# https://github.com/crux82/squad-it -squad_it_tasks = [ - LightevalTaskConfig( - name=f"squad_{Language.ITALIAN.value}", - prompt_function=get_qa_prompt_function( - Language.ITALIAN, - lambda line: { - "question": line["question"], - "context": line["context"], - "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], - }, - ), - suite=("lighteval",), - hf_repo="crux82/squad_it", - hf_subset="default", - hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), - evaluation_splits=("test",), - few_shots_split="train", - generation_size=400, - stop_sequence=("\n",), - metrics=( - MultilingualQuasiExactMatchMetric(Language.ITALIAN, "prefix"), - MultilingualQuasiF1ScoreMetric(Language.ITALIAN), - ), - ) 
-] - - -# ThaiQA: A question answering dataset for the Thai language. -thaiqa_tasks = [ - LightevalTaskConfig( - name=f"thaiqa_{Language.THAI.value}", - prompt_function=get_qa_prompt_function( - Language.THAI, - lambda line: { - "question": line["question"], - "context": line["context"], - "choices": [ans for ans in line["answers"]["answer"] if len(ans) > 0], - }, - ), - suite=("lighteval",), - hf_repo="lighteval/thaiqa_squad_fixed", - hf_subset="default", - evaluation_splits=("train",), - few_shots_split="validation", - generation_size=400, - stop_sequence=("\n",), - metrics=( - MultilingualQuasiExactMatchMetric(Language.THAI, "prefix"), - MultilingualQuasiF1ScoreMetric(Language.THAI), - ), - ) -] - -# SberQuAD: A large-scale Russian reading comprehension dataset. -# https://arxiv.org/abs/1912.09723 -sber_squad_tasks = [ - LightevalTaskConfig( - name=f"sber_squad_{Language.RUSSIAN.value}", - prompt_function=get_qa_prompt_function( - Language.RUSSIAN, - lambda line: { - "question": line["question"], - "context": line["context"], - "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], - }, - ), - suite=("lighteval",), - hf_repo="kuznetsoffandrey/sberquad", - hf_subset="sberquad", - evaluation_splits=("validation",), - few_shots_split="train", - metrics=( - MultilingualQuasiExactMatchMetric(Language.RUSSIAN, "prefix"), - MultilingualQuasiF1ScoreMetric(Language.RUSSIAN), - ), - generation_size=400, - stop_sequence=("\n",), - ) -] - -# FaQuAD: A Portuguese Reading Comprehension Dataset -# https://arxiv.org/abs/2007.15671 -faquad_tasks = [ - LightevalTaskConfig( - name=f"faquad_{Language.PORTUGUESE.value}", - prompt_function=get_qa_prompt_function( - Language.PORTUGUESE, - lambda line: { - "question": line["question"], - "context": line["context"], - "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], - }, - ), - suite=("lighteval",), - hf_repo="eraldoluis/faquad", - hf_subset="plain_text", - hf_revision="205ba826a2282a4a5aa9bd3651e55ee4f2da1546", - hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), - evaluation_splits=("validation",), - few_shots_split="train", - metrics=( - MultilingualQuasiExactMatchMetric(Language.PORTUGUESE, "prefix"), - MultilingualQuasiF1ScoreMetric(Language.PORTUGUESE), - ), - generation_size=400, - stop_sequence=("\n",), - ) -] - - -# SQuAD-es: Spanish translation of the Stanford Question Answering Dataset -# https://huggingface.co/datasets/ccasimiro/squad_es -squad_es_tasks = [ - LightevalTaskConfig( - name=f"squad_{Language.SPANISH.value}", - prompt_function=get_qa_prompt_function( - Language.SPANISH, - lambda line: { - "question": line["question"], - "context": line["context"], - "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], - }, - ), - suite=("lighteval",), - hf_repo="ccasimiro/squad_es", - hf_subset="v2.0.0", - hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), - evaluation_splits=("validation",), - few_shots_split="train", - metrics=( - MultilingualQuasiExactMatchMetric(Language.SPANISH, "prefix"), - MultilingualQuasiF1ScoreMetric(Language.SPANISH), - ), - generation_size=400, - stop_sequence=("\n",), - ) -] - - -# ARCD: Arabic Reading Comprehension Dataset. 
-# https://arxiv.org/pdf/1906.05394 -arcd_tasks = [ - LightevalTaskConfig( - name=f"arcd_{Language.ARABIC.value}", - prompt_function=get_qa_prompt_function( - Language.ARABIC, - lambda line: { - "question": line["question"], - "context": line["context"], - "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], - }, - ), - suite=("lighteval",), - hf_repo="hsseinmz/arcd", - hf_subset="plain_text", - evaluation_splits=("validation",), - few_shots_split="train", - metrics=( - MultilingualQuasiExactMatchMetric(Language.ARABIC, "prefix"), - MultilingualQuasiF1ScoreMetric(Language.ARABIC), - ), - generation_size=400, - stop_sequence=("\n",), - ) -] - -# KenSwQuAD: A question answering dataset for Kenyan Swahili. -# https://arxiv.org/abs/2205.02364 -kenswquad_tasks = [ - LightevalTaskConfig( - name=f"kenswquad_{Language.SWAHILI.value}", - prompt_function=get_qa_prompt_function( - Language.SWAHILI, - lambda line: { - "question": line["question"], - "context": line["context"], - "choices": [line["answer"]], - }, - ), - suite=("lighteval",), - hf_repo="lighteval/KenSwQuAD", - hf_subset="default", - evaluation_splits=("test",), - few_shots_split="validation", - metrics=( - MultilingualQuasiExactMatchMetric(Language.SWAHILI, "prefix"), - MultilingualQuasiF1ScoreMetric(Language.SWAHILI), - ), - generation_size=400, - stop_sequence=("\n",), - ) -] - -# ChineseSquad: A reading comprehension dataset for Chinese. -# https://github.com/pluto-junzeng/ChineseSquad -chinese_squad_tasks = [ - LightevalTaskConfig( - name=f"chinese_squad_{Language.CHINESE.value}", - prompt_function=get_qa_prompt_function( - Language.CHINESE, - lambda line: { - "question": line["question"], - "context": line["context"], - "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], - }, - ), - suite=("lighteval",), - hf_repo="lighteval/ChineseSquad", - hf_subset="default", - evaluation_splits=("validation",), - few_shots_split="train", - metrics=( - MultilingualQuasiExactMatchMetric(Language.CHINESE, "prefix"), - MultilingualQuasiF1ScoreMetric(Language.CHINESE), - ), - generation_size=400, - stop_sequence=("\n",), - ) -] - -# CMRC 2018: A span-extraction machine reading comprehension dataset for Chinese. -# https://arxiv.org/abs/1810.07366 -cmrc2018_tasks = [ - LightevalTaskConfig( - name=f"cmrc2018_{Language.CHINESE.value}", - prompt_function=get_qa_prompt_function( - Language.CHINESE, - lambda line: { - "question": line["question"], - "context": line["context"], - "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], - }, - ), - suite=("lighteval",), - hf_repo="clue/clue", - hf_subset="cmrc2018", - evaluation_splits=("trial",), - few_shots_split="train", - generation_size=400, - metrics=( - MultilingualQuasiExactMatchMetric(Language.CHINESE, "prefix"), - MultilingualQuasiF1ScoreMetric(Language.CHINESE), - ), - stop_sequence=("\n",), - ) -] - -# IndicQA: A reading comprehension dataset for 11 Indian languages. 
-# https://arxiv.org/abs/2407.13522 -indicqa_tasks = [ - LightevalTaskConfig( - name=f"indicqa_{language.value}", - prompt_function=get_qa_prompt_function( - language, - lambda line: { - "question": line["question"], - "context": line["context"], - "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], - }, - ), - suite=("lighteval",), - hf_repo="ai4bharat/IndicQA", - hf_subset=f"indicqa.{LangCodeLanguage.get(language.value).language}", - hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), - hf_revision="92d96092ae229950973dac3b9998f8b3a8949b0a", - evaluation_splits=("test",), - hf_avail_splits=("test",), - generation_size=400, - metrics=( - MultilingualQuasiExactMatchMetric(language, "prefix"), - MultilingualQuasiF1ScoreMetric(language), - ), - stop_sequence=("\n",), - ) - for language in [ - Language.ASSAMESE, - Language.BENGALI, - Language.GUJARATI, - Language.HINDI, - Language.KANNADA, - Language.MALAYALAM, - Language.MARATHI, - Language.ORIYA, - Language.PUNJABI, - Language.TAMIL, - Language.TELUGU, - ] -] - -# FQuAD v2: French Question Answering Dataset version 2. -# https://arxiv.org/abs/2002.06071 -fquad_v2_tasks = [ - LightevalTaskConfig( - name=f"fquadv2_{Language.FRENCH.value}", - prompt_function=get_qa_prompt_function( - Language.FRENCH, - lambda line: { - "question": line["question"], - "context": line["context"], - "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], - }, - ), - suite=("lighteval",), - hf_repo="manu/fquad2_test", - hf_subset="default", - evaluation_splits=("test_hasAns",), - few_shots_split="valid_hasAns", - generation_size=400, - stop_sequence=("\n",), - metrics=( - MultilingualQuasiExactMatchMetric(Language.FRENCH, "prefix"), - MultilingualQuasiF1ScoreMetric(Language.FRENCH), - ), - ) -] - -# TQuAD v2: Turkish Question Answering Dataset version 2. -tquad_v2_tasks = [ - LightevalTaskConfig( - name=f"tquadv2_{Language.TURKISH.value}", - prompt_function=get_qa_prompt_function( - Language.TURKISH, - lambda line: { - "question": line["question"], - "context": line["context"], - "choices": [a["text"] for a in line["answers"]], - }, - ), - suite=("lighteval",), - hf_repo="erdometo/tquad2", - hf_subset="default", - evaluation_splits=("validation",), - few_shots_split="train", - generation_size=400, - stop_sequence=("\n",), - metrics=( - MultilingualQuasiExactMatchMetric(Language.TURKISH, "prefix"), - MultilingualQuasiF1ScoreMetric(Language.TURKISH), - ), - ) -] - -# Other QA tasks for RC - -# TyDi QA: A benchmark for information-seeking question answering in typologically diverse languages. 
-# https://arxiv.org/abs/2003.05002 -tydiqa_tasks = [ - LightevalTaskConfig( - name=f"tydiqa_{language.value}", - prompt_function=get_qa_prompt_function( - language, - lambda line: { - "question": line["question"], - "context": line["context"], - "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], - }, - ), - suite=("lighteval",), - hf_repo="google-research-datasets/tydiqa", - hf_subset="secondary_task", - evaluation_splits=("validation",), - few_shots_split="train", - generation_size=400, - stop_sequence=("\n",), - metrics=( - MultilingualQuasiExactMatchMetric(language, "prefix"), - MultilingualQuasiF1ScoreMetric(language), - ), - ) - for language in [ - Language.ENGLISH, - Language.ARABIC, - Language.BENGALI, - Language.FINNISH, - Language.INDONESIAN, - Language.JAPANESE, - Language.KOREAN, - Language.SWAHILI, - Language.RUSSIAN, - Language.TELUGU, - Language.THAI, - ] -] - -# C3: A Chinese Challenge Corpus for Cross-lingual and Cross-modal Tasks -# Reading comprehension task part of clue -# Paper: https://arxiv.org/abs/2004.05986 -c3_tasks = [ - LightevalTaskConfig( - name=f"c3_{Language.CHINESE.value}_{formulation.name.lower()}", - suite=("lighteval",), - prompt_function=get_mcq_prompt_function( - Language.CHINESE, - lambda line: { - "question": line["question"], - "choices": line["choice"], - "gold_idx": line["choice"].index(line["answer"]), - "context": " ".join(line["context"]), - }, - formulation=formulation, - ), - hf_repo="clue/clue", - hf_subset="c3", - evaluation_splits=("validation",), - few_shots_split="train", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -# Other MCF tasks for RC -# RACE: Reading Comprehension from Examinations -# RACE is a large-scale reading comprehension dataset collected from English exams for middle and high school Chinese students. -# This Arabic version is a translation of the original RACE dataset, adapted for Arabic language evaluation. -# Paper: https://aclanthology.org/2023.arabicnlp-1.21/ -race_ar_task = [ - LightevalTaskConfig( - name=f"alghafa_race_{Language.ARABIC.value}_{formulation.name.lower()}", - prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation), - suite=["lighteval"], - hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", - hf_subset="race_ar", - hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff", - hf_avail_splits=["test", "validation"], - evaluation_splits=["test"], - few_shots_split="validation", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] -# SOQAL: A large-scale Arabic reading comprehension dataset. 
-# https://arxiv.org/abs/1906.05394 -soqal_tasks = [ - LightevalTaskConfig( - name=f"soqal_{Language.ARABIC.value}_{formulation.name.lower()}", - hf_subset="multiple_choice_grounded_statement_soqal_task", - prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation), - evaluation_splits=["test"], - few_shots_split="validation", - suite=["lighteval"], - hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -# MLQA (MultiLingual Question Answering) is a benchmark dataset for evaluating cross-lingual question answering performance. -# It consists of QA instances in 7 languages: English, Arabic, German, Spanish, Hindi, Vietnamese, and Chinese. -# The dataset is derived from the SQuAD v1.1 dataset, with questions and contexts translated by professional translators. -# Paper: https://arxiv.org/abs/1910.07475 -mlqa_tasks = [ - LightevalTaskConfig( - name=f"mlqa_{lang.value}", - prompt_function=get_qa_prompt_function( - lang, - lambda line: { - "context": line["context"], - "question": line["question"], - "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], - }, - ), - suite=("lighteval",), - hf_repo="facebook/mlqa", - hf_subset=f"mlqa.{standardize_tag(lang.value)}.{standardize_tag(lang.value)}", - hf_revision="397ed406c1a7902140303e7faf60fff35b58d285", - evaluation_splits=("test",), - hf_avail_splits=["test"], - generation_size=400, - stop_sequence=("\n",), - metrics=[ - MultilingualQuasiExactMatchMetric(lang, "prefix"), - MultilingualQuasiF1ScoreMetric(lang), - ], - ) - for lang in [ - Language.ARABIC, - Language.GERMAN, - Language.SPANISH, - Language.CHINESE, - Language.HINDI, - Language.VIETNAMESE, - ] -] - -# Belebele: A large-scale reading comprehension dataset covering 122 languages. 
-# https://arxiv.org/abs/2308.16884 -belebele_tasks = [ - LightevalTaskConfig( - name=f"belebele_{language}_{formulation.name.lower()}", - prompt_function=get_mcq_prompt_function( - iso_639_3_ind_to_iso_639_3_macro[LangCodeLanguage.get(language).to_alpha3()], - lambda line: { - "question": line["question"], - "context": line["flores_passage"], - "choices": [line[f"mc_answer{i}"] for i in range(1, 5)], - "gold_idx": int(line["correct_answer_num"]) - 1, - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="facebook/belebele", - hf_subset=language, - evaluation_splits=("test",), - hf_avail_splits=["test"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] - for language in [ - "acm_Arab", - "arz_Arab", - "ceb_Latn", - "fin_Latn", - "hin_Deva", - "ita_Latn", - "khm_Khmr", - "lvs_Latn", - "npi_Deva", - "pol_Latn", - "slv_Latn", - "swe_Latn", - # "tso_Latn", - # "xho_Latn", - "afr_Latn", - "asm_Beng", - "ces_Latn", - "fra_Latn", - "hin_Latn", - "jav_Latn", - # "kin_Latn", - "mal_Mlym", - "npi_Latn", - "por_Latn", - # "sna_Latn", - "swh_Latn", - "tur_Latn", - "yor_Latn", - "als_Latn", - "azj_Latn", - "ckb_Arab", - # "fuv_Latn", - "hrv_Latn", - "jpn_Jpan", - "kir_Cyrl", - "mar_Deva", - # "nso_Latn", - "snd_Arab", - "tam_Taml", - "ukr_Cyrl", - "zho_Hans", - "amh_Ethi", - # "bam_Latn", - "dan_Latn", - # "gaz_Latn", - "hun_Latn", - # "kac_Latn", - "kor_Hang", - "mkd_Cyrl", - # "nya_Latn", - "ron_Latn", - "som_Latn", - "tel_Telu", - "urd_Arab", - "zho_Hant", - "apc_Arab", - "ben_Beng", - "deu_Latn", - # "grn_Latn", - "hye_Armn", - "kan_Knda", - "lao_Laoo", - "mlt_Latn", - "ory_Orya", - "rus_Cyrl", - # "sot_Latn", - "tgk_Cyrl", - "urd_Latn", - "zsm_Latn", - "arb_Arab", - "ben_Latn", - "ell_Grek", - "guj_Gujr", - # "ibo_Latn", - "kat_Geor", - # "lin_Latn", - # "mri_Latn", - "pan_Guru", - # "shn_Mymr", - "spa_Latn", - "tgl_Latn", - "uzn_Latn", - # "zul_Latn", - "arb_Latn", - # "bod_Tibt", - "eng_Latn", - # "hat_Latn", - # "ilo_Latn", - "kaz_Cyrl", - "lit_Latn", - "mya_Mymr", - "pbt_Arab", - "sin_Latn", - "srp_Cyrl", - "tha_Thai", - "vie_Latn", - "ars_Arab", - "bul_Cyrl", - "est_Latn", - # "hau_Latn", - "ind_Latn", - # "kea_Latn", - # "lug_Latn", - "nld_Latn", - "pes_Arab", - "sin_Sinh", - # "ssw_Latn", - # "tir_Ethi", - "war_Latn", - "ary_Arab", - "cat_Latn", - "eus_Latn", - "heb_Hebr", - "isl_Latn", - # "khk_Cyrl", - # "luo_Latn", - "nob_Latn", - "plt_Latn", - "slk_Latn", - # "sun_Latn", - # "tsn_Latn", - # "wol_Latn", - ] -] - -TASKS_TABLE.extend( - [ - *xquad_tasks, - *thaiqa_tasks, - *sber_squad_tasks, - *arcd_tasks, - *kenswquad_tasks, - *chinese_squad_tasks, - *cmrc2018_tasks, - *indicqa_tasks, - *fquad_v2_tasks, - *tquad_v2_tasks, - *tydiqa_tasks, - *soqal_tasks, - *race_ar_task, - *belebele_tasks, - *c3_tasks, - *squad_it_tasks, - *squad_es_tasks, - *faquad_tasks, - *germanquad_tasks, - ] -) - -# ------------------------------- GK Tasks ------------------------------- # -# General Knowledge (GK) tasks evaluate a model's broad understanding across various domains. -# These tasks typically involve answering questions on diverse subjects, testing the model's ability to recall and apply general information. 
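# All of the MMLU-style blocks below follow the same pattern: one
# LightevalTaskConfig per (subset, language, formulation) triple, generated
# with a nested list comprehension. A minimal sketch of that pattern with
# stand-in strings (the real configs use MMLU_SUBSETS, Language members and
# MCFFormulation/CFFormulation/HybridFormulation instances):
_sketch_subsets = ["abstract_algebra", "anatomy"]
_sketch_languages = ["de", "fr"]
_sketch_formulations = ["mcf", "cf", "hybrid"]

_sketch_task_names = [
    f"meta_mmlu_{lang}_{form}:{subset}"
    for subset in _sketch_subsets
    for lang in _sketch_languages
    for form in _sketch_formulations
]
assert len(_sketch_task_names) == 2 * 2 * 3  # one task name per combination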
- - -# -------------------------------- MMLU -------------------------------- # -# MMLU (Massive Multitask Language Understanding) -# A comprehensive test of world knowledge, covering 57 subjects across STEM, humanities, social sciences, and more. -# Note that all MMLU tasks uses PMI normalization, this makes the computation 2x slower, however we found this metric to be less noisy and yield better results than the others. -# Paper: https://arxiv.org/abs/2009.03300 -MMLU_SUBSETS = [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions", -] - -# Meta MMLU: A multilingual version of MMLU (using google translation) -# Paper: https://arxiv.org/abs/2407.21783 -meta_mmlu_tasks = [ - LightevalTaskConfig( - name=f"meta_mmlu_{language.value}_{formulation.name.lower()}:{subset}", - prompt_function=get_mcq_prompt_function( - language, - lambda line: { - "question": line["input_question"], - "choices": [v for _, v in sorted(line["input_choice_list"].items(), key=lambda x: x[0])], - "gold_idx": LETTER_INDICES.index(line["input_correct_responses"][0]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="meta-llama/Meta-Llama-3.1-8B-Instruct-evals", - hf_subset=f"Llama-3.1-8B-Instruct-evals__multilingual_mmlu_{standardize_tag(language.value)}__details", - hf_filter=partial( - lambda language, subset, line: line["subtask_name"] - == f"mmlu_{standardize_tag(language.value)}_chat.{subset}", - language, - subset, - ), - evaluation_splits=("latest",), - hf_avail_splits=["latest"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - LogLikelihoodAccMetric(normalization=LogProbPMINorm()), - ], - ), - ) - for subset in MMLU_SUBSETS - for language in [ - Language.GERMAN, - Language.SPANISH, - Language.FRENCH, - Language.HINDI, - Language.ITALIAN, - Language.PORTUGUESE, - Language.THAI, - ] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -# MLMM MMLU: Another multilingual version of MMLU -# Paper: https://github.com/nlp-uoregon/mlmm-evaluation -mlmm_mmlu_tasks = [ - LightevalTaskConfig( - name=f"mlmm_mmlu_{language.value}_{formulation.name.lower()}:{subset}", - 
prompt_function=get_mcq_prompt_function( - language, - lambda line: { - "question": line["question"], - "choices": line["choices"], - "gold_idx": LETTER_INDICES.index(line["answer"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="jon-tow/okapi_mmlu", - hf_subset=standardize_tag(language.value), - hf_revision="refs/pr/1", - hf_filter=partial(lambda subset, line: line["id"].split("/")[0] == subset, subset), - evaluation_splits=("test",), - few_shots_split="dev", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - LogLikelihoodAccMetric(normalization=LogProbPMINorm()), - ], - ), - ) - for subset in MMLU_SUBSETS - for language in [ - Language.RUSSIAN, - Language.GERMAN, - Language.CHINESE, - Language.FRENCH, - Language.SPANISH, - Language.ITALIAN, - Language.DUTCH, - Language.VIETNAMESE, - Language.INDONESIAN, - Language.ARABIC, - Language.HUNGARIAN, - Language.ROMANIAN, - Language.DANISH, - Language.SLOVAK, - Language.UKRAINIAN, - Language.CATALAN, - Language.SERBIAN, - Language.CROATIAN, - Language.HINDI, - Language.BENGALI, - Language.TAMIL, - Language.NEPALI, - Language.MALAYALAM, - Language.MARATHI, - Language.TELUGU, - Language.KANNADA, - ] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -openai_mmlu_tasks = [ - LightevalTaskConfig( - name=f"openai_mmlu_{language[0].value}_{formulation.name.lower()}:{subset}", - prompt_function=get_mcq_prompt_function( - language[0], - lambda line: { - "question": line["Question"], - "choices": [line["A"], line["B"], line["C"], line["D"]], - "gold_idx": LETTER_INDICES.index(line["Answer"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="openai/MMMLU", - hf_subset=language[1], - evaluation_splits=("test",), - hf_avail_splits=["test"], - hf_filter=partial(lambda subset, x: x["Subject"].lower() == subset, subset), - hf_revision="038c7808122969ead7456361af05cb8f47d247f8", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - LogLikelihoodAccMetric(normalization=LogProbPMINorm()), - ], - ), - ) - for subset in MMLU_SUBSETS - for language in [ - (Language.ARABIC, "AR_XY"), - (Language.BENGALI, "BN_BD"), - (Language.GERMAN, "DE_DE"), - (Language.SPANISH, "ES_LA"), - (Language.FRENCH, "FR_FR"), - (Language.HINDI, "HI_IN"), - (Language.INDONESIAN, "ID_ID"), - (Language.ITALIAN, "IT_IT"), - (Language.JAPANESE, "JA_JP"), - (Language.KOREAN, "KO_KR"), - (Language.PORTUGUESE, "PT_BR"), - (Language.SWAHILI, "SW_KE"), - (Language.YORUBA, "YO_NG"), - (Language.CHINESE, "ZH_CN"), - ] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -# Translated MMLU using both professional and non-professional translators. Contains tags for cultural sensitivity. 
-# CA: Cultural Agnostic -# CS: Cultural Specific -# UNK: Not annotated -# ALL: All of the above -# https://huggingface.co/papers/2412.03304 -global_mmlu_tasks = [ - LightevalTaskConfig( - name=f"global_mmlu_{sensitivity_label.lower()}_{language.value}_{formulation.name.lower()}:{subset}", - prompt_function=get_mcq_prompt_function( - language, - lambda line: { - "question": line["question"], - "choices": [line["option_a"], line["option_b"], line["option_c"], line["option_d"]], - "gold_idx": LETTER_INDICES.index(line["answer"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="CohereForAI/Global-MMLU", - hf_subset=standardize_tag(language.value), - evaluation_splits=("test",), - few_shots_split="dev", - hf_filter=partial( - lambda subset, sensitivity_label, x: x["subject"].lower() == subset - and ( - sensitivity_label == "ALL" or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK") - ) - and all(x[f"option_{opt}"] is not None and x[f"option_{opt}"].strip() for opt in "abcd"), - subset, - sensitivity_label, - ), - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - LogLikelihoodAccMetric(normalization=LogProbPMINorm()), - ], - ), - ) - for subset in MMLU_SUBSETS - for language in [ - Language.AMHARIC, - Language.ARABIC, - Language.BENGALI, - Language.CHINESE, - Language.CZECH, - Language.GERMAN, - Language.ENGLISH, - Language.SPANISH, - Language.FRENCH, - Language.HEBREW, - Language.HINDI, - Language.INDONESIAN, - Language.ITALIAN, - Language.JAPANESE, - Language.KOREAN, - Language.MALAY, - Language.DUTCH, - Language.NORWEGIAN, - Language.POLISH, - Language.PORTUGUESE, - Language.ROMANIAN, - Language.RUSSIAN, - Language.SERBIAN, - Language.SWEDISH, - Language.SWAHILI, - Language.TAMIL, - Language.TELUGU, - Language.THAI, - Language.TURKISH, - Language.UKRAINIAN, - Language.URDU, - Language.VIETNAMESE, - Language.YORUBA, - Language.ZULU, - ] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] - for sensitivity_label in ["ALL", "CA", "CS", "UNK"] -] - - -# There are only these subsets in the African MMLU -AFRI_MMLU_SUBSETS = [ - "elementary_mathematics", - "high_school_mathematics", - "high_school_geography", - "high_school_microeconomics", - "international_law", - "global_facts", -] -# African MMLU: African Massive Multitask Language Understanding -# From https://arxiv.org/abs/2406.03368. Human translated MMLU. 
-afri_mmlu_tasks = [ - LightevalTaskConfig( - name=f"afri_mmlu_{language.value}_{formulation.name.lower()}:{subset}", - prompt_function=get_mcq_prompt_function( - language, - lambda line: { - "question": line["question"], - "choices": line["choices"], - "gold_idx": LETTER_INDICES.index(line["answer"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="masakhane/afrimmlu", - # Temporary until the pr is merged - hf_revision="refs/pr/1", - hf_subset=language.value, - hf_filter=partial(lambda subset, line: line["subject"] == subset, subset), - evaluation_splits=("test",), - few_shots_split="dev", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - LogLikelihoodAccMetric(normalization=LogProbPMINorm()), - ], - ), - ) - for subset in AFRI_MMLU_SUBSETS - for language in [ - Language.AMHARIC, - # Language.EWE, - Language.FRENCH, - # Language.HAUSA, - # Language.IGBO, - # Language.KINYARWANDA, - # Language.LINGALA, - # Language.LUGANDA, - # Language.OROMO, - # Language.SHONA, - # Language.SOTHO, - Language.SWAHILI, - # Language.TWI, - # Language.WOLOF, - # Language.XHOSA, - Language.YORUBA, - # Language.ZULU, - ] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -# RUMMLU: Russian Massive Multitask Language Understanding -# Paper: https://arxiv.org/html/2401.04531v2 -rummlu = [ - LightevalTaskConfig( - name=f"rummlu_{Language.RUSSIAN.value}_{formulation.name.lower()}:{subset}", - prompt_function=get_mcq_prompt_function( - Language.RUSSIAN, - lambda line: { - "question": line["inputs"]["text"], - "choices": [line["inputs"][f"option_{i.lower()}"] for i in LETTER_INDICES[:4]], - "gold_idx": LETTER_INDICES.index(line["outputs"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="ai-forever/MERA", - hf_subset="rummlu", - hf_filter=lambda x: x["meta"]["domain"] == subset, - evaluation_splits=("public_test",), - hf_avail_splits=["public_test"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - LogLikelihoodAccMetric(normalization=LogProbPMINorm()), - ], - ), - ) - for subset in MMLU_SUBSETS - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -# MMLU Turkish: Turkish version of MMLU -# Translated using openai GPT -mmlu_turkish = [ - LightevalTaskConfig( - name=f"community_mmlu_{Language.TURKISH.value}_{formulation.name.lower()}:{subset}", - prompt_function=get_mcq_prompt_function( - Language.TURKISH, - lambda line: {"question": line["question"], "choices": line["choices"], "gold_idx": int(line["answer"])}, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="malhajar/mmlu_tr-v0.2", - hf_subset=subset, - evaluation_splits=("test",), - few_shots_split="dev", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - LogLikelihoodAccMetric(normalization=LogProbPMINorm()), - ], - ), - ) - for subset in MMLU_SUBSETS - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -# CMMLU: Chinese Massive Multitask Language Understanding -# Native translation with some new categories -# Paper: https://arxiv.org/abs/2306.09212 -CMMLU_SUBSETS = [ - "agronomy", - "anatomy", - 
"ancient_chinese", - "arts", - "astronomy", - "business_ethics", - "chinese_civil_service_exam", - "chinese_driving_rule", - "chinese_food_culture", - "chinese_foreign_policy", - "chinese_history", - "chinese_literature", - "chinese_teacher_qualification", - "clinical_knowledge", - "college_actuarial_science", - "college_education", - "college_engineering_hydrology", - "college_law", - "college_mathematics", - "college_medical_statistics", - "college_medicine", - "computer_science", - "computer_security", - "conceptual_physics", - "construction_project_management", - "economics", - "education", - "electrical_engineering", - "elementary_chinese", - "elementary_commonsense", - "elementary_information_and_technology", - "elementary_mathematics", - "ethnology", - "food_science", - "genetics", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_geography", - "high_school_mathematics", - "high_school_physics", - "high_school_politics", - "human_sexuality", - "international_law", - "journalism", - "jurisprudence", - "legal_and_moral_basis", - "logical", - "machine_learning", - "management", - "marketing", - "marxist_theory", - "modern_chinese", - "nutrition", - "philosophy", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_study", - "sociology", - "sports_science", - "traditional_chinese_medicine", - "virology", - "world_history", - "world_religions", -] - -cmmlu_tasks = [ - LightevalTaskConfig( - name=f"cmmlu_{Language.CHINESE.value}_{formulation.name.lower()}:{subset}", - prompt_function=get_mcq_prompt_function( - Language.CHINESE, - lambda line: { - "question": line["Question"], - "choices": [line["A"], line["B"], line["C"], line["D"]], - "gold_idx": LETTER_INDICES.index(line["Answer"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="haonan-li/cmmlu", - hf_subset=subset, - evaluation_splits=("test",), - few_shots_split="dev", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - LogLikelihoodAccMetric(normalization=LogProbPMINorm()), - ], - ), - ) - for subset in CMMLU_SUBSETS - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -# Arabic MMLU: Arabic version of MMLU -# Native translation with some new categories -# Paper: https://arxiv.org/html/2402.12840v1 -ARABIC_MMLU_SUBSETS = [ - "Islamic Studies", - "Islamic Studies (Middle School)", - "Islamic Studies (Primary School)", - "Islamic Studies (High School)", - "Driving Test", - "Natural Science (Middle School)", - "Natural Science (Primary School)", - "History (Middle School)", - "History (Primary School)", - "History (High School)", - "General Knowledge", - "General Knowledge (Middle School)", - "General Knowledge (Primary School)", - "Law (Professional)", - "Physics (High School)", - "Social Science (Middle School)", - "Social Science (Primary School)", - "Management (University)", - "Arabic Language (Middle School)", - "Arabic Language (Primary School)", - "Arabic Language (High School)", - "Political Science (University)", - "Philosophy (High School)", - "Accounting (University)", - "Computer Science (Middle School)", - "Computer Science (Primary School)", - "Computer Science (High School)", - "Computer Science (University)", - "Geography (Middle School)", - "Geography (Primary School)", - "Geography (High School)", - "Math 
(Primary School)", - "Biology (High School)", - "Economics (Middle School)", - "Economics (High School)", - "Economics (University)", - "Arabic Language (General)", - "Arabic Language (Grammar)", - "Civics (Middle School)", - "Civics (High School)", -] - -arabic_mmlu_tasks = [ - LightevalTaskConfig( - name=f"mmlu_{Language.ARABIC.value}_{formulation.name.lower()}:{normalize_subset(subset)}", - prompt_function=get_mcq_prompt_function( - Language.ARABIC, - lambda line: { - "context": line["Context"], - "question": line["Question"], - "choices": [str(o) for o in [line[f"Option {i}"] for i in range(1, 6)] if o], - "gold_idx": LETTER_INDICES.index(line["Answer Key"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="MBZUAI/ArabicMMLU", - hf_subset=subset, - evaluation_splits=("test",), - hf_avail_splits=["dev"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - LogLikelihoodAccMetric(normalization=LogProbPMINorm()), - ], - ), - ) - for subset in ARABIC_MMLU_SUBSETS - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - - -TURKISH_MMLU_SUBSET = [ - "Biology", - "Chemistry", - "Geography", - "History", - "Mathematics", - "Philosophy", - "Physics", - "Religion_and_Ethics", - "Turkish_Language_and_Literature", -] - -turkish_mmlu_tasks = [ - LightevalTaskConfig( - name=f"mmlu_{Language.TURKISH.value}_{formulation.name.lower()}:{normalize_subset(subset)}", - prompt_function=get_mcq_prompt_function( - Language.TURKISH, - lambda line: { - "question": line["question"], - "choices": line["choices"], - "gold_idx": LETTER_INDICES.index(line["answer"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="AYueksel/TurkishMMLU", - hf_subset=subset, - evaluation_splits=("test",), - few_shots_split="dev", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - LogLikelihoodAccMetric(normalization=LogProbPMINorm()), - ], - ), - ) - for subset in TURKISH_MMLU_SUBSET - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -TASKS_TABLE.extend( - [ - *meta_mmlu_tasks, - *mlmm_mmlu_tasks, - *rummlu, - *mmlu_turkish, - *cmmlu_tasks, - *openai_mmlu_tasks, - *arabic_mmlu_tasks, - *turkish_mmlu_tasks, - *afri_mmlu_tasks, - *global_mmlu_tasks, - ] -) - - -# ---------------------------- ARC ---------------------------- # -# ARC (AI2 Reasoning Challenge) is a dataset for question answering that requires reasoning. -# It consists of multiple-choice science questions from 3rd to 9th grade exams. -# The dataset is split into two parts: ARC-Easy and ARC-Challenge. -# ARC-Easy contains questions that can be answered correctly by both humans and simple baseline models. -# ARC-Challenge contains questions that are difficult for both humans and current AI systems. - -# Similar to MMLU, ARC tasks uses PMI normalization by default but only for the challenge set. 
- - -# github: https://github.com/nlp-uoregon/mlmm-evaluation -mlmm_arc_challenge_tasks = [ - LightevalTaskConfig( - name=f"mlmm_arc_{language.value}_{formulation.name.lower()}:challenge", - prompt_function=get_mcq_prompt_function( - language, - lambda line: { - "question": line["question"], - "choices": line["choices"]["text"], - "gold_idx": int(line["answerKey"]) - 1 - if line["answerKey"].isdigit() - else LETTER_INDICES.index(line["answerKey"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="jon-tow/okapi_arc_challenge", - hf_subset=standardize_tag(language.value), - hf_revision="823d5d7bfaf8974a3ab52a825b6cf4903b35dbc4", - evaluation_splits=("test",), - few_shots_split="train", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - LogLikelihoodAccMetric(normalization=LogProbPMINorm()), - ], - ), - ) - for language in [ - Language.RUSSIAN, - Language.GERMAN, - Language.CHINESE, - Language.FRENCH, - Language.SPANISH, - Language.ITALIAN, - Language.DUTCH, - Language.VIETNAMESE, - Language.INDONESIAN, - Language.ARABIC, - Language.HUNGARIAN, - Language.ROMANIAN, - Language.DANISH, - Language.SLOVAK, - Language.UKRAINIAN, - Language.CATALAN, - Language.SERBIAN, - Language.CROATIAN, - Language.HINDI, - Language.BENGALI, - Language.TAMIL, - Language.NEPALI, - Language.MALAYALAM, - Language.MARATHI, - Language.TELUGU, - Language.KANNADA, - ] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -# Arabic ARC Easy -# It's based on the community arabic leaderboard task but uses -# the multilingual template -# Paper: https://aclanthology.org/2023.arabicnlp-1.21/ -arabic_ledarboard_arc_easy = [ - LightevalTaskConfig( - name=f"alghafa_arc_{Language.ARABIC.value}_{formulation.name.lower()}:easy", - prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation), - suite=["lighteval"], - hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", - hf_subset="arc_easy_ar", - hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff", - evaluation_splits=["test"], - few_shots_split="validation", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - - -lumi_arc = [ - LightevalTaskConfig( - name=f"lumi_arc_{language.value}_{formulation.name.lower()}:challenge", - prompt_function=get_mcq_prompt_function( - language, - lambda line: { - "question": line["question"], - "choices": line["choices"]["text"], - "gold_idx": int(line["answerKey"]) - 1 - if line["answerKey"].isdigit() - else LETTER_INDICES.index(line["answerKey"]), - }, - formulation=formulation, - ), - suite=["lighteval"], - hf_repo="LumiOpen/arc_challenge_mt", - hf_subset=standardize_tag(language.value), - evaluation_splits=["test"], - few_shots_split="validation", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - LogLikelihoodAccMetric(normalization=LogProbPMINorm()), - ], - ), - ) - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] - for language in [ - Language.DANISH, - Language.GERMAN, - Language.GREEK, - Language.SPANISH, - 
Language.FINNISH, - Language.HUNGARIAN, - Language.ITALIAN, - # Language.NORWEGIAN_BOKMAL, - Language.POLISH, - Language.PORTUGUESE, - Language.SWEDISH, - ] -] - -# Turkish ARC -# Comes from the Turkish leaderboard -turkish_arc_tasks = [ - LightevalTaskConfig( - name=f"community_arc_{Language.TURKISH.value}_{formulation.name.lower()}:{subset}", - prompt_function=get_mcq_prompt_function( - Language.TURKISH, - lambda line: { - "question": line["question"], - "choices": line["choices"]["text"], - "gold_idx": int(line["answerKey"]) - 1 - if line["answerKey"].isdigit() - else LETTER_INDICES.index(line["answerKey"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="malhajar/arc-tr", - hf_subset=f"ARC-{subset.capitalize()}", - evaluation_splits=("test",), - hf_avail_splits=["train"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ] - + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore - ), - ) - for subset in ["easy", "challenge"] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -hindi_arc_tasks = [ - LightevalTaskConfig( - name=f"community_arc_{Language.HINDI.value}_{formulation.name.lower()}:{subset}", - prompt_function=get_mcq_prompt_function( - Language.HINDI, - lambda line: { - "question": line["question"], - "choices": line["choices"]["text"], - "gold_idx": int(line["answerKey"]) - 1 - if line["answerKey"].isdigit() - else LETTER_INDICES.index(line["answerKey"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="ai4bharat/ai2_arc-hi", - hf_subset=f"ARC-{subset.capitalize()}", - evaluation_splits=("test",), - few_shots_split="validation", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ] - + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore - ), - ) - for subset in ["easy", "challenge"] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -arabic_arc_tasks = [ - LightevalTaskConfig( - name=f"alghafa_arc_{Language.ARABIC.value}_{formulation.name.lower()}:easy", - prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation), - suite=["lighteval"], - hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", - hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff", - hf_subset="arc_easy_ar", - evaluation_splits=["test"], - few_shots_split="validation", - few_shots_select="sequential", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -swahili_arc_tasks = [ - LightevalTaskConfig( - name=f"community_arc_{Language.SWAHILI.value}_{formulation.name.lower()}:{subset}", - prompt_function=get_mcq_prompt_function( - Language.SWAHILI, - lambda line: { - "question": line["question"], - "choices": line["choices"]["text"], - "gold_idx": int(line["answerKey"]) - 1 - if line["answerKey"].isdigit() - else LETTER_INDICES.index(line["answerKey"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - 
hf_repo=f"Mollel/ARC_{subset.capitalize()}_SWH", - hf_subset="default", - hf_revision="5347439d3193c8a0dabaab3819914bf076dc94d4" - if subset == "easy" - else "dc1df9df632d14c251594d9129fb833d2ca4429c", - evaluation_splits=("test",), - few_shots_split="train", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ] - + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore - ), - ) - for subset in ["easy", "challenge"] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - - -TASKS_TABLE.extend( - [ - *mlmm_arc_challenge_tasks, - *arabic_ledarboard_arc_easy, - *lumi_arc, - *turkish_arc_tasks, - *hindi_arc_tasks, - *swahili_arc_tasks, - *arabic_arc_tasks, - ] -) - -# ---------------------------- TruthfulQA ---------------------------- # -# TruthfulQA: Measuring How Models Mimic Human Falsehoods -# Paper: https://arxiv.org/abs/2109.07958 -# TruthfulQA is a benchmark dataset designed to measure the truthfulness of language models. -# It consists of questions that humans might answer incorrectly due to false beliefs or misconceptions. -# The task evaluates a model's ability to provide truthful answers and avoid common human biases. - -# github: https://github.com/nlp-uoregon/mlmm-evaluation -mlmm_truthfulqa_tasks = [ - LightevalTaskConfig( - name=f"mlmm_truthfulqa_{language.value}_{formulation.name.lower()}:{subset}", - prompt_function=get_mcq_prompt_function( - language, - partial( - lambda subset, line: { - "question": line["question"], - "choices": line[f"{subset}_targets"]["choices"], - "gold_idx": [ix for ix, label in enumerate(line[f"{subset}_targets"]["labels"]) if label == 1], # type: ignore - }, - subset, - ), - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="jon-tow/okapi_truthfulqa", - hf_subset=standardize_tag(language.value), - hf_revision="cdd5db1a66fd04105622109d1c2a5cbc8cde7586", - evaluation_splits=("validation",), - hf_avail_splits=["validation"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for subset in ["mc1", "mc2"] - for language in [ - Language.ARABIC, - Language.BENGALI, - Language.CATALAN, - Language.DANISH, - Language.GERMAN, - Language.SPANISH, - Language.BASQUE, - Language.FRENCH, - Language.GUJARATI, - Language.HINDI, - Language.CROATIAN, - Language.HUNGARIAN, - Language.ARMENIAN, - Language.INDONESIAN, - Language.ICELANDIC, - Language.ITALIAN, - Language.KANNADA, - Language.MALAYALAM, - Language.MARATHI, - Language.NORWEGIAN, - Language.NEPALI, - Language.DUTCH, - Language.PORTUGUESE, - Language.ROMANIAN, - Language.RUSSIAN, - Language.SLOVAK, - Language.SERBIAN, - Language.SWEDISH, - Language.TAMIL, - Language.TELUGU, - Language.UKRAINIAN, - Language.VIETNAMESE, - Language.CHINESE, - ] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -# Turkish TruthfulQA -# Based on turkish leaderboard -turkish_truthfulqa = [ - LightevalTaskConfig( - name=f"community_truthfulqa_{Language.TURKISH.value}_{formulation.name.lower()}:{subset}", - prompt_function=get_mcq_prompt_function( - Language.TURKISH, - partial( - lambda subset, line: { - "question": line["question"], - "choices": line[f"{subset}_targets"]["choices"], - "gold_idx": [ix for ix, label in 
enumerate(line[f"{subset}_targets"]["labels"]) if label == 1], # type: ignore - }, - subset, - ), - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="malhajar/truthful_qa-tr-v0.2", - hf_subset="default", - evaluation_splits=("validation",), - hf_avail_splits=["validation"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for subset in ["mc1", "mc2"] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -TASKS_TABLE.extend( - [ - *mlmm_truthfulqa_tasks, - *turkish_truthfulqa, - ] -) - -# ---------------------------- Exams like tasks ---------------------------- # - -# Exams: A collection of exam questions from various countries and subjects -# Paper: https://arxiv.org/abs/2011.03080 -exams_subjects_by_lang: dict[Language, set[str]] = { - Language.ARABIC: {"Biology", "Islamic Studies", "Physics", "Science", "Social"}, - Language.BULGARIAN: {"Biology", "Chemistry", "Geography", "History", "Philosophy", "Physics"}, - Language.CROATIAN: { - "Biology", - "Chemistry", - "Ethics", - "Fine Arts", - "Geography", - "Geology", - "History", - "Informatics", - "Philosophy", - "Physics", - "Politics", - "Psychology", - "Religion", - "Sociology", - }, - Language.HUNGARIAN: { - "Agriculture", - "Agriculture (Mechanical knowledge)", - "Biology", - "Chemistry", - "Economics", - "Economics & Marketing", - "Economics Basics (Business)", - "Economics Basics (Theoretical)", - "Forestry", - "Geography", - "Landscaping", - "Physics", - "Politics", - "Tourism", - }, - Language.ITALIAN: { - "Biology", - "Chemistry", - "Ethics", - "Geography", - "Geology", - "History", - "Informatics", - "Philosophy", - "Physics", - "Politics", - "Psychology", - "Sociology", - }, - Language.SERBIAN: { - "Biology", - "Chemistry", - "Ethics", - "Geography", - "Geology", - "History", - "Informatics", - "Philosophy", - "Physics", - "Politics", - "Psychology", - "Religion", - "Sociology", - }, - Language.FRENCH: {"Economics", "Economics & Marketing", "Economics Basics (Theoretical)", "Geography", "Physics"}, - Language.GERMAN: { - "Chemistry", - "Economics", - "Economics & Marketing", - "Economics Basics (Theoretical)", - "Geography", - "Physics", - "Tourism", - }, - Language.SPANISH: {"Geography", "Physics"}, - Language.LITHUANIAN: {"Geology", "History"}, - Language.ALBANIAN: { - "Biology", - "Business", - "Chemistry", - "Fine Arts", - "History", - "Philosophy", - "Physics", - "Sociology", - }, - Language.MACEDONIAN: { - "Biology", - "Business", - "Chemistry", - "Fine Arts", - "History", - "Philosophy", - "Physics", - "Sociology", - }, - Language.TURKISH: { - "Biology", - "Business", - "Chemistry", - "Geography", - "History", - "Philosophy", - "Physics", - "Sociology", - }, - Language.POLISH: {"Professional"}, - Language.PORTUGUESE: {"Biology", "Economics", "Geology", "Philosophy"}, - Language.VIETNAMESE: {"Biology", "Chemistry", "Citizenship", "Geography", "History", "Physics"}, -} - -exams_tasks = [ - LightevalTaskConfig( - name=f"exams_{language.value}_{formulation.name.lower()}:{normalize_subset(subject)}", - prompt_function=get_mcq_prompt_function( - language, - lambda line: { - "question": line["question"]["stem"], - "choices": line["question"]["choices"]["text"], - "gold_idx": line["question"]["choices"]["label"].index(line["answerKey"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="mhardalov/exams", - 
hf_subset="multilingual", - # Weird bug in dataset - hf_filter=partial( - lambda language, subject, line: line["answerKey"] != "@" - and line["info"]["language"] == LangCodeLanguage(standardize_tag(language.value)).language_name() - and line["info"]["subject"] == subject, - language, - subject, - ), - evaluation_splits=("test",), - few_shots_split="train", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for language in exams_subjects_by_lang.keys() - for subject in exams_subjects_by_lang[language] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -# M3Exam: Multitask Multilingual Multimodal Evaluation Benchmark -# It also contains a multimodal version but we don't support that -# Paper: https://arxiv.org/abs/2306.05179 -m3exams_tasks = [ - LightevalTaskConfig( - name=f"m3exams_{language.value}_{formulation.name.lower()}", - suite=("lighteval",), - prompt_function=get_mcq_prompt_function( - language, - partial(get_m3exam_adapter, language), - formulation=formulation, - ), - hf_repo="chiayewken/m3exam", - hf_subset=LangCodeLanguage(standardize_tag(language.value)).language_name().lower(), - evaluation_splits=("test",), - few_shots_split="dev", - generation_size=-1, - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for language in [ - Language.AFRIKAANS, - Language.CHINESE, - Language.ENGLISH, - Language.ITALIAN, - Language.JAVANESE, - Language.PORTUGUESE, - Language.SWAHILI, - Language.THAI, - Language.VIETNAMESE, - ] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -# Thai Exams -# We noticed very bad performance of models on this dataset -# However, it may just be because quality of the models themselves -# Paper: https://arxiv.org/abs/2312.13951 - -THAI_EXAMS_SUBSETS = ["a_level", "ic", "onet", "tgat", "tpat1"] - -thai_exams_tasks = [ - LightevalTaskConfig( - name=f"thai_exams_{Language.THAI.value}_{formulation.name.lower()}:{subset}", - prompt_function=get_mcq_prompt_function(Language.THAI, thai_exams_adapter, formulation=formulation), - suite=("lighteval",), - hf_repo="scb10x/thai_exam", - hf_subset=subset, - evaluation_splits=("test",), - few_shots_split="train", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for subset in THAI_EXAMS_SUBSETS - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -TASKS_TABLE.extend( - [ - *exams_tasks, - *m3exams_tasks, - *thai_exams_tasks, - ] -) - -# ------------------------------- XCSQA ------------------------------- # -# XCSQA (Cross-lingual Commonsense QA) is part of the XCSR (Cross-lingual Commonsense Reasoning) benchmark -# It is a multilingual extension of the CommonsenseQA dataset, covering 16 languages -# The task involves answering multiple-choice questions that require commonsense reasoning -# Uses PMI normalization -# Paper: https://arxiv.org/abs/2110.08462 -xcsqa_tasks = [ - LightevalTaskConfig( - name=f"xcsqa_{language.value}_{formulation.name.lower()}", - prompt_function=get_mcq_prompt_function( - language, - lambda line: { - "question": line["question"]["stem"], - "choices": 
line["question"]["choices"]["text"], - "gold_idx": line["question"]["choices"]["label"].index(line["answerKey"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="INK-USC/xcsr", - hf_subset=f"X-CSQA-{standardize_tag(language.value) if language != Language.JAPANESE else 'jap'}", - hf_filter=lambda x: all( - len(x["question"]["choices"]["text"][i].strip()) > 0 for i in range(len(x["question"]["choices"]["text"])) - ), - evaluation_splits=("validation",), - hf_avail_splits=["validation"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - LogLikelihoodAccMetric(normalization=LogProbPMINorm()), - ], - ), - ) - for language in [ - Language.ARABIC, - Language.GERMAN, - Language.ENGLISH, - Language.SPANISH, - Language.FRENCH, - Language.HINDI, - Language.ITALIAN, - Language.JAPANESE, - Language.DUTCH, - Language.POLISH, - Language.PORTUGUESE, - Language.RUSSIAN, - Language.SWAHILI, - Language.URDU, - Language.VIETNAMESE, - Language.CHINESE, - ] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -TASKS_TABLE.extend( - [ - *xcsqa_tasks, - ] -) - -# ------------------------------- PIQA ------------------------------- # -# PIQA: Physical Interaction Question Answering -# PIQA is a benchmark for testing physical commonsense reasoning. -# This Arabic version is a translation of the original PIQA dataset, adapted for Arabic language evaluation. -# It tests the ability to reason about physical interactions in everyday situations. -# Paper: https://arxiv.org/abs/1911.11641 -# Arabic version: https://aclanthology.org/2023.arabicnlp-1.21/ -piqa_ar_tasks = [ - LightevalTaskConfig( - name=f"alghafa_piqa_{Language.ARABIC.value}_{formulation.name.lower()}", - prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation), - suite=["lighteval"], - hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", - hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff", - hf_subset="piqa_ar", - hf_avail_splits=["test", "validation"], - evaluation_splits=["test"], - few_shots_split="validation", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -TASKS_TABLE.extend( - [ - *piqa_ar_tasks, - ] -) - -# ------------------------------- OpenBookQA ------------------------------- # -# OpenBookQA: A Question-Answering Dataset for Open-Book Exams -# OpenBookQA is a question-answering dataset modeled after open-book exams for assessing human understanding of a subject. -# It consists of multiple-choice questions that require combining facts from a given open book with broad common knowledge. -# The task tests language models' ability to leverage provided information and apply common sense reasoning. 
-# Original paper: https://arxiv.org/abs/1809.02789 -# Arabic version: https://aclanthology.org/2023.arabicnlp-1.21/ -openbook_ara_tasks = [ - LightevalTaskConfig( - name=f"alghafa_openbookqa_{Language.ARABIC.value}_{formulation.name.lower()}", - prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation), - suite=["lighteval"], - hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", - hf_subset="openbook_qa_ext_ar", - hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff", - evaluation_splits=["test"], - few_shots_split="validation", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -# Spanish version of OpenBookQA from BSC Language Technology group -# Dataset: https://huggingface.co/datasets/BSC-LT/openbookqa-es -openbook_es_tasks = [ - LightevalTaskConfig( - name=f"openbookqa_{Language.SPANISH.value}_{formulation.name.lower()}", - prompt_function=get_mcq_prompt_function( - Language.SPANISH, - lambda line: { - "question": line["question_stem"], - "choices": line["choices"]["text"], - "gold_idx": LETTER_INDICES.index(line["answerKey"]), - }, - formulation=formulation, - ), - suite=["lighteval"], - hf_repo="BSC-LT/openbookqa-es", - hf_subset="default", - evaluation_splits=("test",), - few_shots_split="validation", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - - -# The Russian version is part of the MERA (Multilingual Enhanced Russian NLP Architectures) project. -# Paper: https://arxiv.org/abs/2401.04531 -openbook_rus_tasks = [ - LightevalTaskConfig( - name=f"mera_openbookqa_{Language.RUSSIAN.value}_{formulation.name.lower()}", - prompt_function=get_mcq_prompt_function( - Language.RUSSIAN, - lambda line: { - "question": line["inputs"]["question"], - "choices": [line["inputs"][f"option_{i.lower()}"] for i in LETTER_INDICES[:4]], - "gold_idx": LETTER_INDICES.index(line["outputs"]), - }, - formulation=formulation, - ), - suite=["lighteval"], - hf_repo="ai-forever/MERA", - hf_subset="ruopenbookqa", - evaluation_splits=("train",), - hf_avail_splits=["train"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -TASKS_TABLE.extend( - [ - *openbook_rus_tasks, - *openbook_ara_tasks, - *openbook_es_tasks, - ] -) - -# ------------------------------- SciQ ------------------------------- # -# SciQ: Science Question Answering -# SciQ is a question-answering dataset designed to evaluate the ability of language models to answer science questions. -# It consists of multiple-choice questions that require scientific reasoning and factual knowledge. - -# The Arabic version is part of the AlGhafa Arabic LLM Benchmark, a translation and adaptation of various English datasets. 
-# Paper: https://aclanthology.org/2023.arabicnlp-1.21/ -sciqa_ar_task = [ - LightevalTaskConfig( - name=f"alghafa_sciqa_{Language.ARABIC.value}_{formulation.name.lower()}", - prompt_function=get_mcq_prompt_function( - Language.ARABIC, - sciqa_adapter, - formulation=formulation, - ), - suite=["lighteval"], - hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", - hf_subset="sciq_ar", - hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff", - hf_avail_splits=["test", "validation"], - evaluation_splits=["test"], - few_shots_split="validation", - few_shots_select="sequential", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -TASKS_TABLE.extend( - [ - *sciqa_ar_task, - ] -) - -# ------------------------------- Math Tasks ------------------------------- # - -# MathLogicQA is a dataset for evaluating mathematical reasoning in language models. -# It consists of multiple-choice questions that require logical reasoning and mathematical problem-solving. -# This Russian version is part of the MERA (Multilingual Evaluation of Reasoning Abilities) benchmark. -# MERA: https://github.com/ai-forever/MERA -mathlogicqa_rus_tasks = [ - LightevalTaskConfig( - name=f"mathlogic_qa_{Language.RUSSIAN.value}_{formulation.name.lower()}", - prompt_function=get_mcq_prompt_function( - Language.RUSSIAN, - lambda line: { - "question": line["inputs"]["text"], - "choices": [line["inputs"][f"option_{i.lower()}"] for i in LETTER_INDICES[:4]], - "gold_idx": LETTER_INDICES.index(line["outputs"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="ai-forever/MERA", - hf_subset="mathlogicqa", - evaluation_splits=("train",), - hf_avail_splits=["train"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [ - CFFormulation(), - MCFFormulation(), - HybridFormulation(), - ] -] - -cmath_tasks = [ - LightevalTaskConfig( - name=f"cmath_{Language.CHINESE.value}", - prompt_function=get_qa_prompt_function( - Language.CHINESE, - lambda line: { - "question": line["question"], - "choices": [line["golden"]], - }, - ), - suite=("lighteval",), - hf_repo="weitianwen/cmath", - hf_subset="default", - evaluation_splits=("test",), - few_shots_split="validation", - generation_size=25, - metrics=[ - MultilingualQuasiExactMatchMetric(Language.CHINESE, "full"), - ], - stop_sequence=("\n",), - ) -] - -mgsm_tasks = [ - LightevalTaskConfig( - name=f"mgsm_{language.value}", - prompt_function=get_qa_prompt_function( - language, - lambda line: { - "question": line["question"], - # The cot is available but we have no use: - # line["answer"] - "choices": [str(line["answer_number"])], - }, - ), - suite=("lighteval",), - hf_repo="juletxara/mgsm", - hf_subset=standardize_tag(language.value), - evaluation_splits=("test",), - few_shots_split="train", - generation_size=25, - metrics=[ - MultilingualQuasiExactMatchMetric(language, "full"), - ], - stop_sequence=("\n",), - ) - for language in [ - Language.ENGLISH, - Language.SPANISH, - Language.FRENCH, - Language.GERMAN, - Language.RUSSIAN, - Language.CHINESE, - Language.JAPANESE, - Language.THAI, - Language.SWAHILI, - Language.BENGALI, - Language.TELUGU, - ] -] -# African MGSM: MGSM for African Languages 
-# From https://arxiv.org/abs/2406.03368. Human translated MGSM. -afri_mgsm_tasks = [ - LightevalTaskConfig( - name=f"afri_mgsm_{language.value}", - prompt_function=get_qa_prompt_function( - language, - lambda line: { - "question": line["question"], - # The cot is available but we have no use: - # line["answer"] - "choices": [str(line["answer_number"])], - }, - ), - suite=("lighteval",), - hf_repo="masakhane/afrimgsm", - hf_subset=language.value, - evaluation_splits=("test",), - few_shots_split="train", - generation_size=25, - metrics=[ - MultilingualQuasiExactMatchMetric(language, "full"), - ], - stop_sequence=("\n",), - ) - for language in [ - Language.AMHARIC, - # Language.EWE, - Language.FRENCH, - # Language.HAUSA, - # Language.IGBO, - # Language.KINYARWANDA, - # Language.LINGALA, - # Language.LUGANDA, - # Language.OROMO, - # Language.SHONA, - # Language.SOTHO, - Language.SWAHILI, - # Language.TWI, - # Language.WOLOF, - # Language.XHOSA, - Language.YORUBA, - # Language.ZULU, - ] -] -TASKS_TABLE.extend( - [ - *cmath_tasks, - *mathlogicqa_rus_tasks, - *mgsm_tasks, - *afri_mgsm_tasks, - ] -) - -# ------------------------------- Misc ------------------------------- # - -# AGIEval: Chinese AGI Evaluation suite (Excluding the english subsets) -# Uses PMI normalization -# Paper: https://arxiv.org/abs/2304.06364 -CHINESE_AGIEVAL_SUBSET = [ - "gaokao-biology", - "gaokao-chinese", - "gaokao-chemistry", - "gaokao-geography", - "gaokao-history", - "gaokao-mathqa", - "gaokao-physics", - "logiqa-zh", - "jec-qa-kd", - "jec-qa-ca", -] - -agieval_tasks_zh = [ - LightevalTaskConfig( - name=f"agieval_{Language.CHINESE.value}_{formulation.name.lower()}:{subset}", - prompt_function=get_mcq_prompt_function( - Language.CHINESE, - partial( - agieval_adapter, - Language.CHINESE, - formulation, - ), - formulation=formulation, - ), - suite=("lighteval",), - hf_repo=f"hails/agieval-{subset}", - hf_subset="default", - evaluation_splits=("test",), - hf_avail_splits=["test"], - few_shots_split=None, - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - LogLikelihoodAccMetric(normalization=LogProbPMINorm()), - ], - ), - ) - for subset in CHINESE_AGIEVAL_SUBSET - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] -# C-Eval: Chinese Evaluation suite -# Similar to MMLu but with different categories -# Paper: https://arxiv.org/abs/2305.08322 -CEVAL_SUBSET = [ - "computer_network", - "operating_system", - "computer_architecture", - "college_programming", - "college_physics", - "college_chemistry", - "advanced_mathematics", - "probability_and_statistics", - "discrete_mathematics", - "electrical_engineer", - "metrology_engineer", - "high_school_mathematics", - "high_school_physics", - "high_school_chemistry", - "high_school_biology", - "middle_school_mathematics", - "middle_school_biology", - "middle_school_physics", - "middle_school_chemistry", - "veterinary_medicine", - "college_economics", - "business_administration", - "marxism", - "mao_zedong_thought", - "education_science", - "teacher_qualification", - "high_school_politics", - "high_school_geography", - "middle_school_politics", - "middle_school_geography", - "modern_chinese_history", - "ideological_and_moral_cultivation", - "logic", - "law", - "chinese_language_and_literature", - "art_studies", - "professional_tour_guide", - "legal_professional", - "high_school_chinese", - "high_school_history", - 
"middle_school_history", - "civil_servant", - "sports_science", - "plant_protection", - "basic_medicine", - "clinical_medicine", - "urban_and_rural_planner", - "accountant", - "fire_engineer", - "environmental_impact_assessment_engineer", - "tax_accountant", - "physician", -] - -ceval_tasks = [ - LightevalTaskConfig( - name=f"ceval_{Language.CHINESE.value}_{formulation.name.lower()}:{subset}", - prompt_function=get_mcq_prompt_function( - Language.CHINESE, - partial( - ceval_adapter, - Language.CHINESE, - formulation, - ), - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="ceval/ceval-exam", - hf_subset=subset, - evaluation_splits=("val",), - few_shots_split="dev", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for subset in CEVAL_SUBSET - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - - -# OAB Exams: A collection of questions from the Brazilian Bar Association exam -# The exam is required for anyone who wants to practice law in Brazil -# Dataset: https://huggingface.co/datasets/eduagarcia/oab_exams -oab_exams_tasks = [ - LightevalTaskConfig( - name=f"oab_exams_{Language.PORTUGUESE.value}_{formulation.name.lower()}", - prompt_function=get_mcq_prompt_function( - Language.PORTUGUESE, - lambda line: { - "question": line["question"], - "choices": line["choices"]["text"], - "gold_idx": LETTER_INDICES.index(line["answerKey"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="eduagarcia/oab_exams", - hf_subset="default", - evaluation_splits=("train",), - hf_avail_splits=["train"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -# ENEM (Exame Nacional do Ensino Médio) is a standardized Brazilian national secondary -# education examination. The exam is used both as a university admission test and as a -# high school evaluation test. -# Dataset: https://huggingface.co/datasets/maritaca-ai/enem -enem_tasks = [ - LightevalTaskConfig( - name=f"enem_{Language.PORTUGUESE.value}_{formulation.name.lower()}:{year}", - prompt_function=get_mcq_prompt_function( - Language.PORTUGUESE, - partial( - enem_adapter, - Language.PORTUGUESE, - ), - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="maritaca-ai/enem", - hf_subset=year, - evaluation_splits=("train",), - hf_avail_splits=["train"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for year in ["2022", "2023", "2024"] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - - -# WorldTree is a dataset for multi-hop inference in science question answering. -# It provides explanations for elementary science questions by combining facts from a semi-structured knowledge base. -# This Russian version is part of the MERA (Multilingual Evaluation of Reasoning Abilities) benchmark. 
-# MERA: https://github.com/ai-forever/MERA -worldtree_rus_tasks = [ - LightevalTaskConfig( - name=f"mera_worldtree_{Language.RUSSIAN.value}_{formulation.name.lower()}", - prompt_function=get_mcq_prompt_function( - Language.RUSSIAN, - lambda line: { - "question": line["inputs"]["question"], - "choices": [line["inputs"][f"option_{i.lower()}"] for i in LETTER_INDICES[:4]], - "gold_idx": LETTER_INDICES.index(line["outputs"]), - }, - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="ai-forever/MERA", - hf_subset="ruworldtree", - evaluation_splits=("train",), - hf_avail_splits=["train"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -TASKS_TABLE.extend( - [ - *agieval_tasks_zh, - *worldtree_rus_tasks, - *ceval_tasks, - *oab_exams_tasks, - *enem_tasks, - ] -) - - -# ------------------------------- Continuation Tasks ------------------------------- # -xcodah_tasks = [ - LightevalTaskConfig( - name=f"xcodah_{language.value}_{formulation.name.lower()}", - prompt_function=get_mcq_prompt_function(language, partial(xcodah_adapter, language), formulation=formulation), - suite=("lighteval",), - hf_repo="INK-USC/xcsr", - hf_subset=f"X-CODAH-{standardize_tag(language.value) if language != Language.JAPANESE else 'jap'}", - evaluation_splits=("validation",), - hf_avail_splits=["validation"], - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for language in [ - Language.ARABIC, - Language.GERMAN, - Language.ENGLISH, - Language.SPANISH, - Language.FRENCH, - Language.HINDI, - Language.ITALIAN, - Language.JAPANESE, - Language.DUTCH, - Language.POLISH, - Language.PORTUGUESE, - Language.RUSSIAN, - Language.SWAHILI, - Language.URDU, - Language.VIETNAMESE, - Language.CHINESE, - ] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -xstory_tasks = [ - LightevalTaskConfig( - name=f"xstory_cloze_{lang.value}_{formulation.name.lower()}", - prompt_function=get_continuation_prompt_function( - lang, - partial( - lambda lang, line: { - "context": TRANSLATION_LITERALS[lang].sentence_space.join( - [ - line["input_sentence_1"], - line["input_sentence_2"], - line["input_sentence_3"], - line["input_sentence_4"], - ] - ), - "continuations": [line["sentence_quiz1"], line["sentence_quiz2"]], - "gold_idx": int(line["answer_right_ending"]) - 1, # type: ignore - }, - lang, - ), - formulation=formulation, - ), - suite=("lighteval",), - hf_repo="juletxara/xstory_cloze", - hf_subset=standardize_tag(lang.value), - evaluation_splits=["eval"], - few_shots_split="train", - metrics=get_metrics_for_formulation( - formulation, - [ - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ), - ) - for lang in [ - Language.RUSSIAN, - Language.CHINESE, - Language.SPANISH, - Language.ARABIC, - Language.HINDI, - Language.INDONESIAN, - Language.TELUGU, - Language.SWAHILI, - Language.BASQUE, - Language.BURMESE, - ] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -TASKS_TABLE.extend( - [ - *xcodah_tasks, - *xstory_tasks, - ] -) - -# ------------------------------- Winogrande Tasks 
------------------------------- # - -xwinograd_tasks = [ - LightevalTaskConfig( - name=f"xwinograd_{language.value}_{formulation.name.lower()}", - suite=("lighteval",), - prompt_function=get_continuation_prompt_function( - language, partial(winogrand_adapter, language), formulation=formulation - ), - hf_repo="Muennighoff/xwinograd", - hf_subset=standardize_tag(language.value) if language != Language.JAPANESE else "jp", - evaluation_splits=("test",), - hf_avail_splits=["test"], - metrics=[ - LogLikelihoodAccMetric(normalization=None), - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ) - for language in [ - Language.ENGLISH, - Language.FRENCH, - Language.JAPANESE, - Language.PORTUGUESE, - Language.RUSSIAN, - Language.CHINESE, - ] - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -winograd_turkish_task = [ - LightevalTaskConfig( - name=f"community_xwinograd_{Language.TURKISH.value}_{formulation.name.lower()}", - suite=("lighteval",), - prompt_function=get_continuation_prompt_function( - Language.TURKISH, partial(winogrand_adapter, Language.TURKISH), formulation=formulation - ), - hf_repo="malhajar/winogrande-tr-v0.2", - hf_subset="default", - evaluation_splits=("validation",), - few_shots_split="train", - metrics=[ - LogLikelihoodAccMetric(normalization=None), - LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), - LogLikelihoodAccMetric(normalization=LogProbCharNorm()), - ], - ) - for formulation in [ - MCFFormulation(), - CFFormulation(), - HybridFormulation(), - ] -] - -TASKS_TABLE.extend( - [ - *xwinograd_tasks, - *winograd_turkish_task, - ] -) - -# ------------------------------- General QA tasks ------------------------------- # - -MKQA_TASK_TO_ID = { - "entity": 0, - "long_answer": 1, - # "unanswerable": 2, - "date": 3, - "number": 4, - "number_with_unit": 5, - "short_phrase": 6, - "binary": 7, -} - -mkqa_tasks = [ - LightevalTaskConfig( - name=f"mkqa_{language.value}:{subset}", - prompt_function=get_qa_prompt_function(language, partial(get_mkqa_adapter, language)), - suite=("lighteval",), - hf_repo="apple/mkqa", - hf_subset="mkqa", - hf_revision="325131889721ae0ed885b76ecb8011369d75abad", - hf_filter=partial( - lambda language, subset, line: line["answers"][ - "zh_cn" if language == Language.CHINESE else standardize_tag(language.value) - ][0]["type"] - == MKQA_TASK_TO_ID[subset], - language, - subset, - ), - evaluation_splits=("train",), - hf_avail_splits=["train"], - stop_sequence=("\n",), - metrics=[ - MultilingualQuasiExactMatchMetric(language, "prefix"), - MultilingualQuasiF1ScoreMetric(language), - ] - if subset in ["entity", "long_answer", "short_phrase"] - else [ - MultilingualQuasiExactMatchMetric(language, "full"), - ], - ) - for subset in MKQA_TASK_TO_ID.keys() - for language in [ - Language.ARABIC, - Language.DANISH, - Language.GERMAN, - Language.ENGLISH, - Language.SPANISH, - Language.FINNISH, - Language.FRENCH, - Language.HEBREW, - Language.HUNGARIAN, - Language.ITALIAN, - Language.JAPANESE, - Language.KOREAN, - Language.KHMER, - Language.MALAY, - Language.DUTCH, - Language.NORWEGIAN, - Language.POLISH, - Language.PORTUGUESE, - Language.RUSSIAN, - Language.SWEDISH, - Language.THAI, - Language.TURKISH, - Language.VIETNAMESE, - Language.CHINESE, # Simplified - # Language.CHINESE_HONG_KONG, - # Language.CHINESE_TRADITIONAL, - ] -] - -mintaka_tasks = [ - LightevalTaskConfig( - name=f"mintaka_{lang.value}", - prompt_function=get_qa_prompt_function( - 
lang, - lambda line: { - "question": line["question"], - "choices": [line["answerText"]], - }, - ), - suite=("lighteval",), - hf_repo="AmazonScience/mintaka", - hf_subset=standardize_tag(lang.value), - evaluation_splits=("test",), - few_shots_split="train", - generation_size=400, - stop_sequence=("\n",), - metrics=[ - MultilingualQuasiExactMatchMetric(lang, "prefix"), - MultilingualQuasiF1ScoreMetric(lang), - ], - ) - for lang in [ - Language.ARABIC, - Language.GERMAN, - Language.ENGLISH, - Language.SPANISH, - Language.FRENCH, - Language.HINDI, - Language.ITALIAN, - Language.JAPANESE, - Language.PORTUGUESE, - ] -] - -french_triviqa_tasks = [ - LightevalTaskConfig( - name=f"community_triviaqa_{Language.FRENCH.value}", - prompt_function=get_qa_prompt_function( - Language.FRENCH, - lambda line: { - "question": line["Question"], - "choices": [line["Answer"]], - }, - ), - suite=("lighteval",), - hf_repo="manu/french-trivia", - hf_subset="default", - evaluation_splits=("train",), - hf_avail_splits=["train"], - generation_size=400, - stop_sequence=("\n",), - metrics=[ - MultilingualQuasiExactMatchMetric(Language.FRENCH, "prefix"), - MultilingualQuasiF1ScoreMetric(Language.FRENCH), - ], - ) -] - - -chegeka_tasks = [ - LightevalTaskConfig( - name=f"chegeka_{Language.RUSSIAN.value}", - prompt_function=get_qa_prompt_function( - Language.RUSSIAN, - lambda line: { - "question": line["inputs"]["text"], - "choices": [line["outputs"]], - }, - ), - suite=("lighteval",), - hf_repo="ai-forever/MERA", - hf_subset="chegeka", - evaluation_splits=("train",), - hf_avail_splits=["train"], - generation_size=400, - stop_sequence=("\n",), - metrics=[ - MultilingualQuasiExactMatchMetric(Language.RUSSIAN, "prefix"), - MultilingualQuasiF1ScoreMetric(Language.RUSSIAN), - ], - ) -] - -TASKS_TABLE.extend( - [ - *mkqa_tasks, - *mlqa_tasks, - *chegeka_tasks, - *mintaka_tasks, - *french_triviqa_tasks, - ] -) - - -# ------------------------------- BoolQ Tasks (yes/no) ------------------------------- # -ACVA_SUBSET = [ - "Algeria", - "Ancient_Egypt", - "Arab_Empire", - "Arabic_Architecture", - "Arabic_Art", - "Arabic_Astronomy", - "Arabic_Calligraphy", - "Arabic_Ceremony", - "Arabic_Clothing", - "Arabic_Culture", - "Arabic_Food", - "Arabic_Funeral", - "Arabic_Geography", - "Arabic_History", - "Arabic_Language_Origin", - "Arabic_Literature", - "Arabic_Math", - "Arabic_Medicine", - "Arabic_Music", - "Arabic_Ornament", - "Arabic_Philosophy", - "Arabic_Physics_and_Chemistry", - "Arabic_Wedding", - "Bahrain", - "Comoros", - "Egypt_modern", - "InfluenceFromAncientEgypt", - "InfluenceFromByzantium", - "InfluenceFromChina", - "InfluenceFromGreece", - "InfluenceFromIslam", - "InfluenceFromPersia", - "InfluenceFromRome", - "Iraq", - "Islam_Education", - "Islam_branches_and_schools", - "Islamic_law_system", - "Jordan", - "Kuwait", - "Lebanon", - "Libya", - "Mauritania", - "Mesopotamia_civilization", - "Morocco", - "Oman", - "Palestine", - "Qatar", - "Saudi_Arabia", - "Somalia", - "Sudan", - "Syria", - "Tunisia", - "United_Arab_Emirates", - "Yemen", - "communication", - "computer_and_phone", - "daily_life", - "entertainment", -] - -acva_tasks = [ - LightevalTaskConfig( - name=f"acva_{Language.ARABIC.value}:{subset}", - prompt_function=get_boolq_prompt_function( - Language.ARABIC, - lambda line: { - "question": line["question"], - "answer": line["answer"] == "صح", - }, - formulation=CFFormulation(), - ), - suite=("lighteval",), - hf_repo="OALL/ACVA", - hf_subset=subset, - evaluation_splits=("test",), - few_shots_split="validation", - 
metrics=[MultilingualQuasiExactMatchMetric(Language.ARABIC, "full"), LogLikelihoodAccMetric()], - generation_size=5, - stop_sequence=("\n",), - ) - for subset in ACVA_SUBSET -] - - -french_boolq_tasks = [ - LightevalTaskConfig( - name=f"community_boolq_{Language.FRENCH.value}", - prompt_function=get_boolq_prompt_function( - Language.FRENCH, - lambda line: { - "question": line["question"], - "answer": line["label"] == 1, - "context": line["passage"], - }, - formulation=CFFormulation(), - ), - suite=("lighteval",), - hf_repo="manu/french_boolq", - hf_subset="default", - evaluation_splits=("test",), - few_shots_split="valid", - generation_size=5, - stop_sequence=["\n"], - metrics=[MultilingualQuasiExactMatchMetric(Language.FRENCH, "full"), LogLikelihoodAccMetric()], - ) -] - -hindi_boolq_tasks = [ - LightevalTaskConfig( - name=f"community_boolq_{language.value}", - prompt_function=get_boolq_prompt_function( - language, - lambda line: { - "question": line["question"], - "answer": line["answer"], - "context": line["passage"], - }, - formulation=CFFormulation(), - ), - suite=("lighteval",), - hf_repo="ai4bharat/boolq-hi", - hf_subset=standardize_tag(language.value), - evaluation_splits=("validation",), - few_shots_split="train", - generation_size=5, - stop_sequence=["\n"], - metrics=[MultilingualQuasiExactMatchMetric(language, "full"), LogLikelihoodAccMetric()], - ) - for language in [ - Language.HINDI, - Language.GUJARATI, - Language.MALAYALAM, - Language.MARATHI, - Language.TAMIL, - ] -] - - -TASKS_TABLE.extend( - [ - *acva_tasks, - *french_boolq_tasks, - *hindi_boolq_tasks, - ] -) - -# ------------------------------- Translation Tasks ------------------------------- # -flores_200_languages = [ - # "ace_Arab", - "ace_Latn", - "acm_Arab", - "acq_Arab", - "aeb_Arab", - "afr_Latn", - "ajp_Arab", - "aka_Latn", - "amh_Ethi", - "apc_Arab", - "arb_Arab", - # "arb_Latn", - "ars_Arab", - "ary_Arab", - "arz_Arab", - "asm_Beng", - "ast_Latn", - "awa_Deva", - "ayr_Latn", - "azb_Arab", - "azj_Latn", - "bak_Cyrl", - "bam_Latn", - "ban_Latn", - "bel_Cyrl", - "bem_Latn", - "ben_Beng", - "bho_Deva", - # "bjn_Arab", - "bjn_Latn", - "bod_Tibt", - "bos_Latn", - "bug_Latn", - "bul_Cyrl", - "cat_Latn", - "ceb_Latn", - "ces_Latn", - "cjk_Latn", - "ckb_Arab", - "crh_Latn", - "cym_Latn", - "dan_Latn", - "deu_Latn", - "dik_Latn", - "dyu_Latn", - "dzo_Tibt", - "ell_Grek", - "eng_Latn", - "epo_Latn", - "est_Latn", - "eus_Latn", - "ewe_Latn", - "fao_Latn", - "fij_Latn", - "fin_Latn", - "fon_Latn", - "fra_Latn", - "fur_Latn", - "fuv_Latn", - "gla_Latn", - "gle_Latn", - "glg_Latn", - "grn_Latn", - "guj_Gujr", - "hat_Latn", - "hau_Latn", - "heb_Hebr", - "hin_Deva", - "hne_Deva", - "hrv_Latn", - "hun_Latn", - "hye_Armn", - "ibo_Latn", - "ilo_Latn", - "ind_Latn", - "isl_Latn", - "ita_Latn", - "jav_Latn", - "jpn_Jpan", - "kab_Latn", - "kac_Latn", - "kam_Latn", - "kan_Knda", - # "kas_Arab", - "kas_Deva", - "kat_Geor", - # "knc_Arab", - "knc_Latn", - "kaz_Cyrl", - "kbp_Latn", - "kea_Latn", - "khm_Khmr", - "kik_Latn", - "kin_Latn", - "kir_Cyrl", - "kmb_Latn", - "kmr_Latn", - "kon_Latn", - "kor_Hang", - "lao_Laoo", - "lij_Latn", - "lim_Latn", - "lin_Latn", - "lit_Latn", - "lmo_Latn", - "ltg_Latn", - "ltz_Latn", - "lua_Latn", - "lug_Latn", - "luo_Latn", - "lus_Latn", - "lvs_Latn", - "mag_Deva", - "mai_Deva", - "mal_Mlym", - "mar_Deva", - # "min_Arab", - "min_Latn", - "mkd_Cyrl", - "plt_Latn", - "mlt_Latn", - "mni_Beng", - "khk_Cyrl", - "mos_Latn", - "mri_Latn", - "mya_Mymr", - "nld_Latn", - "nno_Latn", - "nob_Latn", - "npi_Deva", - 
"nso_Latn", - "nus_Latn", - "nya_Latn", - "oci_Latn", - "gaz_Latn", - "ory_Orya", - "pag_Latn", - "pan_Guru", - "pap_Latn", - "pes_Arab", - "pol_Latn", - "por_Latn", - "prs_Arab", - "pbt_Arab", - "quy_Latn", - "ron_Latn", - "run_Latn", - "rus_Cyrl", - "sag_Latn", - "san_Deva", - "sat_Olck", - "scn_Latn", - "shn_Mymr", - "sin_Sinh", - "slk_Latn", - "slv_Latn", - "smo_Latn", - "sna_Latn", - "snd_Arab", - "som_Latn", - "sot_Latn", - "spa_Latn", - "als_Latn", - "srd_Latn", - "srp_Cyrl", - "ssw_Latn", - "sun_Latn", - "swe_Latn", - "swh_Latn", - "szl_Latn", - "tam_Taml", - "tat_Cyrl", - "tel_Telu", - "tgk_Cyrl", - "tgl_Latn", - "tha_Thai", - "tir_Ethi", - "taq_Latn", - "taq_Tfng", - "tpi_Latn", - "tsn_Latn", - "tso_Latn", - "tuk_Latn", - "tum_Latn", - "tur_Latn", - "twi_Latn", - "tzm_Tfng", - "uig_Arab", - "ukr_Cyrl", - "umb_Latn", - "urd_Arab", - "uzn_Latn", - "vec_Latn", - "vie_Latn", - "war_Latn", - "wol_Latn", - "xho_Latn", - "ydd_Hebr", - "yor_Latn", - "yue_Hant", - "zho_Hans", - # "zho_Hant", - "zsm_Latn", - "zul_Latn", -] - - -def flores_adapter(lang1, lang2): - return lambda line: { - "source_text": line[f"sentence_{lang1}"], - "target_text": line[f"sentence_{lang2}"], - } - - -flores200_tasks = [ - LightevalTaskConfig( - name=f"flores200:{lang1}-{lang2}", - prompt_function=get_translation_prompt_function( - source_language=Language(manage_duplicate_language_codes(lang1.split("_")[0])), - target_language=Language(manage_duplicate_language_codes(lang2.split("_")[0])), - adapter=flores_adapter(lang1, lang2), - formulation=CFFormulation(), - ), - suite=("lighteval",), - hf_repo="facebook/flores", - hf_subset=f"{lang1}-{lang2}", - hf_avail_splits=["dev", "devtest"], - evaluation_splits=["devtest"], - few_shots_split="dev", - few_shots_select=None, - generation_size=300, - metrics=[Metrics.chrf_plus, Metrics.bleu, Metrics.bleu_1, Metrics.bleu_4], - stop_sequence=["\n"], - version=0, - ) - for (lang1, lang2) in permutations(flores_200_languages, 2) -] - -TASKS_TABLE.extend( - [ - *flores200_tasks, - ] -) diff --git a/src/lighteval/tasks/multilingual/tasks/acva.py b/src/lighteval/tasks/multilingual/tasks/acva.py new file mode 100644 index 000000000..14f371d32 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/acva.py @@ -0,0 +1,115 @@ +""" +name: +Acva + +dataset: +OALL/ACVA + +abstract: +Acva multilingual benchmark. 
+ +languages: +arabic + +tags: +knowledge, multilingual, multiple-choice + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, + MultilingualQuasiExactMatchMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.boolq import get_boolq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, +) +from lighteval.utils.language import Language + + +ACVA_SUBSET = [ + "Algeria", + "Ancient_Egypt", + "Arab_Empire", + "Arabic_Architecture", + "Arabic_Art", + "Arabic_Astronomy", + "Arabic_Calligraphy", + "Arabic_Ceremony", + "Arabic_Clothing", + "Arabic_Culture", + "Arabic_Food", + "Arabic_Funeral", + "Arabic_Geography", + "Arabic_History", + "Arabic_Language_Origin", + "Arabic_Literature", + "Arabic_Math", + "Arabic_Medicine", + "Arabic_Music", + "Arabic_Ornament", + "Arabic_Philosophy", + "Arabic_Physics_and_Chemistry", + "Arabic_Wedding", + "Bahrain", + "Comoros", + "Egypt_modern", + "InfluenceFromAncientEgypt", + "InfluenceFromByzantium", + "InfluenceFromChina", + "InfluenceFromGreece", + "InfluenceFromIslam", + "InfluenceFromPersia", + "InfluenceFromRome", + "Iraq", + "Islam_Education", + "Islam_branches_and_schools", + "Islamic_law_system", + "Jordan", + "Kuwait", + "Lebanon", + "Libya", + "Mauritania", + "Mesopotamia_civilization", + "Morocco", + "Oman", + "Palestine", + "Qatar", + "Saudi_Arabia", + "Somalia", + "Sudan", + "Syria", + "Tunisia", + "United_Arab_Emirates", + "Yemen", + "communication", + "computer_and_phone", + "daily_life", + "entertainment", +] + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"acva_{Language.ARABIC.value}:{subset}", + prompt_function=get_boolq_prompt_function( + Language.ARABIC, + lambda line: { + "question": line["question"], + "answer": line["answer"] == "صح", + }, + formulation=CFFormulation(), + ), + suite=("lighteval",), + hf_repo="OALL/ACVA", + hf_subset=subset, + evaluation_splits=("test",), + few_shots_split="validation", + metrics=[MultilingualQuasiExactMatchMetric(Language.ARABIC, "full"), LogLikelihoodAccMetric()], + generation_size=5, + stop_sequence=("\n",), + ) + for subset in ACVA_SUBSET +] diff --git a/src/lighteval/tasks/multilingual/tasks/afri_mgsm.py b/src/lighteval/tasks/multilingual/tasks/afri_mgsm.py new file mode 100644 index 000000000..1be96436e --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/afri_mgsm.py @@ -0,0 +1,72 @@ +""" +name: +Afri Mgsm + +dataset: +masakhane/afrimgsm + +abstract: +African MGSM: MGSM for African Languages + +languages: +amharic, ewe, french, hausa, igbo, kinyarwanda, lingala, luganda, oromo, shona, +sotho, swahili, twi, wolof, xhosa, yoruba, zulu + +tags: +math, multilingual, reasoning + +paper: +https://arxiv.org/abs/2406.03368. 
+""" + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"afri_mgsm_{language.value}", + prompt_function=get_qa_prompt_function( + language, + lambda line: { + "question": line["question"], + # The cot is available but we have no use: + # line["answer"] + "choices": [str(line["answer_number"])], + }, + ), + suite=("lighteval",), + hf_repo="masakhane/afrimgsm", + hf_subset=language.value, + evaluation_splits=("test",), + few_shots_split="train", + generation_size=25, + metrics=[ + MultilingualQuasiExactMatchMetric(language, "full"), + ], + stop_sequence=("\n",), + ) + for language in [ + Language.AMHARIC, + # Language.EWE, + Language.FRENCH, + # Language.HAUSA, + # Language.IGBO, + # Language.KINYARWANDA, + # Language.LINGALA, + # Language.LUGANDA, + # Language.OROMO, + # Language.SHONA, + # Language.SOTHO, + Language.SWAHILI, + # Language.TWI, + # Language.WOLOF, + # Language.XHOSA, + Language.YORUBA, + # Language.ZULU, + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/afri_mmlu.py b/src/lighteval/tasks/multilingual/tasks/afri_mmlu.py new file mode 100644 index 000000000..e4d21f350 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/afri_mmlu.py @@ -0,0 +1,104 @@ +""" +name: +Afri Mmlu + +dataset: +masakhane/afrimmlu + +abstract: +African MMLU: African Massive Multitask Language Understanding + +languages: +amharic, ewe, french, hausa, igbo, kinyarwanda, lingala, luganda, oromo, shona, +sotho, swahili, twi, wolof, xhosa, yoruba, zulu + +tags: +knowledge, multilingual, multiple-choice + +paper: +https://arxiv.org/abs/2406.03368. 
+""" + +from functools import partial + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +AFRI_MMLU_SUBSETS = [ + "elementary_mathematics", + "high_school_mathematics", + "high_school_geography", + "high_school_microeconomics", + "international_law", + "global_facts", +] + + +afri_mmlu_tasks = [ + LightevalTaskConfig( + name=f"afri_mmlu_{language.value}_{formulation.name.lower()}:{subset}", + prompt_function=get_mcq_prompt_function( + language, + lambda line: { + "question": line["question"], + "choices": line["choices"], + "gold_idx": LETTER_INDICES.index(line["answer"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="masakhane/afrimmlu", + # Temporary until the pr is merged + hf_revision="refs/pr/1", + hf_subset=language.value, + hf_filter=partial(lambda subset, line: line["subject"] == subset, subset), + evaluation_splits=("test",), + few_shots_split="dev", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), + ], + ), + ) + for subset in AFRI_MMLU_SUBSETS + for language in [ + Language.AMHARIC, + # Language.EWE, + Language.FRENCH, + # Language.HAUSA, + # Language.IGBO, + # Language.KINYARWANDA, + # Language.LINGALA, + # Language.LUGANDA, + # Language.OROMO, + # Language.SHONA, + # Language.SOTHO, + Language.SWAHILI, + # Language.TWI, + # Language.WOLOF, + # Language.XHOSA, + Language.YORUBA, + # Language.ZULU, + ] + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/afri_xnli.py b/src/lighteval/tasks/multilingual/tasks/afri_xnli.py new file mode 100644 index 000000000..6bf3e315f --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/afri_xnli.py @@ -0,0 +1,86 @@ +""" +name: +Afri Xnli + +dataset: +masakhane/afrixnli + +abstract: +African XNLI: African XNLI + +languages: +amharic, ewe, french, hausa, igbo, kinyarwanda, lingala, luganda, oromo, shona, +sotho, swahili, twi, wolof, xhosa, yoruba, zulu + +tags: +classification, multilingual, nli + +paper: +https://arxiv.org/abs/2406.03368. 
+""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.nli import get_nli_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"afri_xnli_{language.value}_{formulation.name.lower()}", + suite=("lighteval",), + prompt_function=get_nli_prompt_function( + language=language, + adapter=lambda line: { + "premise": line["premise"], + "hypothesis": line["hypothesis"], + # Since we ignore the neutral label + "gold_idx": {0: 0, 2: 1}[line["label"]], + }, + relations=["entailment", "contradiction"], + formulation=formulation, + ), + hf_repo="masakhane/afrixnli", + hf_subset=language.value, + hf_filter=lambda x: int(x["label"]) in [0, 2], + evaluation_splits=("test",), + few_shots_split="validation", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for language in [ + Language.AMHARIC, + # Language.EWE, + Language.FRENCH, + # Language.HAUSA, + # Language.IGBO, + # Language.KINYARWANDA, + # Language.LINGALA, + # Language.LUGANDA, + # Language.OROMO, + # Language.SHONA, + # Language.SOTHO, + Language.SWAHILI, + # Language.TWI, + # Language.WOLOF, + # Language.XHOSA, + Language.YORUBA, + # Language.ZULU, + ] + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] diff --git a/community_tasks/arabic_evals.py b/src/lighteval/tasks/multilingual/tasks/arabic.py similarity index 96% rename from community_tasks/arabic_evals.py rename to src/lighteval/tasks/multilingual/tasks/arabic.py index 0e917d25d..c85d2ecbd 100644 --- a/community_tasks/arabic_evals.py +++ b/src/lighteval/tasks/multilingual/tasks/arabic.py @@ -1,30 +1,20 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team +""" +name: +Arabic Evals -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: +dataset: +MBZUAI/ArabicMMLU, MBZUAI/human_translated_arabic_mmlu, OALL/Arabic_MMLU, OALL/ACVA, asas-ai/AraTrust-categorized -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. +abstract: +Collection of benchmarks for Arabic language. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
+languages: +arabic -# ruff: noqa: F405, F403, F401 -""" -Custom evaluation tasks for lighteval +tags: +knowledge, multilingual, multiple-choice -This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. +paper: """ import random diff --git a/src/lighteval/tasks/multilingual/tasks/arabic_arc.py b/src/lighteval/tasks/multilingual/tasks/arabic_arc.py new file mode 100644 index 000000000..29d9ee9d4 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/arabic_arc.py @@ -0,0 +1,62 @@ +""" +name: +Arabic Arc + +dataset: +OALL/AlGhafa-Arabic-LLM-Benchmark-Translated + +abstract: +Arabic Arc multilingual benchmark. + +languages: +arabic + +tags: +multilingual, multiple-choice, reasoning + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.adapters import ( + alghafa_adapter, +) +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"alghafa_arc_{Language.ARABIC.value}_{formulation.name.lower()}:easy", + prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation), + suite=["lighteval"], + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", + hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff", + hf_subset="arc_easy_ar", + evaluation_splits=["test"], + few_shots_split="validation", + few_shots_select="sequential", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/arabic_mmlu.py b/src/lighteval/tasks/multilingual/tasks/arabic_mmlu.py new file mode 100644 index 000000000..d8031c7f6 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/arabic_mmlu.py @@ -0,0 +1,113 @@ +""" +name: +Arabic Mmlu + +dataset: +MBZUAI/ArabicMMLU + +abstract: +Arabic Mmlu multilingual benchmark. 
+ +languages: +arabic + +tags: +knowledge, multilingual, multiple-choice + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation, normalize_subset +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +ARABIC_MMLU_SUBSETS = [ + "Islamic Studies", + "Islamic Studies (Middle School)", + "Islamic Studies (Primary School)", + "Islamic Studies (High School)", + "Driving Test", + "Natural Science (Middle School)", + "Natural Science (Primary School)", + "History (Middle School)", + "History (Primary School)", + "History (High School)", + "General Knowledge", + "General Knowledge (Middle School)", + "General Knowledge (Primary School)", + "Law (Professional)", + "Physics (High School)", + "Social Science (Middle School)", + "Social Science (Primary School)", + "Management (University)", + "Arabic Language (Middle School)", + "Arabic Language (Primary School)", + "Arabic Language (High School)", + "Political Science (University)", + "Philosophy (High School)", + "Accounting (University)", + "Computer Science (Middle School)", + "Computer Science (Primary School)", + "Computer Science (High School)", + "Computer Science (University)", + "Geography (Middle School)", + "Geography (Primary School)", + "Geography (High School)", + "Math (Primary School)", + "Biology (High School)", + "Economics (Middle School)", + "Economics (High School)", + "Economics (University)", + "Arabic Language (General)", + "Arabic Language (Grammar)", + "Civics (Middle School)", + "Civics (High School)", +] + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"mmlu_{Language.ARABIC.value}_{formulation.name.lower()}:{normalize_subset(subset)}", + prompt_function=get_mcq_prompt_function( + Language.ARABIC, + lambda line: { + "context": line["Context"], + "question": line["Question"], + "choices": [str(o) for o in [line[f"Option {i}"] for i in range(1, 6)] if o], + "gold_idx": LETTER_INDICES.index(line["Answer Key"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="MBZUAI/ArabicMMLU", + hf_subset=subset, + evaluation_splits=("test",), + hf_avail_splits=["dev"], + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), + ], + ), + ) + for subset in ARABIC_MMLU_SUBSETS + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/arcd.py b/src/lighteval/tasks/multilingual/tasks/arcd.py new file mode 100644 index 000000000..d1404821b --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/arcd.py @@ -0,0 +1,57 @@ +""" +name: +Arcd + +dataset: +hsseinmz/arcd + +abstract: +ARCD: Arabic Reading Comprehension Dataset. 
+ +languages: +arabic + +tags: +multilingual, multiple-choice, qa, reasoning + +paper: +https://arxiv.org/pdf/1906.05394 +""" + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +# ARCD: Arabic Reading Comprehension Dataset. +# https://arxiv.org/pdf/1906.05394 + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"arcd_{Language.ARABIC.value}", + prompt_function=get_qa_prompt_function( + Language.ARABIC, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="hsseinmz/arcd", + hf_subset="plain_text", + evaluation_splits=("validation",), + few_shots_split="train", + metrics=( + MultilingualQuasiExactMatchMetric(Language.ARABIC, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.ARABIC), + ), + generation_size=400, + stop_sequence=("\n",), + ) +] diff --git a/src/lighteval/tasks/multilingual/tasks/belebele.py b/src/lighteval/tasks/multilingual/tasks/belebele.py new file mode 100644 index 000000000..2623e1868 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/belebele.py @@ -0,0 +1,192 @@ +""" +name: +Belebele + +dataset: +facebook/belebele + +abstract: +Belebele: A large-scale reading comprehension dataset covering 122 languages. + +languages: +arabic, armenian, bengali, cyrillic, devanagari, ethiopic, georgian, greek, +gujarati, gurmukhi, chinese (simplified), chinese (traditional), hangul, hebrew, +japanese, khmer, kannada, lao, latin, malayalam, myanmar, odia, sinhala, tamil, +telugu, thai, tibetan + +tags: +multilingual, multiple-choice, reading-comprehension + +paper: +https://arxiv.org/abs/2308.16884 +""" + +from langcodes import Language as LangCodeLanguage + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import iso_639_3_ind_to_iso_639_3_macro + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"belebele_{language}_{formulation.name.lower()}", + prompt_function=get_mcq_prompt_function( + iso_639_3_ind_to_iso_639_3_macro[LangCodeLanguage.get(language).to_alpha3()], + lambda line: { + "question": line["question"], + "context": line["flores_passage"], + "choices": [line[f"mc_answer{i}"] for i in range(1, 5)], + "gold_idx": int(line["correct_answer_num"]) - 1, + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="facebook/belebele", + hf_subset=language, + evaluation_splits=("test",), + hf_avail_splits=["test"], + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] + for language in [ + "acm_Arab", + "arz_Arab", + "ceb_Latn", + "fin_Latn", + "hin_Deva", + "ita_Latn", + "khm_Khmr", + 
"lvs_Latn", + "npi_Deva", + "pol_Latn", + "slv_Latn", + "swe_Latn", + # "tso_Latn", + # "xho_Latn", + "afr_Latn", + "asm_Beng", + "ces_Latn", + "fra_Latn", + "hin_Latn", + "jav_Latn", + # "kin_Latn", + "mal_Mlym", + "npi_Latn", + "por_Latn", + # "sna_Latn", + "swh_Latn", + "tur_Latn", + "yor_Latn", + "als_Latn", + "azj_Latn", + "ckb_Arab", + # "fuv_Latn", + "hrv_Latn", + "jpn_Jpan", + "kir_Cyrl", + "mar_Deva", + # "nso_Latn", + "snd_Arab", + "tam_Taml", + "ukr_Cyrl", + "zho_Hans", + "amh_Ethi", + # "bam_Latn", + "dan_Latn", + # "gaz_Latn", + "hun_Latn", + # "kac_Latn", + "kor_Hang", + "mkd_Cyrl", + # "nya_Latn", + "ron_Latn", + "som_Latn", + "tel_Telu", + "urd_Arab", + "zho_Hant", + "apc_Arab", + "ben_Beng", + "deu_Latn", + # "grn_Latn", + "hye_Armn", + "kan_Knda", + "lao_Laoo", + "mlt_Latn", + "ory_Orya", + "rus_Cyrl", + # "sot_Latn", + "tgk_Cyrl", + "urd_Latn", + "zsm_Latn", + "arb_Arab", + "ben_Latn", + "ell_Grek", + "guj_Gujr", + # "ibo_Latn", + "kat_Geor", + # "lin_Latn", + # "mri_Latn", + "pan_Guru", + # "shn_Mymr", + "spa_Latn", + "tgl_Latn", + "uzn_Latn", + # "zul_Latn", + "arb_Latn", + # "bod_Tibt", + "eng_Latn", + # "hat_Latn", + # "ilo_Latn", + "kaz_Cyrl", + "lit_Latn", + "mya_Mymr", + "pbt_Arab", + "sin_Latn", + "srp_Cyrl", + "tha_Thai", + "vie_Latn", + "ars_Arab", + "bul_Cyrl", + "est_Latn", + # "hau_Latn", + "ind_Latn", + # "kea_Latn", + # "lug_Latn", + "nld_Latn", + "pes_Arab", + "sin_Sinh", + # "ssw_Latn", + # "tir_Ethi", + "war_Latn", + "ary_Arab", + "cat_Latn", + "eus_Latn", + "heb_Hebr", + "isl_Latn", + # "khk_Cyrl", + # "luo_Latn", + "nob_Latn", + "plt_Latn", + "slk_Latn", + # "sun_Latn", + # "tsn_Latn", + # "wol_Latn", + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/c3.py b/src/lighteval/tasks/multilingual/tasks/c3.py new file mode 100644 index 000000000..4440b5b00 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/c3.py @@ -0,0 +1,73 @@ +""" +name: +C3 + +dataset: +clue/clue + +abstract: +C3: A Chinese Challenge Corpus for Cross-lingual and Cross-modal Tasks Reading +comprehension task part of clue. 
+ +languages: +chinese + +tags: +multilingual, multiple-choice, reasoning + +paper: +https://arxiv.org/abs/2004.05986 +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +# C3: A Chinese Challenge Corpus for Cross-lingual and Cross-modal Tasks +# Reading comprehension task part of clue +# Paper: https://arxiv.org/abs/2004.05986 + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"c3_{Language.CHINESE.value}_{formulation.name.lower()}", + suite=("lighteval",), + prompt_function=get_mcq_prompt_function( + Language.CHINESE, + lambda line: { + "question": line["question"], + "choices": line["choice"], + "gold_idx": line["choice"].index(line["answer"]), + "context": " ".join(line["context"]), + }, + formulation=formulation, + ), + hf_repo="clue/clue", + hf_subset="c3", + evaluation_splits=("validation",), + few_shots_split="train", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/ceval.py b/src/lighteval/tasks/multilingual/tasks/ceval.py new file mode 100644 index 000000000..c037a0df3 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/ceval.py @@ -0,0 +1,127 @@ +""" +name: +Ceval + +dataset: +ceval/ceval-exam + +abstract: +Ceval multilingual benchmark. 
+ +languages: +chinese + +tags: +knowledge, multilingual, multiple-choice + +paper: +""" + +from functools import partial + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.adapters import ( + ceval_adapter, +) +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +CEVAL_SUBSET = [ + "computer_network", + "operating_system", + "computer_architecture", + "college_programming", + "college_physics", + "college_chemistry", + "advanced_mathematics", + "probability_and_statistics", + "discrete_mathematics", + "electrical_engineer", + "metrology_engineer", + "high_school_mathematics", + "high_school_physics", + "high_school_chemistry", + "high_school_biology", + "middle_school_mathematics", + "middle_school_biology", + "middle_school_physics", + "middle_school_chemistry", + "veterinary_medicine", + "college_economics", + "business_administration", + "marxism", + "mao_zedong_thought", + "education_science", + "teacher_qualification", + "high_school_politics", + "high_school_geography", + "middle_school_politics", + "middle_school_geography", + "modern_chinese_history", + "ideological_and_moral_cultivation", + "logic", + "law", + "chinese_language_and_literature", + "art_studies", + "professional_tour_guide", + "legal_professional", + "high_school_chinese", + "high_school_history", + "middle_school_history", + "civil_servant", + "sports_science", + "plant_protection", + "basic_medicine", + "clinical_medicine", + "urban_and_rural_planner", + "accountant", + "fire_engineer", + "environmental_impact_assessment_engineer", + "tax_accountant", + "physician", +] + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"ceval_{Language.CHINESE.value}_{formulation.name.lower()}:{subset}", + prompt_function=get_mcq_prompt_function( + Language.CHINESE, + partial( + ceval_adapter, + Language.CHINESE, + formulation, + ), + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="ceval/ceval-exam", + hf_subset=subset, + evaluation_splits=("val",), + few_shots_split="dev", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for subset in CEVAL_SUBSET + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/chegeka.py b/src/lighteval/tasks/multilingual/tasks/chegeka.py new file mode 100644 index 000000000..3b2174ab9 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/chegeka.py @@ -0,0 +1,51 @@ +""" +name: +Chegeka + +dataset: +ai-forever/MERA + +abstract: +Chegeka multilingual benchmark. 
+ +languages: +russian + +tags: +knowledge, multilingual, qa + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"chegeka_{Language.RUSSIAN.value}", + prompt_function=get_qa_prompt_function( + Language.RUSSIAN, + lambda line: { + "question": line["inputs"]["text"], + "choices": [line["outputs"]], + }, + ), + suite=("lighteval",), + hf_repo="ai-forever/MERA", + hf_subset="chegeka", + evaluation_splits=("train",), + hf_avail_splits=["train"], + generation_size=400, + stop_sequence=("\n",), + metrics=[ + MultilingualQuasiExactMatchMetric(Language.RUSSIAN, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.RUSSIAN), + ], + ) +] diff --git a/src/lighteval/tasks/multilingual/tasks/chinese_squad.py b/src/lighteval/tasks/multilingual/tasks/chinese_squad.py new file mode 100644 index 000000000..521e0bc60 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/chinese_squad.py @@ -0,0 +1,53 @@ +""" +name: +Chinese Squad + +dataset: +lighteval/ChineseSquad + +abstract: +ChineseSquad is a reading comprehension dataset for Chinese. + +languages: +chinese + +tags: +multilingual, qa + +paper: +https://github.com/pluto-junzeng/ChineseSquad +""" + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"chinese_squad_{Language.CHINESE.value}", + prompt_function=get_qa_prompt_function( + Language.CHINESE, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="lighteval/ChineseSquad", + hf_subset="default", + evaluation_splits=("validation",), + few_shots_split="train", + metrics=( + MultilingualQuasiExactMatchMetric(Language.CHINESE, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.CHINESE), + ), + generation_size=400, + stop_sequence=("\n",), + ) +] diff --git a/src/lighteval/tasks/multilingual/tasks/cmath.py b/src/lighteval/tasks/multilingual/tasks/cmath.py new file mode 100644 index 000000000..f1e7d45ed --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/cmath.py @@ -0,0 +1,49 @@ +""" +name: +Cmath + +dataset: +weitianwen/cmath + +abstract: +Cmath multilingual benchmark. 
+ +languages: +chinese + +tags: +math, multilingual, reasoning + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"cmath_{Language.CHINESE.value}", + prompt_function=get_qa_prompt_function( + Language.CHINESE, + lambda line: { + "question": line["question"], + "choices": [line["golden"]], + }, + ), + suite=("lighteval",), + hf_repo="weitianwen/cmath", + hf_subset="default", + evaluation_splits=("test",), + few_shots_split="validation", + generation_size=25, + metrics=[ + MultilingualQuasiExactMatchMetric(Language.CHINESE, "full"), + ], + stop_sequence=("\n",), + ) +] diff --git a/src/lighteval/tasks/multilingual/tasks/cmmlu.py b/src/lighteval/tasks/multilingual/tasks/cmmlu.py new file mode 100644 index 000000000..8153d7bf6 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/cmmlu.py @@ -0,0 +1,139 @@ +""" +name: +Cmmlu + +dataset: +haonan-li/cmmlu + +abstract: +Cmmlu multilingual benchmark. + +languages: +chinese + +tags: +knowledge, multilingual, multiple-choice + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +CMMLU_SUBSETS = [ + "agronomy", + "anatomy", + "ancient_chinese", + "arts", + "astronomy", + "business_ethics", + "chinese_civil_service_exam", + "chinese_driving_rule", + "chinese_food_culture", + "chinese_foreign_policy", + "chinese_history", + "chinese_literature", + "chinese_teacher_qualification", + "clinical_knowledge", + "college_actuarial_science", + "college_education", + "college_engineering_hydrology", + "college_law", + "college_mathematics", + "college_medical_statistics", + "college_medicine", + "computer_science", + "computer_security", + "conceptual_physics", + "construction_project_management", + "economics", + "education", + "electrical_engineering", + "elementary_chinese", + "elementary_commonsense", + "elementary_information_and_technology", + "elementary_mathematics", + "ethnology", + "food_science", + "genetics", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_geography", + "high_school_mathematics", + "high_school_physics", + "high_school_politics", + "human_sexuality", + "international_law", + "journalism", + "jurisprudence", + "legal_and_moral_basis", + "logical", + "machine_learning", + "management", + "marketing", + "marxist_theory", + "modern_chinese", + "nutrition", + "philosophy", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_study", + "sociology", + "sports_science", + "traditional_chinese_medicine", + "virology", + "world_history", + "world_religions", +] + + +TASKS_TABLE = [ + LightevalTaskConfig( + 
name=f"cmmlu_{Language.CHINESE.value}_{formulation.name.lower()}:{subset}", + prompt_function=get_mcq_prompt_function( + Language.CHINESE, + lambda line: { + "question": line["Question"], + "choices": [line["A"], line["B"], line["C"], line["D"]], + "gold_idx": LETTER_INDICES.index(line["Answer"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="haonan-li/cmmlu", + hf_subset=subset, + evaluation_splits=("test",), + few_shots_split="dev", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), + ], + ), + ) + for subset in CMMLU_SUBSETS + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/cmnli.py b/src/lighteval/tasks/multilingual/tasks/cmnli.py new file mode 100644 index 000000000..c8667978c --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/cmnli.py @@ -0,0 +1,67 @@ +""" +name: +Cmnli + +dataset: +fenffef/cmnli + +abstract: +Native Chinese NLI dataset based on MNLI approach (Machine Translated) + +languages: +chinese + +tags: +classification, multilingual, nli + +paper: +https://arxiv.org/abs/2004.05986 +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.nli import get_nli_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"cmnli_{Language.CHINESE.value}_{formulation.name.lower()}", + prompt_function=get_nli_prompt_function( + language=Language.CHINESE, + adapter=lambda line: { + "premise": line["sentence1"], + "hypothesis": line["sentence2"], + # Since we ignore the neutral label + "gold_idx": {"entailment": 0, "contradiction": 1}[line["label"]], + }, + relations=["entailment", "contradiction"], + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="fenffef/cmnli", + hf_subset="default", + hf_filter=lambda x: x["label"] in ["entailment", "contradiction"], + # Only keep the positive and negative examples + evaluation_splits=("validation",), + few_shots_split="train", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] diff --git a/src/lighteval/tasks/multilingual/tasks/cmrc2018.py b/src/lighteval/tasks/multilingual/tasks/cmrc2018.py new file mode 100644 index 000000000..63174fd98 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/cmrc2018.py @@ -0,0 +1,53 @@ +""" +name: +Cmrc2018 + +dataset: +clue/clue + +abstract: +CMRC 2018: A span-extraction machine reading comprehension dataset for Chinese. 
+ +languages: +chinese + +tags: +multilingual, qa + +paper: +https://arxiv.org/abs/1810.07366 +""" + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"cmrc2018_{Language.CHINESE.value}", + prompt_function=get_qa_prompt_function( + Language.CHINESE, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="clue/clue", + hf_subset="cmrc2018", + evaluation_splits=("trial",), + few_shots_split="train", + generation_size=400, + metrics=( + MultilingualQuasiExactMatchMetric(Language.CHINESE, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.CHINESE), + ), + stop_sequence=("\n",), + ) +] diff --git a/src/lighteval/tasks/multilingual/tasks/copa_indic.py b/src/lighteval/tasks/multilingual/tasks/copa_indic.py new file mode 100644 index 000000000..4d664647d --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/copa_indic.py @@ -0,0 +1,93 @@ +""" +name: +Copa Indic + +dataset: +ai4bharat/IndicCOPA + +abstract: +IndicCOPA: COPA for Indic Languages Paper: https://arxiv.org/pdf/2212.05409 +IndicCOPA extends COPA to 15 Indic languages, providing a valuable resource for +evaluating common sense reasoning in these languages. + +languages: +assamese, bengali, gujarati, hindi, kannada, malayalam, marathi, nepali, oriya, +punjabi, sanskrit, sindhi, tamil, telugu, urdu + +tags: +multilingual, multiple-choice, reasoning + +paper: +https://arxiv.org/pdf/2212.05409 +""" + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.copa import get_copa_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +# IndicCOPA: COPA for Indic Languages +# Paper: https://arxiv.org/pdf/2212.05409 +# IndicCOPA extends COPA to 15 Indic languages, providing a valuable resource for +# evaluating common sense reasoning in these languages. 
+ + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"indicxcopa_{language.value}_{formulation.name.lower()}", + suite=["lighteval"], + prompt_function=get_copa_prompt_function( + language, + adapter=lambda line: { + "context": line["premise"], + "cause_effect": line["question"], + "continuations": [line["choice1"], line["choice2"]], + "gold_idx": int(line["label"]), + }, + formulation=formulation, + ), + hf_repo="ai4bharat/IndicCOPA", + hf_subset=f"translation-{standardize_tag(language.value)}", + hf_revision="d356ef19a4eb287e88a51d07a56b73ba88c7f188", + evaluation_splits=["test"], + hf_avail_splits=["test"], + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for language in [ + Language.ASSAMESE, + Language.BENGALI, + Language.GUJARATI, + Language.HINDI, + Language.KANNADA, + Language.MALAYALAM, + Language.MARATHI, + Language.NEPALI, + Language.ORIYA, + Language.PUNJABI, + Language.SANSKRIT, + Language.SINDHI, + Language.TAMIL, + Language.TELUGU, + Language.URDU, + # Optionally: Maithili, Santali, Sindhi, Konkani + ] + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] diff --git a/src/lighteval/tasks/multilingual/tasks/enem.py b/src/lighteval/tasks/multilingual/tasks/enem.py new file mode 100644 index 000000000..b852eeb4e --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/enem.py @@ -0,0 +1,73 @@ +""" +name: +Enem + +dataset: +maritaca-ai/enem + +abstract: +ENEM (Exame Nacional do Ensino Médio) is a standardized Brazilian national +secondary education examination. The exam is used both as a university admission +test and as a high school evaluation test. + +languages: +portuguese + +tags: +knowledge, multilingual, multiple-choice + +paper: +https://huggingface.co/datasets/maritaca-ai/enem +""" + +from functools import partial + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.adapters import ( + enem_adapter, +) +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"enem_{Language.PORTUGUESE.value}_{formulation.name.lower()}:{year}", + prompt_function=get_mcq_prompt_function( + Language.PORTUGUESE, + partial( + enem_adapter, + Language.PORTUGUESE, + ), + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="maritaca-ai/enem", + hf_subset=year, + evaluation_splits=("train",), + hf_avail_splits=["train"], + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for year in ["2022", "2023", "2024"] + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/exams.py b/src/lighteval/tasks/multilingual/tasks/exams.py new file mode 100644 index 000000000..69424a0ef --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/exams.py @@ -0,0 +1,194 @@ +""" +name: +Exams + 
+dataset: +mhardalov/exams + +abstract: +Exams multilingual benchmark. + +languages: +albanian, arabic, bulgarian, croatian, french, german, hungarian, italian, +lithuanian, macedonian, polish, portuguese, serbian, spanish, turkish, +vietnamese + +tags: +knowledge, multilingual, multiple-choice + +paper: +""" + +from functools import partial + +from langcodes import Language as LangCodeLanguage +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation, normalize_subset +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +exams_subjects_by_lang: dict[Language, set[str]] = { + Language.ARABIC: {"Biology", "Islamic Studies", "Physics", "Science", "Social"}, + Language.BULGARIAN: {"Biology", "Chemistry", "Geography", "History", "Philosophy", "Physics"}, + Language.CROATIAN: { + "Biology", + "Chemistry", + "Ethics", + "Fine Arts", + "Geography", + "Geology", + "History", + "Informatics", + "Philosophy", + "Physics", + "Politics", + "Psychology", + "Religion", + "Sociology", + }, + Language.HUNGARIAN: { + "Agriculture", + "Agriculture (Mechanical knowledge)", + "Biology", + "Chemistry", + "Economics", + "Economics & Marketing", + "Economics Basics (Business)", + "Economics Basics (Theoretical)", + "Forestry", + "Geography", + "Landscaping", + "Physics", + "Politics", + "Tourism", + }, + Language.ITALIAN: { + "Biology", + "Chemistry", + "Ethics", + "Geography", + "Geology", + "History", + "Informatics", + "Philosophy", + "Physics", + "Politics", + "Psychology", + "Sociology", + }, + Language.SERBIAN: { + "Biology", + "Chemistry", + "Ethics", + "Geography", + "Geology", + "History", + "Informatics", + "Philosophy", + "Physics", + "Politics", + "Psychology", + "Religion", + "Sociology", + }, + Language.FRENCH: {"Economics", "Economics & Marketing", "Economics Basics (Theoretical)", "Geography", "Physics"}, + Language.GERMAN: { + "Chemistry", + "Economics", + "Economics & Marketing", + "Economics Basics (Theoretical)", + "Geography", + "Physics", + "Tourism", + }, + Language.SPANISH: {"Geography", "Physics"}, + Language.LITHUANIAN: {"Geology", "History"}, + Language.ALBANIAN: { + "Biology", + "Business", + "Chemistry", + "Fine Arts", + "History", + "Philosophy", + "Physics", + "Sociology", + }, + Language.MACEDONIAN: { + "Biology", + "Business", + "Chemistry", + "Fine Arts", + "History", + "Philosophy", + "Physics", + "Sociology", + }, + Language.TURKISH: { + "Biology", + "Business", + "Chemistry", + "Geography", + "History", + "Philosophy", + "Physics", + "Sociology", + }, + Language.POLISH: {"Professional"}, + Language.PORTUGUESE: {"Biology", "Economics", "Geology", "Philosophy"}, + Language.VIETNAMESE: {"Biology", "Chemistry", "Citizenship", "Geography", "History", "Physics"}, +} + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"exams_{language.value}_{formulation.name.lower()}:{normalize_subset(subject)}", + prompt_function=get_mcq_prompt_function( + language, + lambda line: { + "question": line["question"]["stem"], + "choices": line["question"]["choices"]["text"], + "gold_idx": 
line["question"]["choices"]["label"].index(line["answerKey"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="mhardalov/exams", + hf_subset="multilingual", + # Weird bug in dataset + hf_filter=partial( + lambda language, subject, line: line["answerKey"] != "@" + and line["info"]["language"] == LangCodeLanguage(standardize_tag(language.value)).language_name() + and line["info"]["subject"] == subject, + language, + subject, + ), + evaluation_splits=("test",), + few_shots_split="train", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for language in exams_subjects_by_lang.keys() + for subject in exams_subjects_by_lang[language] + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/faquad.py b/src/lighteval/tasks/multilingual/tasks/faquad.py new file mode 100644 index 000000000..cec220bd0 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/faquad.py @@ -0,0 +1,55 @@ +""" +name: +Faquad + +dataset: +eraldoluis/faquad + +abstract: +FaQuAD: A Portuguese Reading Comprehension Dataset + +languages: +portuguese + +tags: +multilingual, qa + +paper: +https://arxiv.org/abs/2007.15671 +""" + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"faquad_{Language.PORTUGUESE.value}", + prompt_function=get_qa_prompt_function( + Language.PORTUGUESE, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="eraldoluis/faquad", + hf_subset="plain_text", + hf_revision="205ba826a2282a4a5aa9bd3651e55ee4f2da1546", + hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), + evaluation_splits=("validation",), + few_shots_split="train", + metrics=( + MultilingualQuasiExactMatchMetric(Language.PORTUGUESE, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.PORTUGUESE), + ), + generation_size=400, + stop_sequence=("\n",), + ) +] diff --git a/community_tasks/filipino_evals.py b/src/lighteval/tasks/multilingual/tasks/filipino.py similarity index 92% rename from community_tasks/filipino_evals.py rename to src/lighteval/tasks/multilingual/tasks/filipino.py index 45011535e..daf29daa6 100644 --- a/community_tasks/filipino_evals.py +++ b/src/lighteval/tasks/multilingual/tasks/filipino.py @@ -1,31 +1,21 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team +""" +name: +Filipino Evals -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: +dataset: +filbench/filbench-eval -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. 
+abstract: +Collection of benchmarks for Filipino language. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. +languages: +filipino -# ruff: noqa: F405, F403, F401 +tags: +knowledge, multilingual, multiple-choice -""" -This file contains the tasks for the Filipino language, collectively known as FilBench. -It includes several tasks for the following categories: Cultural Knowledge, Classical NLP, Reading Comprehension, and Generation. -For more information, please read the paper: https://github.com/filbench/filbench-eval/blob/main/filbench.pdf +paper: +https://github.com/filbench/filbench-eval/blob/main/filbench.pdf Contact: - Lester James V. Miranda @@ -51,7 +41,6 @@ ) from lighteval.tasks.default_prompts import LETTER_INDICES from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.multilingual.tasks import MMLU_SUBSETS from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation from lighteval.tasks.requests import Doc from lighteval.tasks.templates.multichoice import get_mcq_prompt_function @@ -65,6 +54,66 @@ from lighteval.utils.language import Language, iso_639_3_ind_to_iso_639_3_macro +MMLU_SUBSETS = [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions", +] + # Balita NLP FILIPINO_BALITA_TASKS = [ LightevalTaskConfig( @@ -150,7 +199,6 @@ few_shots_select="random", suite=["community"], generation_size=-1, - trust_dataset=True, metrics=get_metrics_for_formulation( formulation, [ @@ -201,7 +249,6 @@ few_shots_split="test", few_shots_select="random", generation_size=-1, - trust_dataset=True, version=0, ) for formulation in [MCFFormulation(), HybridFormulation()] @@ -243,14 +290,13 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc: hf_subset="default", prompt_function=filipino_dengue_pfn, hf_repo="jcblaise/dengue_filipino", - 
metrics=[Metrics.loglikelihood_acc_norm], + metrics=[LogLikelihoodAccMetric(normalization=LogProbTokenNorm())], hf_avail_splits=["train", "test", "validation"], evaluation_splits=["train"], few_shots_split="train", few_shots_select="random", suite=("community",), generation_size=-1, - trust_dataset=True, version=0, ) for subset in dengue_filipino_subsets @@ -286,7 +332,6 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc: few_shots_select="random", suite=["community"], generation_size=-1, - trust_dataset=True, version=0, ) for formulation in [MCFFormulation(), HybridFormulation()] @@ -370,7 +415,6 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc: few_shots_split="test", few_shots_select="random", generation_size=-1, - trust_dataset=True, version=0, ) for subset in ["culturology", "history", "language", "driving_license"] @@ -432,7 +476,6 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc: LogLikelihoodAccMetric(normalization=LogProbCharNorm()), ], ), - trust_dataset=True, ) for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] ] @@ -465,7 +508,6 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc: few_shots_split=None, few_shots_select=None, generation_size=64, - trust_dataset=True, version=0, ) for language in ["fil_Latn"] @@ -519,7 +561,6 @@ def create_sib200_task(language: Language, formulation): few_shots_split="validation", few_shots_select="random", generation_size=-1, - trust_dataset=True, version=0, ) @@ -575,7 +616,6 @@ def prepare_stingray_semantic_appropriateness(line: dict[str, str]) -> dict[str, few_shots_split="test", few_shots_select="random", generation_size=-1, - trust_dataset=True, version=0, ) for formulation in [MCFFormulation(), HybridFormulation()] @@ -605,7 +645,6 @@ def prepare_stingray_semantic_appropriateness(line: dict[str, str]) -> dict[str, few_shots_split="test", few_shots_select="random", generation_size=-1, - trust_dataset=True, version=0, ) for formulation in [MCFFormulation(), HybridFormulation()] @@ -652,7 +691,6 @@ def prepare_stingray_semantic_appropriateness(line: dict[str, str]) -> dict[str, ], hf_avail_splits=["test"], evaluation_splits=["test"], - trust_dataset=True, generation_size=64, ) for language, meta in lang_dict.items() @@ -685,7 +723,6 @@ def prepare_stingray_semantic_appropriateness(line: dict[str, str]) -> dict[str, evaluation_splits=["validation"], few_shots_split=["validation"], few_shots_select="random", - trust_dataset=True, generation_size=64, ) ] @@ -714,7 +751,6 @@ def prepare_stingray_semantic_appropriateness(line: dict[str, str]) -> dict[str, few_shots_select="random", suite=["community"], generation_size=-1, - trust_dataset=True, metrics=get_metrics_for_formulation( formulation, [ @@ -758,7 +794,6 @@ def create_universalner_task(language: Language, formulation): few_shots_select="random", suite=["community"], generation_size=-1, - trust_dataset=True, metrics=get_metrics_for_formulation( formulation, [ diff --git a/src/lighteval/tasks/multilingual/tasks/flores200.py b/src/lighteval/tasks/multilingual/tasks/flores200.py new file mode 100644 index 000000000..c9d07122c --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/flores200.py @@ -0,0 +1,271 @@ +""" +name: +Flores200 + +dataset: +facebook/flores + +abstract: +Flores200 multilingual benchmark. 
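+FLORES-200 is the many-to-many translation benchmark released with the No
+Language Left Behind (NLLB) project: the same sentences are translated across
+roughly 200 languages, and every ordered language pair is evaluated here as
+its own translation task.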
+ +languages: +arabic, armenian, bengali, cyrillic, devanagari, ethiopic, georgian, greek, +gujarati, gurmukhi, chinese (simplified), chinese (traditional), hangul, hebrew, +japanese, khmer, kannada, lao, latin, malayalam, myanmar, odia, sinhala, tamil, +telugu, thai, tibetan + +tags: +multilingual, translation + +paper: +""" + +from itertools import permutations + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.translation import get_translation_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, +) +from lighteval.utils.language import Language, manage_duplicate_language_codes + + +flores_200_languages = [ + # "ace_Arab", + "ace_Latn", + "acm_Arab", + "acq_Arab", + "aeb_Arab", + "afr_Latn", + "ajp_Arab", + "aka_Latn", + "amh_Ethi", + "apc_Arab", + "arb_Arab", + # "arb_Latn", + "ars_Arab", + "ary_Arab", + "arz_Arab", + "asm_Beng", + "ast_Latn", + "awa_Deva", + "ayr_Latn", + "azb_Arab", + "azj_Latn", + "bak_Cyrl", + "bam_Latn", + "ban_Latn", + "bel_Cyrl", + "bem_Latn", + "ben_Beng", + "bho_Deva", + # "bjn_Arab", + "bjn_Latn", + "bod_Tibt", + "bos_Latn", + "bug_Latn", + "bul_Cyrl", + "cat_Latn", + "ceb_Latn", + "ces_Latn", + "cjk_Latn", + "ckb_Arab", + "crh_Latn", + "cym_Latn", + "dan_Latn", + "deu_Latn", + "dik_Latn", + "dyu_Latn", + "dzo_Tibt", + "ell_Grek", + "eng_Latn", + "epo_Latn", + "est_Latn", + "eus_Latn", + "ewe_Latn", + "fao_Latn", + "fij_Latn", + "fin_Latn", + "fon_Latn", + "fra_Latn", + "fur_Latn", + "fuv_Latn", + "gla_Latn", + "gle_Latn", + "glg_Latn", + "grn_Latn", + "guj_Gujr", + "hat_Latn", + "hau_Latn", + "heb_Hebr", + "hin_Deva", + "hne_Deva", + "hrv_Latn", + "hun_Latn", + "hye_Armn", + "ibo_Latn", + "ilo_Latn", + "ind_Latn", + "isl_Latn", + "ita_Latn", + "jav_Latn", + "jpn_Jpan", + "kab_Latn", + "kac_Latn", + "kam_Latn", + "kan_Knda", + # "kas_Arab", + "kas_Deva", + "kat_Geor", + # "knc_Arab", + "knc_Latn", + "kaz_Cyrl", + "kbp_Latn", + "kea_Latn", + "khm_Khmr", + "kik_Latn", + "kin_Latn", + "kir_Cyrl", + "kmb_Latn", + "kmr_Latn", + "kon_Latn", + "kor_Hang", + "lao_Laoo", + "lij_Latn", + "lim_Latn", + "lin_Latn", + "lit_Latn", + "lmo_Latn", + "ltg_Latn", + "ltz_Latn", + "lua_Latn", + "lug_Latn", + "luo_Latn", + "lus_Latn", + "lvs_Latn", + "mag_Deva", + "mai_Deva", + "mal_Mlym", + "mar_Deva", + # "min_Arab", + "min_Latn", + "mkd_Cyrl", + "plt_Latn", + "mlt_Latn", + "mni_Beng", + "khk_Cyrl", + "mos_Latn", + "mri_Latn", + "mya_Mymr", + "nld_Latn", + "nno_Latn", + "nob_Latn", + "npi_Deva", + "nso_Latn", + "nus_Latn", + "nya_Latn", + "oci_Latn", + "gaz_Latn", + "ory_Orya", + "pag_Latn", + "pan_Guru", + "pap_Latn", + "pes_Arab", + "pol_Latn", + "por_Latn", + "prs_Arab", + "pbt_Arab", + "quy_Latn", + "ron_Latn", + "run_Latn", + "rus_Cyrl", + "sag_Latn", + "san_Deva", + "sat_Olck", + "scn_Latn", + "shn_Mymr", + "sin_Sinh", + "slk_Latn", + "slv_Latn", + "smo_Latn", + "sna_Latn", + "snd_Arab", + "som_Latn", + "sot_Latn", + "spa_Latn", + "als_Latn", + "srd_Latn", + "srp_Cyrl", + "ssw_Latn", + "sun_Latn", + "swe_Latn", + "swh_Latn", + "szl_Latn", + "tam_Taml", + "tat_Cyrl", + "tel_Telu", + "tgk_Cyrl", + "tgl_Latn", + "tha_Thai", + "tir_Ethi", + "taq_Latn", + "taq_Tfng", + "tpi_Latn", + "tsn_Latn", + "tso_Latn", + "tuk_Latn", + "tum_Latn", + "tur_Latn", + "twi_Latn", + "tzm_Tfng", + "uig_Arab", + "ukr_Cyrl", + "umb_Latn", + "urd_Arab", + "uzn_Latn", + "vec_Latn", + "vie_Latn", + "war_Latn", + "wol_Latn", + "xho_Latn", + "ydd_Hebr", + "yor_Latn", + 
"yue_Hant", + "zho_Hans", + # "zho_Hant", + "zsm_Latn", + "zul_Latn", +] + + +def flores_adapter(lang1, lang2): + return lambda line: { + "source_text": line[f"sentence_{lang1}"], + "target_text": line[f"sentence_{lang2}"], + } + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"flores200:{lang1}-{lang2}", + prompt_function=get_translation_prompt_function( + source_language=Language(manage_duplicate_language_codes(lang1.split("_")[0])), + target_language=Language(manage_duplicate_language_codes(lang2.split("_")[0])), + adapter=flores_adapter(lang1, lang2), + formulation=CFFormulation(), + ), + suite=("lighteval",), + hf_repo="facebook/flores", + hf_subset=f"{lang1}-{lang2}", + hf_avail_splits=["dev", "devtest"], + evaluation_splits=["devtest"], + few_shots_split="dev", + few_shots_select=None, + generation_size=300, + metrics=[Metrics.chrf_plus, Metrics.bleu, Metrics.bleu_1, Metrics.bleu_4], + stop_sequence=["\n"], + version=0, + ) + for (lang1, lang2) in permutations(flores_200_languages, 2) +] diff --git a/src/lighteval/tasks/multilingual/tasks/fquad_v2.py b/src/lighteval/tasks/multilingual/tasks/fquad_v2.py new file mode 100644 index 000000000..b7f177a32 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/fquad_v2.py @@ -0,0 +1,53 @@ +""" +name: +Fquad V2 + +dataset: +manu/fquad2_test + +abstract: +FQuAD v2: French Question Answering Dataset version 2. + +languages: +french + +tags: +multilingual, qa + +paper: +https://arxiv.org/abs/2002.06071 +""" + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"fquadv2_{Language.FRENCH.value}", + prompt_function=get_qa_prompt_function( + Language.FRENCH, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="manu/fquad2_test", + hf_subset="default", + evaluation_splits=("test_hasAns",), + few_shots_split="valid_hasAns", + generation_size=400, + stop_sequence=("\n",), + metrics=( + MultilingualQuasiExactMatchMetric(Language.FRENCH, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.FRENCH), + ), + ) +] diff --git a/community_tasks/french_evals.py b/src/lighteval/tasks/multilingual/tasks/french.py similarity index 72% rename from community_tasks/french_evals.py rename to src/lighteval/tasks/multilingual/tasks/french.py index 8e0480aac..12cf3d928 100644 --- a/community_tasks/french_evals.py +++ b/src/lighteval/tasks/multilingual/tasks/french.py @@ -1,33 +1,21 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: +""" +name: +French Evals -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. 
+dataset: +fr-gouv-coordination-ia/IFEval-fr, fr-gouv-coordination-ia/gpqa-fr, fr-gouv-coordination-ia/bac-fr -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. +abstract: +Collection of benchmarks for the french language. -# ruff: noqa: F405, F403, F401 -""" -Custom evaluation tasks for lighteval. +languages: +french -This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. +tags: +knowledge, multiple-choice, qa -This module implements tasks for the french specific datasets -See : https://huggingface.co/fr-gouv-coordination-ia +paper: +https://huggingface.co/fr-gouv-coordination-ia """ import random @@ -35,9 +23,9 @@ from lighteval.metrics.metrics import Metrics from lighteval.metrics.normalizations import math_normalizer from lighteval.tasks.default_prompts import LETTER_INDICES -from lighteval.tasks.extended.ifeval.main import ifeval_metrics from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc +from lighteval.tasks.tasks.ifeval.main import ifeval_metrics from lighteval.utils.utils import as_list diff --git a/src/lighteval/tasks/multilingual/tasks/french_boolq.py b/src/lighteval/tasks/multilingual/tasks/french_boolq.py new file mode 100644 index 000000000..d1bd58931 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/french_boolq.py @@ -0,0 +1,53 @@ +""" +name: +French Boolq + +dataset: +manu/french_boolq + +abstract: +French Boolq multilingual benchmark. + +languages: +french + +tags: +classification, multilingual, qa + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, + MultilingualQuasiExactMatchMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.boolq import get_boolq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"community_boolq_{Language.FRENCH.value}", + prompt_function=get_boolq_prompt_function( + Language.FRENCH, + lambda line: { + "question": line["question"], + "answer": line["label"] == 1, + "context": line["passage"], + }, + formulation=CFFormulation(), + ), + suite=("lighteval",), + hf_repo="manu/french_boolq", + hf_subset="default", + evaluation_splits=("test",), + few_shots_split="valid", + generation_size=5, + stop_sequence=["\n"], + metrics=[MultilingualQuasiExactMatchMetric(Language.FRENCH, "full"), LogLikelihoodAccMetric()], + ) +] diff --git a/src/lighteval/tasks/multilingual/tasks/french_triviqa.py b/src/lighteval/tasks/multilingual/tasks/french_triviqa.py new file mode 100644 index 000000000..7fa335703 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/french_triviqa.py @@ -0,0 +1,51 @@ +""" +name: +French Triviqa + +dataset: +manu/french-trivia + +abstract: +French Triviqa multilingual benchmark. 
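+Open-ended French trivia questions with short free-form answers, scored with
+quasi exact match and F1.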
+ +languages: +french + +tags: +multilingual, qa + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"community_triviaqa_{Language.FRENCH.value}", + prompt_function=get_qa_prompt_function( + Language.FRENCH, + lambda line: { + "question": line["Question"], + "choices": [line["Answer"]], + }, + ), + suite=("lighteval",), + hf_repo="manu/french-trivia", + hf_subset="default", + evaluation_splits=("train",), + hf_avail_splits=["train"], + generation_size=400, + stop_sequence=("\n",), + metrics=[ + MultilingualQuasiExactMatchMetric(Language.FRENCH, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.FRENCH), + ], + ) +] diff --git a/community_tasks/german_rag_evals.py b/src/lighteval/tasks/multilingual/tasks/german_rag.py similarity index 78% rename from community_tasks/german_rag_evals.py rename to src/lighteval/tasks/multilingual/tasks/german_rag.py index 052826287..06eb398d7 100644 --- a/community_tasks/german_rag_evals.py +++ b/src/lighteval/tasks/multilingual/tasks/german_rag.py @@ -1,33 +1,21 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team -# Copyright (c) 2024 Philip May, Deutsche Telekom AG - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -# ruff: noqa: F405, F403, F401 """ -Custom evaluation tasks for lighteval. +name: +German RAG Evals -This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. -This module implements the 4 tasks of deutsche-telekom/Ger-RAG-eval. -See: https://huggingface.co/datasets/deutsche-telekom/Ger-RAG-eval +dataset: +deutsche-telekom/Ger-RAG-eval + +abstract: +Collection of benchmarks for the German language. + +languages: +german + +tags: +knowledge, reasoning, multiple-choice + +paper: +https://huggingface.co/datasets/deutsche-telekom/Ger-RAG-eval """ from lighteval.metrics.metrics import Metrics diff --git a/src/lighteval/tasks/multilingual/tasks/germanquad.py b/src/lighteval/tasks/multilingual/tasks/germanquad.py new file mode 100644 index 000000000..895c2bedc --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/germanquad.py @@ -0,0 +1,55 @@ +""" +name: +Germanquad + +dataset: +deepset/germanquad + +abstract: +GermanQuAD: High-quality German QA dataset with 13,722 questions. 
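+Questions are annotated over passages from German Wikipedia and answered with
+extractive spans, following the SQuAD format.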
+ +languages: +german + +tags: +multilingual, qa + +paper: +https://arxiv.org/abs/2104.12741 +""" + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"germanquad_{Language.GERMAN.value}", + prompt_function=get_qa_prompt_function( + Language.GERMAN, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="deepset/germanquad", + hf_subset="plain_text", + hf_revision="fff05ceaf2ffbe5b65c7e0c57e678f7b7e1a0581", + hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), + evaluation_splits=("test",), + few_shots_split="train", + generation_size=400, + stop_sequence=("\n",), + metrics=( + MultilingualQuasiExactMatchMetric(Language.GERMAN, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.GERMAN), + ), + ) +] diff --git a/src/lighteval/tasks/multilingual/tasks/global_mmlu.py b/src/lighteval/tasks/multilingual/tasks/global_mmlu.py new file mode 100644 index 000000000..217eb25e6 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/global_mmlu.py @@ -0,0 +1,184 @@ +""" +name: +Global Mmlu + +dataset: +CohereForAI/Global-MMLU + +abstract: +Translated MMLU using both professional and non-professional translators. +Contains tags for cultural sensitivity. + +languages: +amharic, arabic, bengali, chinese, czech, dutch, english, french, german, +hebrew, hindi, indonesian, italian, japanese, korean, malay, norwegian, polish, +portuguese, romanian, russian, serbian, spanish, swahili, swedish, tamil, +telugu, thai, turkish, ukrainian, urdu, vietnamese, yoruba, zulu + +tags: +knowledge, multilingual, multiple-choice + +paper: +https://huggingface.co/papers/2412.03304 +""" + +from functools import partial + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +MMLU_SUBSETS = [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", 
+ "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions", +] + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"global_mmlu_{sensitivity_label.lower()}_{language.value}_{formulation.name.lower()}:{subset}", + prompt_function=get_mcq_prompt_function( + language, + lambda line: { + "question": line["question"], + "choices": [line["option_a"], line["option_b"], line["option_c"], line["option_d"]], + "gold_idx": LETTER_INDICES.index(line["answer"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="CohereForAI/Global-MMLU", + hf_subset=standardize_tag(language.value), + evaluation_splits=("test",), + few_shots_split="dev", + hf_filter=partial( + lambda subset, sensitivity_label, x: x["subject"].lower() == subset + and ( + sensitivity_label == "ALL" or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK") + ) + and all(x[f"option_{opt}"] is not None and x[f"option_{opt}"].strip() for opt in "abcd"), + subset, + sensitivity_label, + ), + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), + ], + ), + ) + for subset in MMLU_SUBSETS + for language in [ + Language.AMHARIC, + Language.ARABIC, + Language.BENGALI, + Language.CHINESE, + Language.CZECH, + Language.GERMAN, + Language.ENGLISH, + Language.SPANISH, + Language.FRENCH, + Language.HEBREW, + Language.HINDI, + Language.INDONESIAN, + Language.ITALIAN, + Language.JAPANESE, + Language.KOREAN, + Language.MALAY, + Language.DUTCH, + Language.NORWEGIAN, + Language.POLISH, + Language.PORTUGUESE, + Language.ROMANIAN, + Language.RUSSIAN, + Language.SERBIAN, + Language.SWEDISH, + Language.SWAHILI, + Language.TAMIL, + Language.TELUGU, + Language.THAI, + Language.TURKISH, + Language.UKRAINIAN, + Language.URDU, + Language.VIETNAMESE, + Language.YORUBA, + Language.ZULU, + ] + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] + for sensitivity_label in ["ALL", "CA", "CS", "UNK"] +] diff --git a/src/lighteval/tasks/multilingual/tasks/hellaswag_hin.py b/src/lighteval/tasks/multilingual/tasks/hellaswag_hin.py new file mode 100644 index 000000000..ad3db12de --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/hellaswag_hin.py @@ -0,0 +1,62 @@ +""" +name: +Hellaswag Hin + +dataset: +ai4bharat/hellaswag-hi + +abstract: +Hellaswag Hin multilingual benchmark. 
+ +languages: +hindi + +tags: +multilingual, multiple-choice, reasoning + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"community_hellaswag_{Language.HINDI.value}_{formulation.name.lower()}", + suite=["lighteval"], + prompt_function=get_hellaswag_prompt_function( + language=Language.HINDI, + adapter=lambda line: { + "ctx_a": line["ctx_a"], + "continuations": line["endings"], + "gold_idx": int(line["label"]), + }, + formulation=formulation, + ), + hf_repo="ai4bharat/hellaswag-hi", + hf_filter=lambda line: all(len(choice.strip()) > 0 for choice in line["endings"]), + hf_subset="hi", + evaluation_splits=("validation",), + few_shots_split="validation", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + ], + ), + ) + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] diff --git a/src/lighteval/tasks/multilingual/tasks/hellaswag_tel.py b/src/lighteval/tasks/multilingual/tasks/hellaswag_tel.py new file mode 100644 index 000000000..127329160 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/hellaswag_tel.py @@ -0,0 +1,61 @@ +""" +name: +Hellaswag Tel + +dataset: +LightFury9/hellaswag-telugu + +abstract: +Hellaswag Tel multilingual benchmark. 
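+A Telugu adaptation of the HellaSwag sentence-completion task, evaluated on
+its validation split with few-shot examples drawn from the train split.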
+ +languages: +telugu + +tags: +multilingual, multiple-choice, reasoning + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"community_hellaswag_{Language.TELUGU.value}_{formulation.name.lower()}", + suite=["lighteval"], + prompt_function=get_hellaswag_prompt_function( + language=Language.TELUGU, + adapter=lambda line: { + "ctx_a": line["ctx_a"], + "continuations": line["endings"], + "gold_idx": int(line["label"]), + }, + formulation=formulation, + ), + hf_repo="LightFury9/hellaswag-telugu", + hf_subset="default", + evaluation_splits=("valid",), + few_shots_split="train", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] diff --git a/src/lighteval/tasks/multilingual/tasks/hellaswag_tha.py b/src/lighteval/tasks/multilingual/tasks/hellaswag_tha.py new file mode 100644 index 000000000..201f287bd --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/hellaswag_tha.py @@ -0,0 +1,65 @@ +""" +name: +Hellaswag Tha + +dataset: +lighteval/hellaswag_thai + +abstract: +Hellaswag Thai This is a Thai adaptation of the Hellaswag task. Similar to the +Turkish version, there's no specific paper, but it has been found to be +effective for evaluating Thai language models on commonsense reasoning tasks. 
+ +languages: +thai + +tags: +multilingual, multiple-choice, reasoning + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"community_hellaswag_{Language.THAI.value}_{formulation.name.lower()}", + suite=["lighteval"], + prompt_function=get_hellaswag_prompt_function( + language=Language.THAI, + adapter=lambda line: { + "ctx_a": line["ctx_a"], + "ctx_b": line["ctx_b"], + "continuations": line["endings"], + "gold_idx": int(line["label"]), + }, + formulation=formulation, + wikihow_artifacts=[" [ชื่อ]", " [ส่วนหัว]", " [ขั้นตอน]", " [header]", " [Header]"], + ), + hf_repo="lighteval/hellaswag_thai", + hf_subset="default", + evaluation_splits=["validation"], + few_shots_split="train", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] diff --git a/src/lighteval/tasks/multilingual/tasks/hellaswag_tur.py b/src/lighteval/tasks/multilingual/tasks/hellaswag_tur.py new file mode 100644 index 000000000..84cb9bc52 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/hellaswag_tur.py @@ -0,0 +1,68 @@ +""" +name: +Hellaswag Tur + +dataset: +malhajar/hellaswag_tr-v0.2 + +abstract: +Hellaswag Turkish This is a Turkish adaptation of the Hellaswag task. While +there's no specific paper for this version, it has been found to work well for +evaluating Turkish language models on commonsense reasoning tasks. We don't +handle them in single task as there is quite a lot of differences +(dataset/subset, dot replacement, etc.) 
which would make it hard to read + +languages: +turkish + +tags: +multilingual, multiple-choice, reasoning + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"community_hellaswag_{Language.TURKISH.value}_{formulation.name.lower()}", + suite=["lighteval"], + prompt_function=get_hellaswag_prompt_function( + language=Language.TURKISH, + adapter=lambda line: { + "ctx_a": line["ctx_a"], + "ctx_b": line["ctx_b"], + "continuations": line["endings"], + "gold_idx": int(line["label"]), + }, + formulation=formulation, + # https://github.com/malhajar17/lm-evaluation-harness_turkish/blob/main/lm_eval/tasks/hellaswag_tr-v0.2/utils.py + wikihow_artifacts=[" [title]", " [başlık]", " [adım]", " [header]"], + ), + hf_repo="malhajar/hellaswag_tr-v0.2", + hf_subset="default", + evaluation_splits=["validation"], + hf_avail_splits=["validation"], + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] diff --git a/src/lighteval/tasks/multilingual/tasks/hindi_arc.py b/src/lighteval/tasks/multilingual/tasks/hindi_arc.py new file mode 100644 index 000000000..625a0ebd0 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/hindi_arc.py @@ -0,0 +1,70 @@ +""" +name: +Hindi Arc + +dataset: +ai4bharat/ai2_arc-hi + +abstract: +Hindi Arc multilingual benchmark. 
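+A Hindi translation of the AI2 Reasoning Challenge (ARC) grade-school science
+questions, covering both the Easy and Challenge subsets.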
+ +languages: +hindi + +tags: +multilingual, multiple-choice, reasoning + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"community_arc_{Language.HINDI.value}_{formulation.name.lower()}:{subset}", + prompt_function=get_mcq_prompt_function( + Language.HINDI, + lambda line: { + "question": line["question"], + "choices": line["choices"]["text"], + "gold_idx": int(line["answerKey"]) - 1 + if line["answerKey"].isdigit() + else LETTER_INDICES.index(line["answerKey"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="ai4bharat/ai2_arc-hi", + hf_subset=f"ARC-{subset.capitalize()}", + evaluation_splits=("test",), + few_shots_split="validation", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ] + + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore + ), + ) + for subset in ["easy", "challenge"] + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/hindi_boolq.py b/src/lighteval/tasks/multilingual/tasks/hindi_boolq.py new file mode 100644 index 000000000..2a77d0ac2 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/hindi_boolq.py @@ -0,0 +1,62 @@ +""" +name: +Hindi Boolq + +dataset: +ai4bharat/boolq-hi + +abstract: +Hindi Boolq multilingual benchmark. 
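+BoolQ-style yes/no reading-comprehension questions (a passage paired with a
+boolean question), available in Hindi and several other Indic languages.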
+ +languages: +gujarati, hindi, malayalam, marathi, tamil + +tags: +classification, multilingual, qa + +paper: +""" + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, + MultilingualQuasiExactMatchMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.boolq import get_boolq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"community_boolq_{language.value}", + prompt_function=get_boolq_prompt_function( + language, + lambda line: { + "question": line["question"], + "answer": line["answer"], + "context": line["passage"], + }, + formulation=CFFormulation(), + ), + suite=("lighteval",), + hf_repo="ai4bharat/boolq-hi", + hf_subset=standardize_tag(language.value), + evaluation_splits=("validation",), + few_shots_split="train", + generation_size=5, + stop_sequence=["\n"], + metrics=[MultilingualQuasiExactMatchMetric(language, "full"), LogLikelihoodAccMetric()], + ) + for language in [ + Language.HINDI, + Language.GUJARATI, + Language.MALAYALAM, + Language.MARATHI, + Language.TAMIL, + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/indicqa.py b/src/lighteval/tasks/multilingual/tasks/indicqa.py new file mode 100644 index 000000000..09eb297d5 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/indicqa.py @@ -0,0 +1,71 @@ +""" +name: +Indicqa + +dataset: +ai4bharat/IndicQA + +abstract: +IndicQA: A reading comprehension dataset for 11 Indian languages. + +languages: +assamese, bengali, gujarati, hindi, kannada, malayalam, marathi, oriya, punjabi, +tamil, telugu + +tags: +multilingual, qa + +paper: +https://arxiv.org/abs/2407.13522 +""" + +from langcodes import Language as LangCodeLanguage + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"indicqa_{language.value}", + prompt_function=get_qa_prompt_function( + language, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="ai4bharat/IndicQA", + hf_subset=f"indicqa.{LangCodeLanguage.get(language.value).language}", + hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), + hf_revision="92d96092ae229950973dac3b9998f8b3a8949b0a", + evaluation_splits=("test",), + hf_avail_splits=("test",), + generation_size=400, + metrics=( + MultilingualQuasiExactMatchMetric(language, "prefix"), + MultilingualQuasiF1ScoreMetric(language), + ), + stop_sequence=("\n",), + ) + for language in [ + Language.ASSAMESE, + Language.BENGALI, + Language.GUJARATI, + Language.HINDI, + Language.KANNADA, + Language.MALAYALAM, + Language.MARATHI, + Language.ORIYA, + Language.PUNJABI, + Language.TAMIL, + Language.TELUGU, + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/kenswquad.py b/src/lighteval/tasks/multilingual/tasks/kenswquad.py new file mode 100644 index 000000000..c90ca1c36 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/kenswquad.py @@ -0,0 +1,53 @@ +""" +name: +Kenswquad + +dataset: +lighteval/KenSwQuAD + +abstract: +KenSwQuAD: A 
question answering dataset for Kenyan Swahili. + +languages: +swahili + +tags: +multilingual, qa + +paper: +https://arxiv.org/abs/2205.02364 +""" + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"kenswquad_{Language.SWAHILI.value}", + prompt_function=get_qa_prompt_function( + Language.SWAHILI, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [line["answer"]], + }, + ), + suite=("lighteval",), + hf_repo="lighteval/KenSwQuAD", + hf_subset="default", + evaluation_splits=("test",), + few_shots_split="validation", + metrics=( + MultilingualQuasiExactMatchMetric(Language.SWAHILI, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.SWAHILI), + ), + generation_size=400, + stop_sequence=("\n",), + ) +] diff --git a/src/lighteval/tasks/multilingual/tasks/m3exams.py b/src/lighteval/tasks/multilingual/tasks/m3exams.py new file mode 100644 index 000000000..65a03f94a --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/m3exams.py @@ -0,0 +1,85 @@ +""" +name: +M3Exams + +dataset: +chiayewken/m3exam + +abstract: +M3Exam: Multitask Multilingual Multimodal Evaluation Benchmark It also contains +a multimodal version but we don't support that Paper: +https://arxiv.org/abs/2306.05179 + +languages: +afrikaans, chinese, english, italian, javanese, portuguese, swahili, thai, +vietnamese + +tags: +knowledge, multilingual, multiple-choice + +paper: +https://arxiv.org/abs/2306.05179 +""" + +from functools import partial + +from langcodes import Language as LangCodeLanguage +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.adapters import ( + get_m3exam_adapter, +) +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"m3exams_{language.value}_{formulation.name.lower()}", + suite=("lighteval",), + prompt_function=get_mcq_prompt_function( + language, + partial(get_m3exam_adapter, language), + formulation=formulation, + ), + hf_repo="chiayewken/m3exam", + hf_subset=LangCodeLanguage(standardize_tag(language.value)).language_name().lower(), + evaluation_splits=("test",), + few_shots_split="dev", + generation_size=-1, + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for language in [ + Language.AFRIKAANS, + Language.CHINESE, + Language.ENGLISH, + Language.ITALIAN, + Language.JAVANESE, + Language.PORTUGUESE, + Language.SWAHILI, + Language.THAI, + Language.VIETNAMESE, + ] + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/mathlogicqa_rus.py 
b/src/lighteval/tasks/multilingual/tasks/mathlogicqa_rus.py new file mode 100644 index 000000000..ac7652a46 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/mathlogicqa_rus.py @@ -0,0 +1,70 @@ +""" +name: +Mathlogicqa Rus + +dataset: +ai-forever/MERA + +abstract: +MathLogicQA is a dataset for evaluating mathematical reasoning in language +models. It consists of multiple-choice questions that require logical reasoning +and mathematical problem-solving. This Russian version is part of the MERA +(Multilingual Evaluation of Reasoning Abilities) benchmark. + +languages: +russian + +tags: +math, multilingual, qa, reasoning + +paper: +https://github.com/ai-forever/MERA +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"mathlogic_qa_{Language.RUSSIAN.value}_{formulation.name.lower()}", + prompt_function=get_mcq_prompt_function( + Language.RUSSIAN, + lambda line: { + "question": line["inputs"]["text"], + "choices": [line["inputs"][f"option_{i.lower()}"] for i in LETTER_INDICES[:4]], + "gold_idx": LETTER_INDICES.index(line["outputs"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="ai-forever/MERA", + hf_subset="mathlogicqa", + evaluation_splits=("train",), + hf_avail_splits=["train"], + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [ + CFFormulation(), + MCFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/meta_mmlu.py b/src/lighteval/tasks/multilingual/tasks/meta_mmlu.py new file mode 100644 index 000000000..f7a88e3f6 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/meta_mmlu.py @@ -0,0 +1,149 @@ +""" +name: +Meta Mmlu + +dataset: +meta-llama/Meta-Llama-3.1-8B-Instruct-evals + +abstract: +Meta MMLU: A multilingual version of MMLU (using google translation) + +languages: +french, german, hindi, italian, portuguese, spanish, thai + +tags: +knowledge, multilingual, multiple-choice + +paper: +https://arxiv.org/abs/2407.21783 +""" + +from functools import partial + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +MMLU_SUBSETS = [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + 
"college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions", +] + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"meta_mmlu_{language.value}_{formulation.name.lower()}:{subset}", + prompt_function=get_mcq_prompt_function( + language, + lambda line: { + "question": line["input_question"], + "choices": [v for _, v in sorted(line["input_choice_list"].items(), key=lambda x: x[0])], + "gold_idx": LETTER_INDICES.index(line["input_correct_responses"][0]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="meta-llama/Meta-Llama-3.1-8B-Instruct-evals", + hf_subset=f"Llama-3.1-8B-Instruct-evals__multilingual_mmlu_{standardize_tag(language.value)}__details", + hf_filter=partial( + lambda language, subset, line: line["subtask_name"] + == f"mmlu_{standardize_tag(language.value)}_chat.{subset}", + language, + subset, + ), + evaluation_splits=("latest",), + hf_avail_splits=["latest"], + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), + ], + ), + ) + for subset in MMLU_SUBSETS + for language in [ + Language.GERMAN, + Language.SPANISH, + Language.FRENCH, + Language.HINDI, + Language.ITALIAN, + Language.PORTUGUESE, + Language.THAI, + ] + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/mgsm.py b/src/lighteval/tasks/multilingual/tasks/mgsm.py new file mode 100644 index 000000000..c72cf1ca7 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/mgsm.py @@ -0,0 +1,67 @@ +""" +name: +Mgsm + +dataset: +juletxara/mgsm + +abstract: +Mgsm multilingual benchmark. 
+ +languages: +bengali, chinese, english, french, german, japanese, russian, spanish, swahili, +telugu, thai + +tags: +math, multilingual, reasoning + +paper: +""" + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"mgsm_{language.value}", + prompt_function=get_qa_prompt_function( + language, + lambda line: { + "question": line["question"], + # The cot is available but we have no use: + # line["answer"] + "choices": [str(line["answer_number"])], + }, + ), + suite=("lighteval",), + hf_repo="juletxara/mgsm", + hf_subset=standardize_tag(language.value), + evaluation_splits=("test",), + few_shots_split="train", + generation_size=25, + metrics=[ + MultilingualQuasiExactMatchMetric(language, "full"), + ], + stop_sequence=("\n",), + ) + for language in [ + Language.ENGLISH, + Language.SPANISH, + Language.FRENCH, + Language.GERMAN, + Language.RUSSIAN, + Language.CHINESE, + Language.JAPANESE, + Language.THAI, + Language.SWAHILI, + Language.BENGALI, + Language.TELUGU, + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/mintaka.py b/src/lighteval/tasks/multilingual/tasks/mintaka.py new file mode 100644 index 000000000..e888a103e --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/mintaka.py @@ -0,0 +1,64 @@ +""" +name: +Mintaka + +dataset: +AmazonScience/mintaka + +abstract: +Mintaka multilingual benchmark. + +languages: +arabic, english, french, german, hindi, italian, japanese, portuguese, spanish + +tags: +knowledge, multilingual, qa + +paper: +""" + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"mintaka_{lang.value}", + prompt_function=get_qa_prompt_function( + lang, + lambda line: { + "question": line["question"], + "choices": [line["answerText"]], + }, + ), + suite=("lighteval",), + hf_repo="AmazonScience/mintaka", + hf_subset=standardize_tag(lang.value), + evaluation_splits=("test",), + few_shots_split="train", + generation_size=400, + stop_sequence=("\n",), + metrics=[ + MultilingualQuasiExactMatchMetric(lang, "prefix"), + MultilingualQuasiF1ScoreMetric(lang), + ], + ) + for lang in [ + Language.ARABIC, + Language.GERMAN, + Language.ENGLISH, + Language.SPANISH, + Language.FRENCH, + Language.HINDI, + Language.ITALIAN, + Language.JAPANESE, + Language.PORTUGUESE, + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/mkqa.py b/src/lighteval/tasks/multilingual/tasks/mkqa.py new file mode 100644 index 000000000..a4d803633 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/mkqa.py @@ -0,0 +1,108 @@ +""" +name: +Mkqa + +dataset: +apple/mkqa + +abstract: +Mkqa multilingual benchmark. 
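+MKQA (Multilingual Knowledge Questions and Answers) is an open-domain QA
+benchmark of questions derived from Natural Questions, with aligned answers
+in 26 languages; each answer type (entity, date, number, binary, ...) is
+exposed here as its own subset.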
+ +languages: +arabic, chinese, chinese_hong_kong, chinese_traditional, danish, dutch, english, +finnish, french, german, hebrew, hungarian, italian, japanese, khmer, korean, +malay, norwegian, polish, portuguese, russian, spanish, swedish, thai, turkish, +vietnamese + +tags: +multilingual, qa + +paper: +""" + +from functools import partial + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.adapters import ( + get_mkqa_adapter, +) +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +MKQA_TASK_TO_ID = { + "entity": 0, + "long_answer": 1, + # "unanswerable": 2, + "date": 3, + "number": 4, + "number_with_unit": 5, + "short_phrase": 6, + "binary": 7, +} + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"mkqa_{language.value}:{subset}", + prompt_function=get_qa_prompt_function(language, partial(get_mkqa_adapter, language)), + suite=("lighteval",), + hf_repo="apple/mkqa", + hf_subset="mkqa", + hf_revision="325131889721ae0ed885b76ecb8011369d75abad", + hf_filter=partial( + lambda language, subset, line: line["answers"][ + "zh_cn" if language == Language.CHINESE else standardize_tag(language.value) + ][0]["type"] + == MKQA_TASK_TO_ID[subset], + language, + subset, + ), + evaluation_splits=("train",), + hf_avail_splits=["train"], + stop_sequence=("\n",), + metrics=[ + MultilingualQuasiExactMatchMetric(language, "prefix"), + MultilingualQuasiF1ScoreMetric(language), + ] + if subset in ["entity", "long_answer", "short_phrase"] + else [ + MultilingualQuasiExactMatchMetric(language, "full"), + ], + ) + for subset in MKQA_TASK_TO_ID.keys() + for language in [ + Language.ARABIC, + Language.DANISH, + Language.GERMAN, + Language.ENGLISH, + Language.SPANISH, + Language.FINNISH, + Language.FRENCH, + Language.HEBREW, + Language.HUNGARIAN, + Language.ITALIAN, + Language.JAPANESE, + Language.KOREAN, + Language.KHMER, + Language.MALAY, + Language.DUTCH, + Language.NORWEGIAN, + Language.POLISH, + Language.PORTUGUESE, + Language.RUSSIAN, + Language.SWEDISH, + Language.THAI, + Language.TURKISH, + Language.VIETNAMESE, + Language.CHINESE, # Simplified + # Language.CHINESE_HONG_KONG, + # Language.CHINESE_TRADITIONAL, + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/mlmm_arc_challenge.py b/src/lighteval/tasks/multilingual/tasks/mlmm_arc_challenge.py new file mode 100644 index 000000000..2a48c369b --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/mlmm_arc_challenge.py @@ -0,0 +1,110 @@ +""" +name: +Mlmm Arc Challenge + +dataset: +jon-tow/okapi_arc_challenge + +abstract: +ARC (AI2 Reasoning Challenge) is a dataset for question answering that requires +reasoning. It consists of multiple-choice science questions from 3rd to 9th +grade exams. The dataset is split into two parts: ARC-Easy and ARC-Challenge. +ARC-Easy contains questions that can be answered correctly by both humans and +simple baseline models. ARC-Challenge contains questions that are difficult for +both humans and current AI systems. Similar to MMLU, ARC tasks uses PMI +normalization by default but only for the challenge set. 
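+This multilingual version comes from the Okapi/MLMM evaluation suite, which
+provides translated ARC-Challenge sets for the 26 languages listed below.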
+ +languages: +arabic, bengali, catalan, chinese, croatian, danish, dutch, french, german, +hindi, hungarian, indonesian, italian, kannada, malayalam, marathi, nepali, +romanian, russian, serbian, slovak, spanish, tamil, telugu, ukrainian, +vietnamese + +tags: +multilingual, multiple-choice, reasoning + +paper: +https://github.com/nlp-uoregon/mlmm-evaluation +""" + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"mlmm_arc_{language.value}_{formulation.name.lower()}:challenge", + prompt_function=get_mcq_prompt_function( + language, + lambda line: { + "question": line["question"], + "choices": line["choices"]["text"], + "gold_idx": int(line["answerKey"]) - 1 + if line["answerKey"].isdigit() + else LETTER_INDICES.index(line["answerKey"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="jon-tow/okapi_arc_challenge", + hf_subset=standardize_tag(language.value), + hf_revision="823d5d7bfaf8974a3ab52a825b6cf4903b35dbc4", + evaluation_splits=("test",), + few_shots_split="train", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), + ], + ), + ) + for language in [ + Language.RUSSIAN, + Language.GERMAN, + Language.CHINESE, + Language.FRENCH, + Language.SPANISH, + Language.ITALIAN, + Language.DUTCH, + Language.VIETNAMESE, + Language.INDONESIAN, + Language.ARABIC, + Language.HUNGARIAN, + Language.ROMANIAN, + Language.DANISH, + Language.SLOVAK, + Language.UKRAINIAN, + Language.CATALAN, + Language.SERBIAN, + Language.CROATIAN, + Language.HINDI, + Language.BENGALI, + Language.TAMIL, + Language.NEPALI, + Language.MALAYALAM, + Language.MARATHI, + Language.TELUGU, + Language.KANNADA, + ] + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/mlmm_hellaswag.py b/src/lighteval/tasks/multilingual/tasks/mlmm_hellaswag.py new file mode 100644 index 000000000..a8933a101 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/mlmm_hellaswag.py @@ -0,0 +1,108 @@ +""" +name: +Mlmm Hellaswag + +dataset: +jon-tow/okapi_hellaswag + +abstract: +Hellaswag is a commonsense reasoning task that requires models to complete a +given scenario with the most plausible ending. It tests the model's ability to +understand and reason about everyday situations and human behavior. 
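+Contexts are drawn from ActivityNet video captions and WikiHow articles, and
+the incorrect endings are machine-generated and adversarially filtered so that
+they fool models while remaining easy for humans to reject.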
+MLMM-Hellaswag: Multilingual adaptation of Hellaswag + +languages: +arabic, armenian, basque, bengali, catalan, chinese, croatian, danish, dutch, +french, german, gujarati, hindi, hungarian, icelandic, indonesian, italian, +kannada, malayalam, marathi, nepali, norwegian, portuguese, romanian, russian, +serbian, slovak, spanish, swedish, tamil, telugu, ukrainian, vietnamese + +tags: +multilingual, multiple-choice, reasoning + +paper: +https://arxiv.org/abs/2306.07610 +""" + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"mlmm_hellaswag_{lang.value}_{formulation.name.lower()}", + suite=["lighteval"], + prompt_function=get_hellaswag_prompt_function( + language=lang, + adapter=lambda line: { + # We don't use activity_label as they are not available + "ctx_a": line["ctx_a"], + "ctx_b": line["ctx_b"], + "continuations": line["endings"], + "gold_idx": int(line["label"]), + }, + formulation=formulation, + ), + hf_repo="jon-tow/okapi_hellaswag", + hf_subset=standardize_tag(lang.value), + hf_revision="96ed8e0dfc6172dad1d3df338d7b8ba6c1ff9d83", + evaluation_splits=["validation"], + hf_avail_splits=["validation"], + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for lang in [ + Language.ARABIC, + Language.BENGALI, + Language.CATALAN, + Language.DANISH, + Language.GERMAN, + Language.SPANISH, + Language.BASQUE, + Language.FRENCH, + Language.GUJARATI, + Language.HINDI, + Language.CROATIAN, + Language.HUNGARIAN, + Language.ARMENIAN, + Language.INDONESIAN, + Language.ICELANDIC, + Language.ITALIAN, + Language.KANNADA, + Language.MALAYALAM, + Language.MARATHI, + Language.NORWEGIAN, + Language.NEPALI, + Language.DUTCH, + Language.PORTUGUESE, + Language.ROMANIAN, + Language.RUSSIAN, + Language.SLOVAK, + Language.SERBIAN, + Language.SWEDISH, + Language.TAMIL, + Language.TELUGU, + Language.UKRAINIAN, + Language.VIETNAMESE, + Language.CHINESE, + ] + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] diff --git a/src/lighteval/tasks/multilingual/tasks/mlmm_mmlu.py b/src/lighteval/tasks/multilingual/tasks/mlmm_mmlu.py new file mode 100644 index 000000000..031cdc767 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/mlmm_mmlu.py @@ -0,0 +1,167 @@ +""" +name: +Mlmm Mmlu + +dataset: +jon-tow/okapi_mmlu + +abstract: +MLMM MMLU: Another multilingual version of MMLU + +languages: +arabic, bengali, catalan, chinese, croatian, danish, dutch, french, german, +hindi, hungarian, indonesian, italian, kannada, malayalam, marathi, nepali, +romanian, russian, serbian, slovak, spanish, tamil, telugu, ukrainian, +vietnamese + +tags: +knowledge, multilingual, multiple-choice + +paper: +https://github.com/nlp-uoregon/mlmm-evaluation +""" + +from functools import partial + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + 
LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +MMLU_SUBSETS = [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions", +] + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"mlmm_mmlu_{language.value}_{formulation.name.lower()}:{subset}", + prompt_function=get_mcq_prompt_function( + language, + lambda line: { + "question": line["question"], + "choices": line["choices"], + "gold_idx": LETTER_INDICES.index(line["answer"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="jon-tow/okapi_mmlu", + hf_subset=standardize_tag(language.value), + hf_revision="refs/pr/1", + hf_filter=partial(lambda subset, line: line["id"].split("/")[0] == subset, subset), + evaluation_splits=("test",), + few_shots_split="dev", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), + ], + ), + ) + for subset in MMLU_SUBSETS + for language in [ + Language.RUSSIAN, + Language.GERMAN, + Language.CHINESE, + Language.FRENCH, + Language.SPANISH, + Language.ITALIAN, + Language.DUTCH, + Language.VIETNAMESE, + Language.INDONESIAN, + Language.ARABIC, + Language.HUNGARIAN, + Language.ROMANIAN, + Language.DANISH, + Language.SLOVAK, + Language.UKRAINIAN, + Language.CATALAN, + Language.SERBIAN, + Language.CROATIAN, + Language.HINDI, + Language.BENGALI, + Language.TAMIL, + Language.NEPALI, + Language.MALAYALAM, + Language.MARATHI, + Language.TELUGU, + Language.KANNADA, + ] + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/mlmm_truthfulqa.py 
b/src/lighteval/tasks/multilingual/tasks/mlmm_truthfulqa.py new file mode 100644 index 000000000..1851693fa --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/mlmm_truthfulqa.py @@ -0,0 +1,113 @@ +""" +name: +Mlmm Truthfulqa + +dataset: +jon-tow/okapi_truthfulqa + +abstract: +TruthfulQA: Measuring How Models Mimic Human Falsehoods + +languages: +arabic, armenian, basque, bengali, catalan, chinese, croatian, danish, dutch, +french, german, gujarati, hindi, hungarian, icelandic, indonesian, italian, +kannada, malayalam, marathi, nepali, norwegian, portuguese, romanian, russian, +serbian, slovak, spanish, swedish, tamil, telugu, ukrainian, vietnamese + +tags: +factuality, multilingual, qa + +paper: +https://arxiv.org/abs/2109.07958 +""" + +from functools import partial + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"mlmm_truthfulqa_{language.value}_{formulation.name.lower()}:{subset}", + prompt_function=get_mcq_prompt_function( + language, + partial( + lambda subset, line: { + "question": line["question"], + "choices": line[f"{subset}_targets"]["choices"], + "gold_idx": [ix for ix, label in enumerate(line[f"{subset}_targets"]["labels"]) if label == 1], # type: ignore + }, + subset, + ), + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="jon-tow/okapi_truthfulqa", + hf_subset=standardize_tag(language.value), + hf_revision="cdd5db1a66fd04105622109d1c2a5cbc8cde7586", + evaluation_splits=("validation",), + hf_avail_splits=["validation"], + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for subset in ["mc1", "mc2"] + for language in [ + Language.ARABIC, + Language.BENGALI, + Language.CATALAN, + Language.DANISH, + Language.GERMAN, + Language.SPANISH, + Language.BASQUE, + Language.FRENCH, + Language.GUJARATI, + Language.HINDI, + Language.CROATIAN, + Language.HUNGARIAN, + Language.ARMENIAN, + Language.INDONESIAN, + Language.ICELANDIC, + Language.ITALIAN, + Language.KANNADA, + Language.MALAYALAM, + Language.MARATHI, + Language.NORWEGIAN, + Language.NEPALI, + Language.DUTCH, + Language.PORTUGUESE, + Language.ROMANIAN, + Language.RUSSIAN, + Language.SLOVAK, + Language.SERBIAN, + Language.SWEDISH, + Language.TAMIL, + Language.TELUGU, + Language.UKRAINIAN, + Language.VIETNAMESE, + Language.CHINESE, + ] + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/mlqa.py b/src/lighteval/tasks/multilingual/tasks/mlqa.py new file mode 100644 index 000000000..70515b678 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/mlqa.py @@ -0,0 +1,68 @@ +""" +name: +Mlqa + +dataset: +facebook/mlqa + +abstract: +MLQA (MultiLingual Question Answering) is a benchmark dataset for evaluating +cross-lingual question answering performance. 
It consists of QA instances in 7 +languages: English, Arabic, German, Spanish, Hindi, Vietnamese, and Chinese. The +dataset is derived from the SQuAD v1.1 dataset, with questions and contexts +translated by professional translators. + +languages: +arabic, chinese, german, hindi, spanish, vietnamese + +tags: +multilingual, qa + +paper: +https://arxiv.org/abs/1910.07475 +""" + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"mlqa_{lang.value}", + prompt_function=get_qa_prompt_function( + lang, + lambda line: { + "context": line["context"], + "question": line["question"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="facebook/mlqa", + hf_subset=f"mlqa.{standardize_tag(lang.value)}.{standardize_tag(lang.value)}", + hf_revision="397ed406c1a7902140303e7faf60fff35b58d285", + evaluation_splits=("test",), + hf_avail_splits=["test"], + generation_size=400, + stop_sequence=("\n",), + metrics=[ + MultilingualQuasiExactMatchMetric(lang, "prefix"), + MultilingualQuasiF1ScoreMetric(lang), + ], + ) + for lang in [ + Language.ARABIC, + Language.GERMAN, + Language.SPANISH, + Language.CHINESE, + Language.HINDI, + Language.VIETNAMESE, + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/oab_exams.py b/src/lighteval/tasks/multilingual/tasks/oab_exams.py new file mode 100644 index 000000000..88302cf53 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/oab_exams.py @@ -0,0 +1,68 @@ +""" +name: +Oab Exams + +dataset: +eduagarcia/oab_exams + +abstract: +OAB Exams: A collection of questions from the Brazilian Bar Association exam The +exam is required for anyone who wants to practice law in Brazil + +languages: +portuguese + +tags: +knowledge, multilingual, multiple-choice + +paper: +https://huggingface.co/datasets/eduagarcia/oab_exams +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"oab_exams_{Language.PORTUGUESE.value}_{formulation.name.lower()}", + prompt_function=get_mcq_prompt_function( + Language.PORTUGUESE, + lambda line: { + "question": line["question"], + "choices": line["choices"]["text"], + "gold_idx": LETTER_INDICES.index(line["answerKey"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="eduagarcia/oab_exams", + hf_subset="default", + evaluation_splits=("train",), + hf_avail_splits=["train"], + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [ + MCFFormulation(), + CFFormulation(), + 
HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/ocnli.py b/src/lighteval/tasks/multilingual/tasks/ocnli.py new file mode 100644 index 000000000..48a7278b1 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/ocnli.py @@ -0,0 +1,67 @@ +""" +name: +Ocnli + +dataset: +clue/clue + +abstract: +Native Chinese NLI dataset based. + +languages: +chinese + +tags: +classification, multilingual, nli + +paper: +https://arxiv.org/pdf/2010.05444 +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.nli import get_nli_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"ocnli_{Language.CHINESE.value}_{formulation.name.lower()}", + prompt_function=get_nli_prompt_function( + language=Language.CHINESE, + adapter=lambda line: { + "premise": line["sentence1"], + "hypothesis": line["sentence2"], + # Since we ignore the neutral label + "gold_idx": {1: 0, 2: 1}[line["label"]], + }, + relations=["entailment", "contradiction"], + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="clue/clue", + hf_subset="ocnli", + # Only keep the positive and negative examples + hf_filter=lambda x: int(x["label"]) in [1, 2], + evaluation_splits=("validation",), + few_shots_split="train", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] diff --git a/src/lighteval/tasks/multilingual/tasks/openai_mmlu.py b/src/lighteval/tasks/multilingual/tasks/openai_mmlu.py new file mode 100644 index 000000000..4a4df728a --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/openai_mmlu.py @@ -0,0 +1,150 @@ +""" +name: +Openai Mmlu + +dataset: +openai/MMMLU + +abstract: +Openai Mmlu multilingual benchmark. 
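+MMMLU is OpenAI's multilingual MMLU: the English MMLU test set translated by
+professional human translators into the 14 languages listed below.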
+ +languages: +arabic, bengali, chinese, french, german, hindi, indonesian, italian, japanese, +korean, portuguese, spanish, swahili, yoruba + +tags: +knowledge, multilingual, multiple-choice + +paper: +""" + +from functools import partial + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +MMLU_SUBSETS = [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions", +] + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"openai_mmlu_{language[0].value}_{formulation.name.lower()}:{subset}", + prompt_function=get_mcq_prompt_function( + language[0], + lambda line: { + "question": line["Question"], + "choices": [line["A"], line["B"], line["C"], line["D"]], + "gold_idx": LETTER_INDICES.index(line["Answer"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="openai/MMMLU", + hf_subset=language[1], + evaluation_splits=("test",), + hf_avail_splits=["test"], + hf_filter=partial(lambda subset, x: x["Subject"].lower() == subset, subset), + hf_revision="038c7808122969ead7456361af05cb8f47d247f8", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), + ], + ), + ) + for subset in MMLU_SUBSETS + for language in [ + (Language.ARABIC, "AR_XY"), + (Language.BENGALI, "BN_BD"), + (Language.GERMAN, "DE_DE"), + (Language.SPANISH, "ES_LA"), + (Language.FRENCH, "FR_FR"), + (Language.HINDI, "HI_IN"), + (Language.INDONESIAN, "ID_ID"), + (Language.ITALIAN, "IT_IT"), + (Language.JAPANESE, "JA_JP"), + (Language.KOREAN, "KO_KR"), + (Language.PORTUGUESE, "PT_BR"), + (Language.SWAHILI, "SW_KE"), + 
(Language.YORUBA, "YO_NG"), + (Language.CHINESE, "ZH_CN"), + ] + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/openbook_ara.py b/src/lighteval/tasks/multilingual/tasks/openbook_ara.py new file mode 100644 index 000000000..db5b3a426 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/openbook_ara.py @@ -0,0 +1,67 @@ +""" +name: +Openbook Ara + +dataset: +OALL/AlGhafa-Arabic-LLM-Benchmark-Translated + +abstract: +OpenBookQA: A Question-Answering Dataset for Open-Book Exams OpenBookQA is a +question-answering dataset modeled after open-book exams for assessing human +understanding of a subject. It consists of multiple-choice questions that +require combining facts from a given open book with broad common knowledge. The +task tests language models' ability to leverage provided information and apply +common sense reasoning. + +languages: +arabic + +tags: +multilingual, multiple-choice, reasoning + +paper: +https://arxiv.org/abs/1809.02789 +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.adapters import ( + alghafa_adapter, +) +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"alghafa_openbookqa_{Language.ARABIC.value}_{formulation.name.lower()}", + prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation), + suite=["lighteval"], + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", + hf_subset="openbook_qa_ext_ar", + hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff", + evaluation_splits=["test"], + few_shots_split="validation", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/openbook_es.py b/src/lighteval/tasks/multilingual/tasks/openbook_es.py new file mode 100644 index 000000000..c428275fe --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/openbook_es.py @@ -0,0 +1,67 @@ +""" +name: +Openbook Es + +dataset: +BSC-LT/openbookqa-es + +abstract: +Spanish version of OpenBookQA from BSC Language Technology group + +languages: +spanish + +tags: +multilingual, multiple-choice, reasoning + +paper: +https://huggingface.co/datasets/BSC-LT/openbookqa-es +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from 
lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"openbookqa_{Language.SPANISH.value}_{formulation.name.lower()}", + prompt_function=get_mcq_prompt_function( + Language.SPANISH, + lambda line: { + "question": line["question_stem"], + "choices": line["choices"]["text"], + "gold_idx": LETTER_INDICES.index(line["answerKey"]), + }, + formulation=formulation, + ), + suite=["lighteval"], + hf_repo="BSC-LT/openbookqa-es", + hf_subset="default", + evaluation_splits=("test",), + few_shots_split="validation", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/openbook_rus.py b/src/lighteval/tasks/multilingual/tasks/openbook_rus.py new file mode 100644 index 000000000..498d32eed --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/openbook_rus.py @@ -0,0 +1,68 @@ +""" +name: +Openbook Rus + +dataset: +ai-forever/MERA + +abstract: +The Russian version is part of the MERA (Multilingual Enhanced Russian NLP +Architectures) project. + +languages: +russian + +tags: +multilingual, multiple-choice, reasoning + +paper: +https://arxiv.org/abs/2401.04531 +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"mera_openbookqa_{Language.RUSSIAN.value}_{formulation.name.lower()}", + prompt_function=get_mcq_prompt_function( + Language.RUSSIAN, + lambda line: { + "question": line["inputs"]["question"], + "choices": [line["inputs"][f"option_{i.lower()}"] for i in LETTER_INDICES[:4]], + "gold_idx": LETTER_INDICES.index(line["outputs"]), + }, + formulation=formulation, + ), + suite=["lighteval"], + hf_repo="ai-forever/MERA", + hf_subset="ruopenbookqa", + evaluation_splits=("train",), + hf_avail_splits=["train"], + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/oz.py b/src/lighteval/tasks/multilingual/tasks/oz.py new file mode 100644 index 000000000..dde7552a1 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/oz.py @@ -0,0 +1,77 @@ +""" +name: +OZ Serbian Evals + +dataset: +DjMel/oz-eval + +abstract: +OZ Eval (sr. Opšte Znanje Evaluacija) dataset was created for the purposes of +evaluating General Knowledge of LLM models in Serbian language. Data consists +of 1k+ high-quality questions and answers which were used as part of entry exams +at the Faculty of Philosophy and Faculty of Organizational Sciences, University +of Belgrade. 
The exams test the General Knowledge of students and were used in +the enrollment periods from 2003 to 2024. + +languages: +serbian + +tags: +knowledge, multiple-choice + +paper: +""" + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc + + +def prompt_fn_oz_eval_task(line, task_name: str = None): + query_template = """Pitanje: {question}\n + Ponuđeni odgovori: + A. {choice_a} + B. {choice_b} + C. {choice_c} + D. {choice_d} + E. {choice_e} + + Krajnji odgovor:""" + + options = line["options"] + + query = query_template.format( + question=line["questions"], + choice_a=options[0], + choice_b=options[1], + choice_c=options[2], + choice_d=options[3], + choice_e=options[4], + ) + + choices = ["A", "B", "C", "D", "E"] + return Doc( + task_name=task_name, + query=query, + choices=choices, + gold_index=choices.index(line["answer"]), + ) + + +oz_eval_task = LightevalTaskConfig( + name="serbian_evals:oz_task", + prompt_function=prompt_fn_oz_eval_task, + suite=["community"], + hf_repo="DjMel/oz-eval", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + metrics=[Metrics.loglikelihood_acc], + version=0, +) + + +# STORE YOUR EVALS +TASKS_TABLE = [oz_eval_task] diff --git a/src/lighteval/tasks/multilingual/tasks/parus.py b/src/lighteval/tasks/multilingual/tasks/parus.py new file mode 100644 index 000000000..6ff91448b --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/parus.py @@ -0,0 +1,65 @@ +""" +name: +Parus + +dataset: +ai-forever/MERA + +abstract: +PARus: Plausible Alternatives for Russian PARus is the Russian adaptation of the +COPA task, part of the Russian SuperGLUE benchmark. It evaluates common sense +reasoning and causal inference abilities in Russian language models. 
+ +languages: +russian + +tags: +multilingual + +paper: +https://russiansuperglue.com/tasks/task_info/PARus +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.copa import get_copa_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"parus_{Language.RUSSIAN.value}_{formulation.name.lower()}", + suite=["lighteval"], + prompt_function=get_copa_prompt_function( + language=Language.RUSSIAN, + adapter=lambda line: { + "context": line["inputs"]["premise"], + "cause_effect": line["meta"]["task"], + "continuations": [line["inputs"]["choice1"], line["inputs"]["choice2"]], + "gold_idx": int(line["outputs"]) - 1, + }, + formulation=formulation, + ), + hf_repo="ai-forever/MERA", + hf_subset="parus", + evaluation_splits=["train"], + few_shots_split="validation", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] diff --git a/src/lighteval/tasks/multilingual/tasks/paws_x.py b/src/lighteval/tasks/multilingual/tasks/paws_x.py new file mode 100644 index 000000000..e294cc15c --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/paws_x.py @@ -0,0 +1,79 @@ +""" +name: +Paws X + +dataset: +google-research-datasets/paws-x + +abstract: +PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification This +dataset contains paraphrase identification pairs in multiple languages. 
It's +derived from PAWS (Paraphrase Adversaries from Word Scrambling) and We treat +paraphrase as entailment and non-paraphrase as contradiction + +languages: +chinese, english, french, german, japanese, korean, spanish + +tags: +classification, multilingual, nli + +paper: +https://arxiv.org/abs/1908.11828 +""" + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.nli import get_nli_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"pawsx_{language.value}_{formulation.name.lower()}", + suite=("lighteval",), + prompt_function=get_nli_prompt_function( + language=language, + adapter=lambda line: { + "premise": line["sentence1"], + "hypothesis": line["sentence2"], + # Since we ignore the neutral label + "gold_idx": int(line["label"]), + }, + relations=["entailment", "contradiction"], + formulation=formulation, + ), + hf_repo="google-research-datasets/paws-x", + hf_subset=standardize_tag(language.value), + evaluation_splits=("test",), + few_shots_split="train", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for language in [ + Language.GERMAN, + Language.ENGLISH, + Language.SPANISH, + Language.FRENCH, + Language.JAPANESE, + Language.KOREAN, + Language.CHINESE, + ] + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] diff --git a/src/lighteval/tasks/multilingual/tasks/piqa_ar.py b/src/lighteval/tasks/multilingual/tasks/piqa_ar.py new file mode 100644 index 000000000..e3f7b2f40 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/piqa_ar.py @@ -0,0 +1,66 @@ +""" +name: +Piqa Ar + +dataset: +OALL/AlGhafa-Arabic-LLM-Benchmark-Translated + +abstract: +PIQA: Physical Interaction Question Answering PIQA is a benchmark for testing +physical commonsense reasoning. This Arabic version is a translation of the +original PIQA dataset, adapted for Arabic language evaluation. It tests the +ability to reason about physical interactions in everyday situations. 
+ +languages: +arabic + +tags: +multilingual, multiple-choice, qa, reasoning + +paper: +https://arxiv.org/abs/1911.11641 +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.adapters import ( + alghafa_adapter, +) +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"alghafa_piqa_{Language.ARABIC.value}_{formulation.name.lower()}", + prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation), + suite=["lighteval"], + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", + hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff", + hf_subset="piqa_ar", + hf_avail_splits=["test", "validation"], + evaluation_splits=["test"], + few_shots_split="validation", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/rcb.py b/src/lighteval/tasks/multilingual/tasks/rcb.py new file mode 100644 index 000000000..7091126a5 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/rcb.py @@ -0,0 +1,68 @@ +""" +name: +Rcb + +dataset: +ai-forever/MERA + +abstract: +Russian Commitment Bank (RCB) is a large-scale NLI dataset with Russian +sentences, collected from the web and crowdsourcing. 
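+Like the English CommitmentBank, examples carry three labels (entailment,
+contradiction, neutral); the config below drops the neutral class and scores
+binary entailment vs. contradiction.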
+ +languages: +russian + +tags: +classification, multilingual, nli + +paper: +https://arxiv.org/abs/2401.04531 +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.nli import get_nli_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"rcb_{Language.RUSSIAN.value}_{formulation.name.lower()}", + prompt_function=get_nli_prompt_function( + language=Language.RUSSIAN, + adapter=lambda line: { + "premise": line["inputs"]["premise"], + "hypothesis": line["inputs"]["hypothesis"], + # Since we ignore the neutral label + "gold_idx": int(line["outputs"]) - 1, + }, + relations=["entailment", "contradiction"], + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="ai-forever/MERA", + hf_subset="rcb", + # Ignore neutral label + hf_filter=lambda x: int(x["outputs"] or "0") in [1, 2], + evaluation_splits=("train",), + few_shots_split="validation", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] diff --git a/src/lighteval/tasks/multilingual/tasks/sber_squad.py b/src/lighteval/tasks/multilingual/tasks/sber_squad.py new file mode 100644 index 000000000..51abc0609 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/sber_squad.py @@ -0,0 +1,53 @@ +""" +name: +Sber Squad + +dataset: +kuznetsoffandrey/sberquad + +abstract: +SberQuAD: A large-scale Russian reading comprehension dataset. 
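+It follows the SQuAD format, with extractive answer spans over paragraphs from
+Russian Wikipedia, and contains on the order of 50k questions.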
+ +languages: +russian + +tags: +multilingual, qa + +paper: +https://arxiv.org/abs/1912.09723 +""" + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"sber_squad_{Language.RUSSIAN.value}", + prompt_function=get_qa_prompt_function( + Language.RUSSIAN, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="kuznetsoffandrey/sberquad", + hf_subset="sberquad", + evaluation_splits=("validation",), + few_shots_split="train", + metrics=( + MultilingualQuasiExactMatchMetric(Language.RUSSIAN, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.RUSSIAN), + ), + generation_size=400, + stop_sequence=("\n",), + ) +] diff --git a/community_tasks/serbian_eval.py b/src/lighteval/tasks/multilingual/tasks/serbian_eval.py similarity index 95% rename from community_tasks/serbian_eval.py rename to src/lighteval/tasks/multilingual/tasks/serbian_eval.py index c235c7e47..e2df1f57a 100644 --- a/community_tasks/serbian_eval.py +++ b/src/lighteval/tasks/multilingual/tasks/serbian_eval.py @@ -1,34 +1,22 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. +""" +name: +Serbian Evals +dataset: +datatab/serbian-llm-benchmark -""" -This module contains task configurations and prompt functions for evaluating -LLM models on Serbian datasets. -Each task is defined using the `LightevalTaskConfig` class with its respective -prompt function. +abstract: The tasks cover a variety of benchmarks, including: standard task like ARC[E][C], BoolQ, Hellaswag, OpenBookQA,PIQA, Winogrande and a custom OZ Eval. MMLU is separated by subject and also all in one. + +languages: +serbian + +tags: +knowledge, multiple-choice + +paper: """ from enum import Enum diff --git a/src/lighteval/tasks/multilingual/tasks/soqal.py b/src/lighteval/tasks/multilingual/tasks/soqal.py new file mode 100644 index 000000000..ad41456c9 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/soqal.py @@ -0,0 +1,61 @@ +""" +name: +Soqal + +dataset: +OALL/AlGhafa-Arabic-LLM-Benchmark-Native + +abstract: +SOQAL: A large-scale Arabic reading comprehension dataset. 
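+SOQAL itself is an open-domain Arabic QA system built on the Arabic Reading
+Comprehension Dataset (ARCD); the AlGhafa subset used here recasts the data as
+grounded multiple-choice statements, as the subset name suggests.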
+ +languages: +arabic + +tags: +multilingual, qa + +paper: +https://arxiv.org/abs/1906.05394 +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.adapters import ( + alghafa_adapter, +) +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"soqal_{Language.ARABIC.value}_{formulation.name.lower()}", + hf_subset="multiple_choice_grounded_statement_soqal_task", + prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation), + evaluation_splits=["test"], + few_shots_split="validation", + suite=["lighteval"], + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/squad_es.py b/src/lighteval/tasks/multilingual/tasks/squad_es.py new file mode 100644 index 000000000..4022a8420 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/squad_es.py @@ -0,0 +1,54 @@ +""" +name: +Squad Es + +dataset: +ccasimiro/squad_es + +abstract: +SQuAD-es: Spanish translation of the Stanford Question Answering Dataset + +languages: +spanish + +tags: +multilingual, qa + +paper: +https://huggingface.co/datasets/ccasimiro/squad_es +""" + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"squad_{Language.SPANISH.value}", + prompt_function=get_qa_prompt_function( + Language.SPANISH, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="ccasimiro/squad_es", + hf_subset="v2.0.0", + hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), + evaluation_splits=("validation",), + few_shots_split="train", + metrics=( + MultilingualQuasiExactMatchMetric(Language.SPANISH, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.SPANISH), + ), + generation_size=400, + stop_sequence=("\n",), + ) +] diff --git a/src/lighteval/tasks/multilingual/tasks/squad_it.py b/src/lighteval/tasks/multilingual/tasks/squad_it.py new file mode 100644 index 000000000..d894e19be --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/squad_it.py @@ -0,0 +1,54 @@ +""" +name: +Squad It + +dataset: +crux82/squad_it + +abstract: +SQuAD-it: Italian translation of the SQuAD dataset. 
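+It was derived by semi-automatic translation of SQuAD into Italian and
+contains more than 60,000 question/answer pairs.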
+ +languages: +italian + +tags: +multilingual, qa + +paper: +https://github.com/crux82/squad-it +""" + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"squad_{Language.ITALIAN.value}", + prompt_function=get_qa_prompt_function( + Language.ITALIAN, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="crux82/squad_it", + hf_subset="default", + hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), + evaluation_splits=("test",), + few_shots_split="train", + generation_size=400, + stop_sequence=("\n",), + metrics=( + MultilingualQuasiExactMatchMetric(Language.ITALIAN, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.ITALIAN), + ), + ) +] diff --git a/src/lighteval/tasks/multilingual/tasks/swahili_arc.py b/src/lighteval/tasks/multilingual/tasks/swahili_arc.py new file mode 100644 index 000000000..c40efa573 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/swahili_arc.py @@ -0,0 +1,72 @@ +""" +name: +Swahili Arc + +dataset: + +abstract: +Swahili Arc multilingual benchmark. + +languages: +swahili + +tags: +multilingual, multiple-choice, reasoning + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"community_arc_{Language.SWAHILI.value}_{formulation.name.lower()}:{subset}", + prompt_function=get_mcq_prompt_function( + Language.SWAHILI, + lambda line: { + "question": line["question"], + "choices": line["choices"]["text"], + "gold_idx": int(line["answerKey"]) - 1 + if line["answerKey"].isdigit() + else LETTER_INDICES.index(line["answerKey"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo=f"Mollel/ARC_{subset.capitalize()}_SWH", + hf_subset="default", + hf_revision="5347439d3193c8a0dabaab3819914bf076dc94d4" + if subset == "easy" + else "dc1df9df632d14c251594d9129fb833d2ca4429c", + evaluation_splits=("test",), + few_shots_split="train", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ] + + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore + ), + ) + for subset in ["easy", "challenge"] + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/thai_exams.py b/src/lighteval/tasks/multilingual/tasks/thai_exams.py new file mode 100644 index 000000000..73f8140f7 --- /dev/null +++ 
b/src/lighteval/tasks/multilingual/tasks/thai_exams.py @@ -0,0 +1,64 @@ +""" +name: +Thai Exams + +dataset: +scb10x/thai_exam + +abstract: +Thai Exams multilingual benchmark. + +languages: +thai + +tags: +knowledge, multilingual, multiple-choice + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.adapters import ( + thai_exams_adapter, +) +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +THAI_EXAMS_SUBSETS = ["a_level", "ic", "onet", "tgat", "tpat1"] + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"thai_exams_{Language.THAI.value}_{formulation.name.lower()}:{subset}", + prompt_function=get_mcq_prompt_function(Language.THAI, thai_exams_adapter, formulation=formulation), + suite=("lighteval",), + hf_repo="scb10x/thai_exam", + hf_subset=subset, + evaluation_splits=("test",), + few_shots_split="train", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for subset in THAI_EXAMS_SUBSETS + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/thaiqa.py b/src/lighteval/tasks/multilingual/tasks/thaiqa.py new file mode 100644 index 000000000..bf2b5c279 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/thaiqa.py @@ -0,0 +1,52 @@ +""" +name: +Thaiqa + +dataset: +lighteval/thaiqa_squad_fixed + +abstract: +ThaiQA: A question answering dataset for the Thai language. + +languages: +thai + +tags: +multilingual, qa + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"thaiqa_{Language.THAI.value}", + prompt_function=get_qa_prompt_function( + Language.THAI, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["answer"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="lighteval/thaiqa_squad_fixed", + hf_subset="default", + evaluation_splits=("train",), + few_shots_split="validation", + generation_size=400, + stop_sequence=("\n",), + metrics=( + MultilingualQuasiExactMatchMetric(Language.THAI, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.THAI), + ), + ) +] diff --git a/src/lighteval/tasks/multilingual/tasks/tquad_v2.py b/src/lighteval/tasks/multilingual/tasks/tquad_v2.py new file mode 100644 index 000000000..e337ff538 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/tquad_v2.py @@ -0,0 +1,52 @@ +""" +name: +Tquad V2 + +dataset: +erdometo/tquad2 + +abstract: +TQuAD v2: Turkish Question Answering Dataset version 2. 
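+Like the other SQuAD-style tasks in this suite, it provides extractive answers
+over a context paragraph and is scored with prefix quasi-exact match and
+multilingual F1.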
+ +languages: +turkish + +tags: +multilingual, qa + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"tquadv2_{Language.TURKISH.value}", + prompt_function=get_qa_prompt_function( + Language.TURKISH, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [a["text"] for a in line["answers"]], + }, + ), + suite=("lighteval",), + hf_repo="erdometo/tquad2", + hf_subset="default", + evaluation_splits=("validation",), + few_shots_split="train", + generation_size=400, + stop_sequence=("\n",), + metrics=( + MultilingualQuasiExactMatchMetric(Language.TURKISH, "prefix"), + MultilingualQuasiF1ScoreMetric(Language.TURKISH), + ), + ) +] diff --git a/community_tasks/turkic_evals.py b/src/lighteval/tasks/multilingual/tasks/turkic.py similarity index 64% rename from community_tasks/turkic_evals.py rename to src/lighteval/tasks/multilingual/tasks/turkic.py index 242b25f81..074fc9b4a 100644 --- a/community_tasks/turkic_evals.py +++ b/src/lighteval/tasks/multilingual/tasks/turkic.py @@ -1,40 +1,22 @@ -# MIT License +""" +name: +Turkic Evals -# Copyright (c) 2024 The HuggingFace Team +dataset: +jafarisbarov/TUMLU-mini -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: +abstract: +TUMLU-mini is a benchmark for Turkic language understanding, comprising 1,000 +prompts organized into 10 subsets. -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. +languages: +turkic -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
+tags: +knowledge, multiple-choice -# ruff: noqa: F405, F403, F401 -""" -Task to evaluate LLMs on TUMLU-mini benchmark: https://huggingface.co/datasets/jafarisbarov/TUMLU-mini - -For more details, see the associated paper: - -@misc{isbarov2025tumluunifiednativelanguage, - title={{TUMLU: A Unified and Native Language Understanding Benchmark for Turkic Languages}}, - author={Jafar Isbarov and Arofat Akhundjanova and Mammad Hajili and Kavsar Huseynova and Dmitry Gaynullin and Anar Rzayev and Osman Tursun and Ilshat Saetov and Rinat Kharisov and Saule Belginova and Ariana Kenbayeva and Amina Alisheva and Aizirek Turdubaeva and Abdullatif Köksal and Samir Rustamov and Duygu Ataman}, - year={2025}, - eprint={2502.11020}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2502.11020}, -} +paper: +https://arxiv.org/abs/2502.11020 """ from functools import partial diff --git a/src/lighteval/tasks/multilingual/tasks/turkish_arc.py b/src/lighteval/tasks/multilingual/tasks/turkish_arc.py new file mode 100644 index 000000000..9174851e6 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/turkish_arc.py @@ -0,0 +1,70 @@ +""" +name: +Turkish Arc + +dataset: +malhajar/arc-tr + +abstract: +Turkish ARC Comes from the Turkish leaderboard + +languages: +turkish + +tags: +multilingual, multiple-choice, reasoning + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"community_arc_{Language.TURKISH.value}_{formulation.name.lower()}:{subset}", + prompt_function=get_mcq_prompt_function( + Language.TURKISH, + lambda line: { + "question": line["question"], + "choices": line["choices"]["text"], + "gold_idx": int(line["answerKey"]) - 1 + if line["answerKey"].isdigit() + else LETTER_INDICES.index(line["answerKey"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="malhajar/arc-tr", + hf_subset=f"ARC-{subset.capitalize()}", + evaluation_splits=("test",), + hf_avail_splits=["train"], + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ] + + ([LogLikelihoodAccMetric(normalization=LogProbPMINorm())] if subset == "challenge" else []), # type: ignore + ), + ) + for subset in ["easy", "challenge"] + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/turkish_mmlu.py b/src/lighteval/tasks/multilingual/tasks/turkish_mmlu.py new file mode 100644 index 000000000..cc0605456 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/turkish_mmlu.py @@ -0,0 +1,81 @@ +""" +name: +Turkish Mmlu + +dataset: +AYueksel/TurkishMMLU + +abstract: +Turkish Mmlu multilingual benchmark. 
+ +languages: +turkish + +tags: +knowledge, multilingual, multiple-choice + +paper: +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation, normalize_subset +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TURKISH_MMLU_SUBSET = [ + "Biology", + "Chemistry", + "Geography", + "History", + "Mathematics", + "Philosophy", + "Physics", + "Religion_and_Ethics", + "Turkish_Language_and_Literature", +] + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"mmlu_{Language.TURKISH.value}_{formulation.name.lower()}:{normalize_subset(subset)}", + prompt_function=get_mcq_prompt_function( + Language.TURKISH, + lambda line: { + "question": line["question"], + "choices": line["choices"], + "gold_idx": LETTER_INDICES.index(line["answer"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="AYueksel/TurkishMMLU", + hf_subset=subset, + evaluation_splits=("test",), + few_shots_split="dev", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), + ], + ), + ) + for subset in TURKISH_MMLU_SUBSET + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/tydiqa.py b/src/lighteval/tasks/multilingual/tasks/tydiqa.py new file mode 100644 index 000000000..b7a62e2dd --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/tydiqa.py @@ -0,0 +1,66 @@ +""" +name: +Tydiqa + +dataset: +google-research-datasets/tydiqa + +abstract: +Other QA tasks for RC TyDi QA: A benchmark for information-seeking question answering in typologically diverse languages. 
https://arxiv.org/abs/2003.05002 + +languages: +arabic, bengali, english, finnish, indonesian, japanese, korean, russian, swahili, telugu, thai + +tags: +multilingual, qa + +paper: +https://arxiv.org/abs/2003.05002 +""" + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"tydiqa_{language.value}", + prompt_function=get_qa_prompt_function( + language, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="google-research-datasets/tydiqa", + hf_subset="secondary_task", + evaluation_splits=("validation",), + few_shots_split="train", + generation_size=400, + stop_sequence=("\n",), + metrics=( + MultilingualQuasiExactMatchMetric(language, "prefix"), + MultilingualQuasiF1ScoreMetric(language), + ), + ) + for language in [ + Language.ENGLISH, + Language.ARABIC, + Language.BENGALI, + Language.FINNISH, + Language.INDONESIAN, + Language.JAPANESE, + Language.KOREAN, + Language.SWAHILI, + Language.RUSSIAN, + Language.TELUGU, + Language.THAI, + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/worldtree_rus.py b/src/lighteval/tasks/multilingual/tasks/worldtree_rus.py new file mode 100644 index 000000000..814c80b49 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/worldtree_rus.py @@ -0,0 +1,70 @@ +""" +name: +Worldtree Rus + +dataset: +ai-forever/MERA + +abstract: +WorldTree is a dataset for multi-hop inference in science question answering. It +provides explanations for elementary science questions by combining facts from a +semi-structured knowledge base. This Russian version is part of the MERA +(Multilingual Evaluation of Reasoning Abilities) benchmark. 
+ +languages: +russian + +tags: +multilingual + +paper: +https://github.com/ai-forever/MERA +""" + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.default_prompts import LETTER_INDICES +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"mera_worldtree_{Language.RUSSIAN.value}_{formulation.name.lower()}", + prompt_function=get_mcq_prompt_function( + Language.RUSSIAN, + lambda line: { + "question": line["inputs"]["question"], + "choices": [line["inputs"][f"option_{i.lower()}"] for i in LETTER_INDICES[:4]], + "gold_idx": LETTER_INDICES.index(line["outputs"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="ai-forever/MERA", + hf_subset="ruworldtree", + evaluation_splits=("train",), + hf_avail_splits=["train"], + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/xcodah.py b/src/lighteval/tasks/multilingual/tasks/xcodah.py new file mode 100644 index 000000000..5b6783eaf --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/xcodah.py @@ -0,0 +1,83 @@ +""" +name: +Xcodah + +dataset: +INK-USC/xcsr + +abstract: +Xcodah multilingual benchmark. 
+ +languages: +arabic, chinese, dutch, english, french, german, hindi, italian, japanese, +polish, portuguese, russian, spanish, swahili, urdu, vietnamese + +tags: +multilingual, multiple-choice, reasoning + +paper: +""" + +from functools import partial + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.adapters import ( + xcodah_adapter, +) +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"xcodah_{language.value}_{formulation.name.lower()}", + prompt_function=get_mcq_prompt_function(language, partial(xcodah_adapter, language), formulation=formulation), + suite=("lighteval",), + hf_repo="INK-USC/xcsr", + hf_subset=f"X-CODAH-{standardize_tag(language.value) if language != Language.JAPANESE else 'jap'}", + evaluation_splits=("validation",), + hf_avail_splits=["validation"], + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for language in [ + Language.ARABIC, + Language.GERMAN, + Language.ENGLISH, + Language.SPANISH, + Language.FRENCH, + Language.HINDI, + Language.ITALIAN, + Language.JAPANESE, + Language.DUTCH, + Language.POLISH, + Language.PORTUGUESE, + Language.RUSSIAN, + Language.SWAHILI, + Language.URDU, + Language.VIETNAMESE, + Language.CHINESE, + ] + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/xcopa.py b/src/lighteval/tasks/multilingual/tasks/xcopa.py new file mode 100644 index 000000000..aafb34c77 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/xcopa.py @@ -0,0 +1,82 @@ +""" +name: +Xcopa + +dataset: + +abstract: +COPA (Choice of Plausible Alternatives) tasks involve determining the most +plausible cause or effect for a given premise. These tasks test common sense +reasoning and causal inference abilities. XCOPA: Cross-lingual Choice of +Plausible Alternatives. 
+ +languages: +arabic, chinese, estonian, haitian, indonesian, italian, quechua, swahili, +tamil, thai, turkish, vietnamese + +tags: +multilingual, multiple-choice, narrative, reasoning + +paper: +https://aclanthology.org/2020.emnlp-main.185/ +""" + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.copa import get_copa_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"xcopa_{language.value}_{formulation.name.lower()}", + suite=["lighteval"], + prompt_function=get_copa_prompt_function( + language, + adapter=lambda line: { + "context": line["premise"], + "cause_effect": line["question"], + "continuations": [line["choice1"], line["choice2"]], + "gold_idx": int(line["label"]), + }, + formulation=formulation, + ), + hf_repo=("OALL/AlGhafa-Arabic-LLM-Benchmark-Translated" if language == Language.ARABIC else "xcopa"), + hf_subset=("copa_ext_ar" if language == Language.ARABIC else standardize_tag(language.value)), + evaluation_splits=["test"], + few_shots_split="validation", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for language in [ + Language.ARABIC, + Language.ESTONIAN, + Language.INDONESIAN, + Language.ITALIAN, + Language.SWAHILI, + Language.TAMIL, + Language.THAI, + Language.TURKISH, + Language.VIETNAMESE, + Language.CHINESE, + Language.HAITIAN, + Language.QUECHUA, + ] + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] diff --git a/src/lighteval/tasks/multilingual/tasks/xcsqa.py b/src/lighteval/tasks/multilingual/tasks/xcsqa.py new file mode 100644 index 000000000..ef12349f6 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/xcsqa.py @@ -0,0 +1,95 @@ +""" +name: +Xcsqa + +dataset: +INK-USC/xcsr + +abstract: +XCSQA (Cross-lingual Commonsense QA) is part of the XCSR (Cross-lingual +Commonsense Reasoning) benchmark It is a multilingual extension of the +CommonsenseQA dataset, covering 16 languages The task involves answering +multiple-choice questions that require commonsense reasoning Uses PMI +normalization. 
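The XCOPA configs above (like most of the multilingual files added in this PR) build one task per (language, formulation) cross-product. A small sketch of the task names this yields, assuming the ISO-639-3 `Language` values and the `MCF`/`CF`/`Hybrid` formulation names used elsewhere in lighteval:

```python
from lighteval.tasks.templates.utils.formulation import CFFormulation, HybridFormulation, MCFFormulation
from lighteval.utils.language import Language

# Each (language, formulation) pair becomes its own config, e.g. for Swahili XCOPA
# (assumes Language.SWAHILI.value == "swa" and formulation.name in {"MCF", "CF", "Hybrid"}).
for formulation in (MCFFormulation(), CFFormulation(), HybridFormulation()):
    print(f"xcopa_{Language.SWAHILI.value}_{formulation.name.lower()}")
# -> xcopa_swa_mcf, xcopa_swa_cf, xcopa_swa_hybrid
```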
+ +languages: +arabic, chinese, dutch, english, french, german, hindi, italian, japanese, +polish, portuguese, russian, spanish, swahili, urdu, vietnamese + +tags: +multilingual, multiple-choice, qa, reasoning + +paper: +https://arxiv.org/abs/2110.08462 +""" + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbPMINorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.multichoice import get_mcq_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"xcsqa_{language.value}_{formulation.name.lower()}", + prompt_function=get_mcq_prompt_function( + language, + lambda line: { + "question": line["question"]["stem"], + "choices": line["question"]["choices"]["text"], + "gold_idx": line["question"]["choices"]["label"].index(line["answerKey"]), + }, + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="INK-USC/xcsr", + hf_subset=f"X-CSQA-{standardize_tag(language.value) if language != Language.JAPANESE else 'jap'}", + hf_filter=lambda x: all( + len(x["question"]["choices"]["text"][i].strip()) > 0 for i in range(len(x["question"]["choices"]["text"])) + ), + evaluation_splits=("validation",), + hf_avail_splits=["validation"], + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + LogLikelihoodAccMetric(normalization=LogProbPMINorm()), + ], + ), + ) + for language in [ + Language.ARABIC, + Language.GERMAN, + Language.ENGLISH, + Language.SPANISH, + Language.FRENCH, + Language.HINDI, + Language.ITALIAN, + Language.JAPANESE, + Language.DUTCH, + Language.POLISH, + Language.PORTUGUESE, + Language.RUSSIAN, + Language.SWAHILI, + Language.URDU, + Language.VIETNAMESE, + Language.CHINESE, + ] + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/xnli.py b/src/lighteval/tasks/multilingual/tasks/xnli.py new file mode 100644 index 000000000..9c55458ec --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/xnli.py @@ -0,0 +1,93 @@ +""" +name: +Xnli + +dataset: +facebook/xnli + +abstract: +NLI (Natural Language Inference) tasks involve determining the logical +relationship between two given sentences: a premise and a hypothesis. The goal +is to classify whether the hypothesis is entailed by, contradicts, or is neutral +with respect to the premise. After our inspection we found the neutral label to +be quite ambiguous and decided to exclude it. 
But you can easily add it by +modifying the adapters The XNLI dataset is a multilingual variant of MultiNLI + +languages: +arabic, bulgarian, chinese, english, french, german, greek, hindi, russian, +spanish, swahili, thai, turkish, urdu, vietnamese + +tags: +classification, multilingual, nli + +paper: +https://aclanthology.org/D18-1269/ +""" + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.nli import get_nli_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"xnli_{language.value}_{formulation.name.lower()}", + suite=["lighteval"], + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + prompt_function=get_nli_prompt_function( + language=language, + adapter=lambda line: { + "premise": line["premise"], + "hypothesis": line["hypothesis"], + # Since we ignore the neutral label + "gold_idx": {0: 0, 2: 1}[line["label"]], + }, + relations=["entailment", "contradiction"], + formulation=formulation, + ), + hf_filter=lambda line: line["label"] in [0, 2], + hf_repo="facebook/xnli", + hf_subset=standardize_tag(language.value), + evaluation_splits=["validation"], + few_shots_split="train", + ) + for language in [ + Language.ARABIC, + Language.ENGLISH, + Language.FRENCH, + Language.SPANISH, + Language.BULGARIAN, + Language.GERMAN, + Language.GREEK, + Language.ENGLISH, + Language.FRENCH, + Language.HINDI, + Language.RUSSIAN, + Language.SWAHILI, + Language.THAI, + Language.TURKISH, + Language.URDU, + Language.VIETNAMESE, + Language.CHINESE, + ] + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] diff --git a/src/lighteval/tasks/multilingual/tasks/xnli2.py b/src/lighteval/tasks/multilingual/tasks/xnli2.py new file mode 100644 index 000000000..cf3ec6a66 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/xnli2.py @@ -0,0 +1,100 @@ +""" +name: +Xnli2 + +dataset: + +abstract: +Improvement on XNLI with better translation, from our experience models tend to +perform better on XNLI2.0 than XNLI. 
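The XNLI configs above (and the XNLI 2.0 / Indic-XNLI variants that follow) drop the neutral class by filtering labels to `{0, 2}` and remapping `{0: 0, 2: 1}`. If you want the three-way task the docstring mentions, here is a hedged sketch of the adapter change; it is not part of this PR and assumes the NLI template accepts `"neutral"` as a relation:

```python
from lighteval.tasks.templates.nli import get_nli_prompt_function
from lighteval.tasks.templates.utils.formulation import MCFFormulation
from lighteval.utils.language import Language

# Sketch: keep all three XNLI labels instead of filtering to {0, 2}.
three_way_prompt = get_nli_prompt_function(
    language=Language.ENGLISH,
    adapter=lambda line: {
        "premise": line["premise"],
        "hypothesis": line["hypothesis"],
        "gold_idx": line["label"],  # 0=entailment, 1=neutral, 2=contradiction
    },
    relations=["entailment", "neutral", "contradiction"],
    formulation=MCFFormulation(),
)
# ...and drop `hf_filter=lambda line: line["label"] in [0, 2]` from the config
# so neutral rows are no longer excluded.
```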
+ +languages: +arabic, assamese, bengali, bulgarian, chinese, english, french, german, greek, +gujarati, hindi, kannada, marathi, punjabi, russian, sanskrit, spanish, swahili, +tamil, thai, turkish, urdu, vietnamese + +tags: +classification, multilingual, nli + +paper: +https://arxiv.org/abs/2301.06527 +""" + +from langcodes import Language as LangCodeLanguage +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.nli import get_nli_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"xnli2.0_{language.value}_{formulation.name.lower()}", + suite=["lighteval"], + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + prompt_function=get_nli_prompt_function( + language=language, + adapter=lambda line: { + "premise": line["premise"], + "hypothesis": line["hypothesis"], + # Since we ignore the neutral label + "gold_idx": {0: 0, 2: 1}[line["label"]], + }, + relations=["entailment", "contradiction"], + formulation=formulation, + ), + hf_filter=lambda line: line["label"] in [0, 2] + and line["premise"] is not None + and line["hypothesis"] is not None, + hf_repo=f"Harsit/xnli2.0_train_{LangCodeLanguage(standardize_tag(language.value)).language_name().lower()}", + hf_subset="default", + evaluation_splits=["train"], + hf_avail_splits=["train"], + ) + for language in [ + Language.ENGLISH, + Language.FRENCH, + Language.PUNJABI, + Language.GUJARATI, + Language.KANNADA, + Language.ASSAMESE, + Language.BENGALI, + Language.MARATHI, + Language.SANSKRIT, + Language.TAMIL, + Language.GERMAN, + Language.ENGLISH, + Language.URDU, + Language.VIETNAMESE, + Language.TURKISH, + Language.THAI, + Language.SWAHILI, + Language.SPANISH, + Language.RUSSIAN, + Language.HINDI, + Language.GREEK, + Language.CHINESE, + Language.BULGARIAN, + Language.ARABIC, + # Theoretically also: Bhojpuri, Gujarati, Odiya + ] + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] diff --git a/src/lighteval/tasks/multilingual/tasks/xnli_indic.py b/src/lighteval/tasks/multilingual/tasks/xnli_indic.py new file mode 100644 index 000000000..4d3cf481c --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/xnli_indic.py @@ -0,0 +1,83 @@ +""" +name: +Xnli Indic + +dataset: +Divyanshu/indicxnli + +abstract: +Another variant of XNLI, with emphasis on Indic languages. 
+ +languages: +assamese, bengali, gujarati, hindi, kannada, malayalam, marathi, oriya, punjabi, +tamil, telugu + +tags: +classification, multilingual, nli + +paper: +https://arxiv.org/abs/2204.08776 +""" + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.nli import get_nli_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"indicnxnli_{language.value}_{formulation.name.lower()}", + suite=["lighteval"], + prompt_function=get_nli_prompt_function( + language=language, + adapter=lambda line: { + "premise": line["premise"], + "hypothesis": line["hypothesis"], + # Since we ignore the neutral label + "gold_idx": {0: 0, 2: 1}[line["label"]], + }, + relations=["entailment", "contradiction"], + formulation=formulation, + ), + hf_repo="Divyanshu/indicxnli", + hf_subset=standardize_tag(language.value), + # Ignore neutral + hf_filter=lambda x: int(x["label"]) in [0, 2], + evaluation_splits=["validation"], + few_shots_split="train", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for language in [ + Language.ASSAMESE, + Language.BENGALI, + Language.GUJARATI, + Language.HINDI, + Language.KANNADA, + Language.MALAYALAM, + Language.MARATHI, + Language.ORIYA, + Language.PUNJABI, + Language.TAMIL, + Language.TELUGU, + ] + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] diff --git a/src/lighteval/tasks/multilingual/tasks/xquad.py b/src/lighteval/tasks/multilingual/tasks/xquad.py new file mode 100644 index 000000000..858b3a6ee --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/xquad.py @@ -0,0 +1,74 @@ +""" +name: +Xquad + +dataset: +google/xquad + +abstract: +Reading Comprehension (RC) tasks evaluate a model's ability to understand and +extract information from text passages. These tasks typically involve answering +questions based on given contexts, spanning multiple languages and formats. Add +RC tasks supporting about 130 unique languages/scripts. SQuAD - like XQuAD: +Cross-lingual Question Answering Dataset, extending SQuAD to 11 languages. 
+ +languages: +arabic, chinese, english, german, greek, hindi, romanian, russian, spanish, +thai, turkish, vietnamese + +tags: +multilingual, qa + +paper: +https://arxiv.org/abs/1910.11856 +""" + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + MultilingualQuasiExactMatchMetric, + MultilingualQuasiF1ScoreMetric, +) +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"xquad_{language.value}", + prompt_function=get_qa_prompt_function( + language, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="google/xquad", + hf_subset=f"xquad.{standardize_tag(language.value)}", + evaluation_splits=("validation",), + few_shots_split="validation", + generation_size=400, + stop_sequence=("\n",), + metrics=( + MultilingualQuasiExactMatchMetric(language, "prefix"), + MultilingualQuasiF1ScoreMetric(language), + ), + ) + for language in [ + Language.ARABIC, + Language.GERMAN, + Language.GREEK, + Language.ENGLISH, + Language.SPANISH, + Language.HINDI, + Language.ROMANIAN, + Language.RUSSIAN, + Language.THAI, + Language.TURKISH, + Language.VIETNAMESE, + Language.CHINESE, + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/xstory.py b/src/lighteval/tasks/multilingual/tasks/xstory.py new file mode 100644 index 000000000..aaf9842c5 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/xstory.py @@ -0,0 +1,93 @@ +""" +name: +Xstory + +dataset: +juletxara/xstory_cloze + +abstract: +Xstory multilingual benchmark. 
+ +languages: +arabic, basque, burmese, chinese, hindi, indonesian, russian, spanish, swahili, +telugu + +tags: +multilingual, narrative + +paper: +""" + +from functools import partial + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation +from lighteval.tasks.templates.continuation import get_continuation_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.tasks.templates.utils.translation_literals import TRANSLATION_LITERALS +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"xstory_cloze_{lang.value}_{formulation.name.lower()}", + prompt_function=get_continuation_prompt_function( + lang, + partial( + lambda lang, line: { + "context": TRANSLATION_LITERALS[lang].sentence_space.join( + [ + line["input_sentence_1"], + line["input_sentence_2"], + line["input_sentence_3"], + line["input_sentence_4"], + ] + ), + "continuations": [line["sentence_quiz1"], line["sentence_quiz2"]], + "gold_idx": int(line["answer_right_ending"]) - 1, # type: ignore + }, + lang, + ), + formulation=formulation, + ), + suite=("lighteval",), + hf_repo="juletxara/xstory_cloze", + hf_subset=standardize_tag(lang.value), + evaluation_splits=["eval"], + few_shots_split="train", + metrics=get_metrics_for_formulation( + formulation, + [ + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ), + ) + for lang in [ + Language.RUSSIAN, + Language.CHINESE, + Language.SPANISH, + Language.ARABIC, + Language.HINDI, + Language.INDONESIAN, + Language.TELUGU, + Language.SWAHILI, + Language.BASQUE, + Language.BURMESE, + ] + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/multilingual/tasks/xwinograd.py b/src/lighteval/tasks/multilingual/tasks/xwinograd.py new file mode 100644 index 000000000..827399e42 --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/xwinograd.py @@ -0,0 +1,71 @@ +""" +name: +Xwinograd + +dataset: +Muennighoff/xwinograd + +abstract: +Xwinograd multilingual benchmark. 
+ +languages: +chinese, english, french, japanese, portuguese, russian + +tags: +multilingual, multiple-choice, reasoning + +paper: +""" + +from functools import partial + +from langcodes import standardize_tag + +from lighteval.metrics.dynamic_metrics import ( + LogLikelihoodAccMetric, +) +from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.multilingual.adapters import ( + winogrand_adapter, +) +from lighteval.tasks.templates.continuation import get_continuation_prompt_function +from lighteval.tasks.templates.utils.formulation import ( + CFFormulation, + HybridFormulation, + MCFFormulation, +) +from lighteval.utils.language import Language + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"xwinograd_{language.value}_{formulation.name.lower()}", + suite=("lighteval",), + prompt_function=get_continuation_prompt_function( + language, partial(winogrand_adapter, language), formulation=formulation + ), + hf_repo="Muennighoff/xwinograd", + hf_subset=standardize_tag(language.value) if language != Language.JAPANESE else "jp", + evaluation_splits=("test",), + hf_avail_splits=["test"], + metrics=[ + LogLikelihoodAccMetric(normalization=None), + LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), + LogLikelihoodAccMetric(normalization=LogProbCharNorm()), + ], + ) + for language in [ + Language.ENGLISH, + Language.FRENCH, + Language.JAPANESE, + Language.PORTUGUESE, + Language.RUSSIAN, + Language.CHINESE, + ] + for formulation in [ + MCFFormulation(), + CFFormulation(), + HybridFormulation(), + ] +] diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 95914991c..cabde57be 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -28,13 +28,12 @@ import logging import os import sys +import time from functools import lru_cache from itertools import groupby from pathlib import Path from types import ModuleType -import lighteval.tasks.default_tasks as default_tasks -from lighteval.tasks.extended import AVAILABLE_EXTENDED_TASKS_MODULES from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig @@ -114,10 +113,8 @@ class Registry: def __init__( self, tasks: str | Path | None = None, - custom_tasks: str | Path | ModuleType | None = None, - load_community: bool = False, - load_extended: bool = False, load_multilingual: bool = False, + custom_tasks: str | Path | ModuleType | None = None, ): """ Initialize the Registry class. @@ -130,8 +127,6 @@ def __init__( - A Path object pointing to a custom tasks file - A module object containing custom task configurations - None for default behavior (no custom tasks) - load_community: Whether to load community-contributed tasks. - load_extended: Whether to load extended tasks with custom logic. load_multilingual: Whether to load multilingual tasks. Each custom task module should contain a TASKS_TABLE exposing @@ -146,8 +141,6 @@ def __init__( ) ] """ - self._custom_tasks = custom_tasks - if tasks is None: logger.warning( "You passed no task name. This should only occur if you are using the CLI to inspect tasks." 
@@ -155,16 +148,10 @@ def __init__( self.tasks_list = [] else: self.tasks_list = self._get_full_task_list_from_input_string(tasks) - # These parameters are dynamically set by the task names provided, thanks to `activate_suites_to_load`, - # except in the `tasks` CLI command to display the full list - self._load_community = load_community - self._load_extended = load_extended - self._load_multilingual = load_multilingual - self._activate_loading_of_optional_suite() # we dynamically set the loading parameters - - # We load all task to - self._task_registry = self._load_full_registry() + self._task_registry = Registry.load_all_task_configs( + custom_tasks=custom_tasks, load_multilingual=load_multilingual + ) self.task_to_configs = self._update_task_configs() def _get_full_task_list_from_input_string(self, tasks: str | Path) -> list[str]: @@ -175,21 +162,7 @@ def _get_full_task_list_from_input_string(self, tasks: str | Path) -> list[str]: else: tasks_list = tasks.split(",") - # We might have tasks provided as task groups in the custom tasks - # We load the whole task_groups mapping - if self._custom_tasks is None: - task_groups = {} - else: - custom_tasks_module = Registry.create_custom_tasks_module(custom_tasks=self._custom_tasks) - tasks_group_dict = {} - if hasattr(custom_tasks_module, "TASKS_GROUPS"): - tasks_group_dict = custom_tasks_module.TASKS_GROUPS - - # We should allow defining task groups as comma-separated strings or lists of tasks - task_groups = {k: v if isinstance(v, list) else v.split(",") for k, v in tasks_group_dict.items()} - - # Then link actual task_group to task list if needed - # (At this point the strings are either task name/superset name or group names) + task_groups = {} expanded_tasks_list: list[str] = [] for maybe_task_group in tasks_list: # We either expand the group (in case it's a group name), or we keep it as is (in case it's a task name or superset name) @@ -203,76 +176,6 @@ def _get_full_task_list_from_input_string(self, tasks: str | Path) -> list[str]: return expanded_tasks_list - def _activate_loading_of_optional_suite(self) -> None: - """Dynamically selects which of the optional suite we want to load.""" - suites = {task.split("|")[0] for task in self.tasks_list} - - for suite_name in suites: - if suite_name not in DEFAULT_SUITES: - logger.warning( - f"Suite {suite_name} unknown. This is not normal, unless you are testing adding new evaluations." - ) - - if "extended" in suites: - self._load_extended = True - if "multilingual" in suites: - self._load_multilingual = True - if "community" in suites: - self._load_community = True - - def _load_full_registry(self) -> dict[str, LightevalTaskConfig]: - """ - Returns: - dict[str, LightevalTaskConfig]: A dictionary mapping task names (suite|task) to their corresponding LightevalTask classes. 
- - Example: - { - "lighteval|arc_easy": LightevalTaskConfig(name="arc_easy", suite="lighteval", ...), - } - """ - custom_tasks_registry = {} - custom_tasks_module = [] - custom_task_configs = [] - - if self._custom_tasks is not None: - custom_tasks_module.append(Registry.create_custom_tasks_module(custom_tasks=self._custom_tasks)) - - # Need to load extended tasks - if self._load_extended: - for extended_task_module in AVAILABLE_EXTENDED_TASKS_MODULES: - custom_tasks_module.append(extended_task_module) - - # Need to load community tasks - if self._load_community: - community_modules = load_community_tasks() - for community_task_module in community_modules: - custom_tasks_module.append(community_task_module) - - # Need to load multilingual tasks - if self._load_multilingual: - import lighteval.tasks.multilingual.tasks as multilingual_tasks - - custom_tasks_module.append(multilingual_tasks) - - # We load all - for module in custom_tasks_module: - custom_task_configs.extend(module.TASKS_TABLE) - logger.info(f"Found {len(module.TASKS_TABLE)} custom tasks in {module.__file__}") - - if len(custom_task_configs) > 0: - custom_tasks_registry = Registry.create_task_config_dict(meta_table=custom_task_configs) - - default_tasks_registry = Registry.create_task_config_dict() - - # Check the overlap between default_tasks_registry and custom_tasks_registry - intersection = set(default_tasks_registry.keys()).intersection(set(custom_tasks_registry.keys())) - if len(intersection) > 0: - logger.warning( - f"Following tasks ({intersection}) exists both in the default and custom tasks. Will use the custom ones on conflict." - ) - - return {**default_tasks_registry, **custom_tasks_registry} - def _update_task_configs(self) -> dict[str, LightevalTaskConfig]: # noqa: C901 """ Updates each config depending on the input tasks (we replace all provided params, like few shot number, sampling params, etc) @@ -401,26 +304,68 @@ def create_custom_tasks_module(custom_tasks: str | Path | ModuleType) -> ModuleT return importlib.import_module(str(custom_tasks)) @staticmethod - def create_task_config_dict(meta_table: list[LightevalTaskConfig] | None = None) -> dict[str, LightevalTaskConfig]: - """Create configuration tasks based on the provided meta_table. + def _extract_configs(module: ModuleType) -> dict[str, LightevalTaskConfig]: + configs = {} + if hasattr(module, "TASKS_TABLE"): + for config in getattr(module, "TASKS_TABLE"): + configs[f"{config.suite[0]}|{config.name}"] = config + return configs - Args: - meta_table: meta_table containing tasks - configurations. If not provided, it will be loaded from TABLE_PATH. + @staticmethod + def _load_from_files(files: list[Path], module_prefix: str) -> dict[str, LightevalTaskConfig]: + configs = {} + for task_file in files: + module_name = task_file.stem + module = importlib.import_module(f"{module_prefix}.{module_name}") + configs.update(Registry._extract_configs(module)) + return configs - Returns: - Dict[str, LightevalTaskConfig]: A dictionary of task names mapped to their corresponding LightevalTaskConfig. 
- """ - if meta_table is None: - meta_table = [config for config in vars(default_tasks).values() if isinstance(config, LightevalTaskConfig)] + @staticmethod + def _load_from_subdirs(subdirs: list[Path]) -> dict[str, LightevalTaskConfig]: + configs = {} + for task_dir in subdirs: + module_name = task_dir.name + module = importlib.import_module(f"lighteval.tasks.tasks.{module_name}.main") + configs.update(Registry._extract_configs(module)) + return configs - tasks_with_config: dict[str, LightevalTaskConfig] = {} - for config in meta_table: - for suite in config.suite: - if suite in DEFAULT_SUITES: - tasks_with_config[f"{suite}|{config.name}"] = config + @staticmethod + def load_all_task_configs( + custom_tasks: str | Path | None = None, load_multilingual: bool = False + ) -> dict[str, LightevalTaskConfig]: + """Load all LightevalTaskConfig objects from all Python files in the tasks/ directory.""" + time_start = time.perf_counter() + # Get the tasks directory + TASKS_DIR = Path(__file__).parent / "tasks" + TASKS_DIR_MULTILINGUAL = Path(__file__).parent / "multilingual" / "tasks" + loaded_configs = {} + + # Get all Python files in the tasks directory (excluding __init__.py) + task_files = [f for f in TASKS_DIR.glob("*.py") if f.name != "__init__.py"] + task_files_multilingual = [f for f in TASKS_DIR_MULTILINGUAL.glob("*.py") if f.name != "__init__.py"] + + # Also get all subdirectories with main.py files + task_subdirs = [d for d in TASKS_DIR.iterdir() if d.is_dir() and (d / "main.py").exists()] + + loaded_configs.update(Registry._load_from_files(task_files, "lighteval.tasks.tasks")) + if load_multilingual: + loaded_configs.update( + Registry._load_from_files(task_files_multilingual, "lighteval.tasks.multilingual.tasks") + ) + loaded_configs.update(Registry._load_from_subdirs(task_subdirs)) + + if custom_tasks is not None: + custom_tasks_module = Registry.create_custom_tasks_module(custom_tasks) + custom_tasks_configs = Registry._extract_configs(custom_tasks_module) + if set(custom_tasks_configs.keys()) & set(loaded_configs.keys()): + raise ValueError( + f"Custom tasks {custom_tasks} conflict with built-in tasks, please use a different name. Conflicting tasks: {set(custom_tasks_configs.keys()) & set(loaded_configs.keys())}" + ) + loaded_configs.update(custom_tasks_configs) - return tasks_with_config + time_end = time.perf_counter() + logger.info(f"Loaded {len(loaded_configs)} task configs in {time_end - time_start:.1f} seconds") + return loaded_configs def print_all_tasks(self, suites: str | None = None): """Print all the tasks in the task registry. 
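The registry now discovers tasks instead of reading a hand-maintained `default_tasks` table: every `*.py` under `tasks/` (plus `multilingual/tasks/` on demand, and any sub-directory containing a `main.py`) contributes its `TASKS_TABLE`, keyed as `suite|name`, and a custom task file that shadows a built-in name is rejected with a `ValueError` instead of silently overriding it. A minimal sketch of driving the new loader directly — the task key is only illustrative and assumes the Turkish XQuAD config added in this diff:

```python
from lighteval.tasks.registry import Registry
from lighteval.utils.language import Language

# Discover all built-in configs; multilingual task files are only imported when requested.
configs = Registry.load_all_task_configs(load_multilingual=True)
print(f"{len(configs)} task configs discovered")

# Keys follow f"{config.suite[0]}|{config.name}", e.g. the Turkish XQuAD task above.
xquad_tr = configs[f"lighteval|xquad_{Language.TURKISH.value}"]
print(xquad_tr.hf_repo, xquad_tr.evaluation_splits)

# A custom tasks module is merged the same way; a name collision with a built-in task raises:
# Registry.load_all_task_configs(custom_tasks="path/to/my_tasks.py", load_multilingual=False)
```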
diff --git a/src/lighteval/tasks/tasks/agieval.py b/src/lighteval/tasks/tasks/agieval.py new file mode 100644 index 000000000..1f6f6f3d2 --- /dev/null +++ b/src/lighteval/tasks/tasks/agieval.py @@ -0,0 +1,356 @@ +""" +name: +Agieval + +dataset: +dmayhem93/agieval-aqua-rat, dmayhem93/agieval-gaokao-biology, dmayhem93/agieval-gaokao-chemistry, dmayhem93/agieval-gaokao-chinese, dmayhem93/agieval-gaokao-english, dmayhem93/agieval-gaokao-geography, dmayhem93/agieval-gaokao-history, dmayhem93/agieval-gaokao-mathqa, dmayhem93/agieval-gaokao-physics, dmayhem93/agieval-logiqa-en, dmayhem93/agieval-logiqa-zh, dmayhem93/agieval-lsat-ar, dmayhem93/agieval-lsat-lr, dmayhem93/agieval-lsat-rc, dmayhem93/agieval-sat-en, dmayhem93/agieval-sat-en-without-passage, dmayhem93/agieval-sat-math + +abstract: +AGIEval is a human-centric benchmark specifically designed to evaluate the +general abilities of foundation models in tasks pertinent to human cognition and +problem-solving. This benchmark is derived from 20 official, public, and +high-standard admission and qualification exams intended for general human +test-takers, such as general college admission tests (e.g., Chinese College +Entrance Exam (Gaokao) and American SAT), law school admission tests, math +competitions, lawyer qualification tests, and national civil service exams. + +languages: +english, chinese + +tags: +biology, chemistry, geography, history, knowledge, language, multiple-choice, physics, reasoning + +paper: +https://arxiv.org/abs/2304.06364 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +agieval_aqua_rat = LightevalTaskConfig( + name="agieval:aqua-rat", + suite=["lighteval"], + prompt_function=prompt.agieval, + hf_repo="dmayhem93/agieval-aqua-rat", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=None, + version=0, +) + +agieval_gaokao_biology = LightevalTaskConfig( + name="agieval:gaokao-biology", + suite=["lighteval"], + prompt_function=prompt.agieval, + hf_repo="dmayhem93/agieval-gaokao-biology", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=None, + version=0, +) + +agieval_gaokao_chemistry = LightevalTaskConfig( + name="agieval:gaokao-chemistry", + suite=["lighteval"], + prompt_function=prompt.agieval, + hf_repo="dmayhem93/agieval-gaokao-chemistry", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=None, + version=0, +) + +agieval_gaokao_chinese = LightevalTaskConfig( + name="agieval:gaokao-chinese", + suite=["lighteval"], + prompt_function=prompt.agieval, + hf_repo="dmayhem93/agieval-gaokao-chinese", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=None, + version=0, +) + +agieval_gaokao_english = LightevalTaskConfig( + name="agieval:gaokao-english", + suite=["lighteval"], + prompt_function=prompt.agieval, + 
hf_repo="dmayhem93/agieval-gaokao-english", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=None, + version=0, +) + +agieval_gaokao_geography = LightevalTaskConfig( + name="agieval:gaokao-geography", + suite=["lighteval"], + prompt_function=prompt.agieval, + hf_repo="dmayhem93/agieval-gaokao-geography", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=None, + version=0, +) + +agieval_gaokao_history = LightevalTaskConfig( + name="agieval:gaokao-history", + suite=["lighteval"], + prompt_function=prompt.agieval, + hf_repo="dmayhem93/agieval-gaokao-history", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=None, + version=0, +) + +agieval_gaokao_mathqa = LightevalTaskConfig( + name="agieval:gaokao-mathqa", + suite=["lighteval"], + prompt_function=prompt.agieval, + hf_repo="dmayhem93/agieval-gaokao-mathqa", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=None, + version=0, +) + +agieval_gaokao_physics = LightevalTaskConfig( + name="agieval:gaokao-physics", + suite=["lighteval"], + prompt_function=prompt.agieval, + hf_repo="dmayhem93/agieval-gaokao-physics", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=None, + version=0, +) + +agieval_logiqa_en = LightevalTaskConfig( + name="agieval:logiqa-en", + suite=["lighteval"], + prompt_function=prompt.agieval, + hf_repo="dmayhem93/agieval-logiqa-en", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=None, + version=0, +) + +agieval_logiqa_zh = LightevalTaskConfig( + name="agieval:logiqa-zh", + suite=["lighteval"], + prompt_function=prompt.agieval, + hf_repo="dmayhem93/agieval-logiqa-zh", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=None, + version=0, +) + +agieval_lsat_ar = LightevalTaskConfig( + name="agieval:lsat-ar", + suite=["lighteval"], + prompt_function=prompt.agieval, + hf_repo="dmayhem93/agieval-lsat-ar", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=None, + version=0, +) + +agieval_lsat_lr = LightevalTaskConfig( + name="agieval:lsat-lr", + suite=["lighteval"], + prompt_function=prompt.agieval, + hf_repo="dmayhem93/agieval-lsat-lr", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + 
few_shots_select="random_sampling", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=None, + version=0, +) + +agieval_lsat_rc = LightevalTaskConfig( + name="agieval:lsat-rc", + suite=["lighteval"], + prompt_function=prompt.agieval, + hf_repo="dmayhem93/agieval-lsat-rc", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=None, + version=0, +) + +agieval_sat_en = LightevalTaskConfig( + name="agieval:sat-en", + suite=["lighteval"], + prompt_function=prompt.agieval, + hf_repo="dmayhem93/agieval-sat-en", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=None, + version=0, +) + +agieval_sat_en_without_passage = LightevalTaskConfig( + name="agieval:sat-en-without-passage", + suite=["lighteval"], + prompt_function=prompt.agieval, + hf_repo="dmayhem93/agieval-sat-en-without-passage", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=None, + version=0, +) + +agieval_sat_math = LightevalTaskConfig( + name="agieval:sat-math", + suite=["lighteval"], + prompt_function=prompt.agieval, + hf_repo="dmayhem93/agieval-sat-math", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=None, + version=0, +) + +TASKS_TABLE = [ + agieval_aqua_rat, + agieval_gaokao_biology, + agieval_gaokao_chemistry, + agieval_gaokao_chinese, + agieval_gaokao_english, + agieval_gaokao_geography, + agieval_gaokao_history, + agieval_gaokao_mathqa, + agieval_gaokao_physics, + agieval_logiqa_en, + agieval_logiqa_zh, + agieval_lsat_ar, + agieval_lsat_lr, + agieval_lsat_rc, + agieval_sat_en, + agieval_sat_en_without_passage, + agieval_sat_math, +] diff --git a/src/lighteval/tasks/tasks/aime.py b/src/lighteval/tasks/tasks/aime.py new file mode 100644 index 000000000..ac82a00eb --- /dev/null +++ b/src/lighteval/tasks/tasks/aime.py @@ -0,0 +1,127 @@ +""" +name: +Aime + +dataset: +HuggingFaceH4/aime_2024, yentinglin/aime_2025 + +abstract: +The American Invitational Mathematics Examination (AIME) is a prestigious, +invite-only mathematics competition for high-school students who perform in the +top 5% of the AMC 12 mathematics exam. It involves 15 questions of increasing +difficulty, with the answer to every question being a single integer from 0 to +999. The median score is historically between 4 and 6 questions correct (out of +the 15 possible). Two versions of the test are given every year (thirty +questions total). 
+ +languages: +english + +tags: +math, reasoning + +paper: +https://maa.org/aime-thresholds-are-available/ +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +aime24 = LightevalTaskConfig( + name="aime24", + suite=["lighteval"], + prompt_function=prompt.aime_prompt_fn, + hf_repo="HuggingFaceH4/aime_2024", + hf_subset="default", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.pass_at_k_math(sample_params={"k": 1}), Metrics.avg_at_n_math(sample_params={"n": 1})], + version=2, +) + +aime24_avg = LightevalTaskConfig( + name="aime24_avg", + suite=["lighteval"], + prompt_function=prompt.aime_prompt_fn, + hf_repo="HuggingFaceH4/aime_2024", + hf_subset="default", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.avg_at_n_math(sample_params={"n": 64})], + version=2, +) + +aime24_gpassk = LightevalTaskConfig( + name="aime24_gpassk", + suite=["lighteval"], + prompt_function=prompt.aime_prompt_fn, + hf_repo="HuggingFaceH4/aime_2024", + hf_subset="default", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.g_pass_at_k_math(sample_params={"k": 16, "n": 48})], + version=1, +) + +aime25 = LightevalTaskConfig( + name="aime25", + suite=["lighteval"], + prompt_function=prompt.aime_prompt_fn, + hf_repo="yentinglin/aime_2025", + hf_subset="default", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1}), Metrics.avg_at_n_math(sample_params={"n": 1})], + version=2, +) + +aime25_avg = LightevalTaskConfig( + name="aime25_avg", + suite=["lighteval"], + prompt_function=prompt.aime_prompt_fn, + hf_repo="yentinglin/aime_2025", + hf_subset="default", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.avg_at_n_math(sample_params={"n": 64})], + version=2, +) + +aime25_gpassk = LightevalTaskConfig( + name="aime25_gpassk", + suite=["lighteval"], + prompt_function=prompt.aime_prompt_fn, + hf_repo="yentinglin/aime_2025", + hf_subset="default", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.g_pass_at_k_math(sample_params={"k": 16, "n": 48})], + version=1, +) + +TASKS_TABLE = [ + aime24, + aime24_gpassk, + aime25, + aime25_gpassk, +] diff --git a/src/lighteval/tasks/tasks/aimo.py b/src/lighteval/tasks/tasks/aimo.py new file mode 100644 index 000000000..615e26ffa --- /dev/null +++ b/src/lighteval/tasks/tasks/aimo.py @@ -0,0 +1,53 @@ +""" +name: +AIMO Progress Prize 1 + +dataset: +https://www.kaggle.com/competitions/ai-mathematical-olympiad-prize + +abstract: +Task to evaluate LLMs on the training set of the Kaggle AIMO competition: + +languages: +english + +tags: +math, reasoning + +paper: +""" + +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import math_normalizer +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc + + +def aimo_prompt(line, task_name: str = None): + 
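+    """Map one AIMO training row to a Doc: the problem statement becomes the query and the integer answer, cast to a string, is the single gold choice (gold_index=0)."""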
return Doc( + task_name=task_name, + choices=[str(line["answer"])], + gold_index=0, + query=line["problem"], + ) + + +task = LightevalTaskConfig( + name="aimo_progress_prize_1", + prompt_function=aimo_prompt, + suite=["community"], + hf_subset="", + hf_repo="lighteval/aimo_progress_prize_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split="train", + few_shots_select="sequential", + metrics=[ + Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}) + ], + generation_size=2048, + stop_sequence=None, +) + +# STORE YOUR EVALS +TASKS_TABLE = [task] diff --git a/src/lighteval/tasks/tasks/anli.py b/src/lighteval/tasks/tasks/anli.py new file mode 100644 index 000000000..86ea842b1 --- /dev/null +++ b/src/lighteval/tasks/tasks/anli.py @@ -0,0 +1,84 @@ +""" +name: +Anli + +dataset: +facebook/anli + +abstract: +Adversarial Natural Language Inference (ANLI) is a large-scale NLI benchmark +dataset collected via an iterative, adversarial human-and-model-in-the-loop +procedure. ANLI is much more difficult than its predecessors, including SNLI +and MNLI. It contains three rounds, each with train/dev/test splits. + +languages: +english + +tags: +nli, reasoning + +paper: +https://arxiv.org/abs/1910.14599 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +anli_r1 = LightevalTaskConfig( + name="anli:r1", + suite=["lighteval"], + prompt_function=prompt.anli, + hf_repo="facebook/anli", + hf_subset="plain_text", + hf_avail_splits=["train_r1", "dev_r1", "test_r1"], + evaluation_splits=["test_r1"], + few_shots_split="train_r1", + few_shots_select="random_sampling_from_train", + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + + +anli_r2 = LightevalTaskConfig( + name="anli:r2", + suite=["lighteval"], + prompt_function=prompt.anli, + hf_repo="facebook/anli", + hf_subset="plain_text", + hf_avail_splits=["train_r2", "dev_r2", "test_r2"], + evaluation_splits=["test_r2"], + few_shots_split="train_r2", + few_shots_select="random_sampling_from_train", + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + + +anli_r3 = LightevalTaskConfig( + name="anli:r3", + suite=["lighteval"], + prompt_function=prompt.anli, + hf_repo="facebook/anli", + hf_subset="plain_text", + hf_avail_splits=["train_r3", "dev_r3", "test_r3"], + evaluation_splits=["test_r3"], + few_shots_split="train_r3", + few_shots_select="random_sampling_from_train", + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + anli_r1, + anli_r2, + anli_r3, +] diff --git a/src/lighteval/tasks/tasks/arc.py b/src/lighteval/tasks/tasks/arc.py new file mode 100644 index 000000000..25c7d3464 --- /dev/null +++ b/src/lighteval/tasks/tasks/arc.py @@ -0,0 +1,66 @@ +""" +name: +Arc + +dataset: +allenai/ai2_arc + +abstract: +7,787 genuine grade-school level, multiple-choice science questions, assembled +to encourage research in advanced question-answering.
The dataset is partitioned +into a Challenge Set and an Easy Set, where the former contains only questions +answered incorrectly by both a retrieval-based algorithm and a word +co-occurrence algorithm. + +languages: +english + +tags: +multiple-choice + +paper: +https://arxiv.org/abs/1803.05457 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +arc_challenge = LightevalTaskConfig( + name="arc:challenge", + suite=["lighteval"], + prompt_function=prompt.arc, + hf_repo="allenai/ai2_arc", + hf_subset="ARC-Challenge", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling_from_train", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=["\n"], + version=0, +) + +arc_easy = LightevalTaskConfig( + name="arc:easy", + suite=["lighteval"], + prompt_function=prompt.arc, + hf_repo="allenai/ai2_arc", + hf_subset="ARC-Easy", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling_from_train", + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [arc_challenge, arc_easy] diff --git a/src/lighteval/tasks/tasks/arc_agi_2.py b/src/lighteval/tasks/tasks/arc_agi_2.py new file mode 100644 index 000000000..6e6302a44 --- /dev/null +++ b/src/lighteval/tasks/tasks/arc_agi_2.py @@ -0,0 +1,52 @@ +""" +name: +ArcAgi 2 + +dataset: +arc-agi-community/arc-agi-2 + +abstract: +ARC-AGI tasks are a series of three to five input/output examples followed by a +final task with only the input listed. Each task tests the utilization of a +specific learned skill based on a minimal number of cognitive priors. +In their native form, tasks are JSON lists of integers, which can also be +represented visually as a grid of colors using an ARC-AGI task viewer. +A successful submission is a pixel-perfect description (color and position) of +the final task's output. +100% of tasks in the ARC-AGI-2 dataset were solved by a minimum of 2 people in +at most 2 attempts (many were solved by more). ARC-AGI-2 is more +difficult for AI. + +languages: +english + +tags: +multiple-choice + +paper: +https://arcprize.org/guide +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +arc_agi_2 = LightevalTaskConfig( + name="arc_agi_2", + suite=["lighteval"], + prompt_function=prompt.arc_agi_2, + hf_repo="arc-agi-community/arc-agi-2", + hf_subset="default", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metrics=[Metrics.exact_match], + stop_sequence=None, + version=0, +) + +TASKS_TABLE = [arc_agi_2] diff --git a/src/lighteval/tasks/tasks/arithmetic.py b/src/lighteval/tasks/tasks/arithmetic.py new file mode 100644 index 000000000..d1e6b6107 --- /dev/null +++ b/src/lighteval/tasks/tasks/arithmetic.py @@ -0,0 +1,198 @@ +""" +name: +Arithmetic + +dataset: +EleutherAI/arithmetic + +abstract: +A small battery of 10 tests that involve asking language models a simple +arithmetic problem in natural language.
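+ +The subset names follow the convention of the GPT-3 paper: the leading digit +gives the number of digits in the operands and the suffix gives the operation +("a" addition, "s" subtraction, "m" multiplication), while "1dc" denotes +single-digit composite expressions chaining several operations.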
+ +languages: +english + +tags: +math, reasoning + +paper: +https://arxiv.org/abs/2005.14165 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +arithmetic_1dc = LightevalTaskConfig( + name="arithmetic:1dc", + suite=["lighteval"], + prompt_function=prompt.arithmetic, + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_1dc", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=256, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +arithmetic_2da = LightevalTaskConfig( + name="arithmetic:2da", + suite=["lighteval"], + prompt_function=prompt.arithmetic, + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_2da", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=256, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +arithmetic_2dm = LightevalTaskConfig( + name="arithmetic:2dm", + suite=["lighteval"], + prompt_function=prompt.arithmetic, + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_2dm", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=256, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +arithmetic_2ds = LightevalTaskConfig( + name="arithmetic:2ds", + suite=["lighteval"], + prompt_function=prompt.arithmetic, + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_2ds", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=256, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +arithmetic_3da = LightevalTaskConfig( + name="arithmetic:3da", + suite=["lighteval"], + prompt_function=prompt.arithmetic, + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_3da", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=256, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +arithmetic_3ds = LightevalTaskConfig( + name="arithmetic:3ds", + suite=["lighteval"], + prompt_function=prompt.arithmetic, + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_3ds", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=256, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +arithmetic_4da = LightevalTaskConfig( + name="arithmetic:4da", + suite=["lighteval"], + prompt_function=prompt.arithmetic, + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_4da", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=256, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +arithmetic_4ds = LightevalTaskConfig( + name="arithmetic:4ds", + suite=["lighteval"], + prompt_function=prompt.arithmetic, + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_4ds", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=256, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +arithmetic_5da = LightevalTaskConfig( + 
name="arithmetic:5da", + suite=["lighteval"], + prompt_function=prompt.arithmetic, + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_5da", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=256, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +arithmetic_5ds = LightevalTaskConfig( + name="arithmetic:5ds", + suite=["lighteval"], + prompt_function=prompt.arithmetic, + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_5ds", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=256, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + arithmetic_1dc, + arithmetic_2da, + arithmetic_2dm, + arithmetic_2ds, + arithmetic_3da, + arithmetic_3ds, + arithmetic_4da, + arithmetic_4ds, + arithmetic_5da, + arithmetic_5ds, +] diff --git a/src/lighteval/tasks/tasks/asdiv.py b/src/lighteval/tasks/tasks/asdiv.py new file mode 100644 index 000000000..e7141449d --- /dev/null +++ b/src/lighteval/tasks/tasks/asdiv.py @@ -0,0 +1,43 @@ +""" +name: +Asdiv + +dataset: +EleutherAI/asdiv + +abstract: +ASDiv is a dataset for arithmetic reasoning that contains 2,000+ questions +covering addition, subtraction, multiplication, and division. + +languages: +english + +tags: +math, reasoning + +paper: +https://arxiv.org/abs/2410.12853 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +asdiv = LightevalTaskConfig( + name="asdiv", + suite=["lighteval"], + prompt_function=prompt.asdiv, + hf_repo="EleutherAI/asdiv", + hf_subset="asdiv", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [asdiv] diff --git a/src/lighteval/tasks/tasks/babi_qa.py b/src/lighteval/tasks/tasks/babi_qa.py new file mode 100644 index 000000000..5ade7cb23 --- /dev/null +++ b/src/lighteval/tasks/tasks/babi_qa.py @@ -0,0 +1,43 @@ +""" +name: +Babi Qa + +dataset: +facebook/babi_qa + +abstract: +The bAbI benchmark for measuring understanding and reasoning, evaluates reading +comprehension via question answering. + +languages: +english + +tags: +qa, reasoning + +paper: +https://arxiv.org/abs/1502.05698 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +babi_qa = LightevalTaskConfig( + name="babi_qa", + suite=["lighteval"], + prompt_function=prompt.babi_qa, + hf_repo="facebook/babi_qa", + hf_subset="en-valid-qa1", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [babi_qa] diff --git a/src/lighteval/tasks/tasks/bbq.py b/src/lighteval/tasks/tasks/bbq.py new file mode 100644 index 000000000..3b58f2a91 --- /dev/null +++ b/src/lighteval/tasks/tasks/bbq.py @@ -0,0 +1,232 @@ +""" +name: +Bbq + +dataset: +lighteval/bbq_helm + +abstract: +The Bias Benchmark for Question Answering (BBQ) for measuring social bias in +question answering in ambiguous and unambigous context . 
+ +languages: +english + +tags: +bias, multiple-choice, qa + +paper: +https://arxiv.org/abs/2110.08193 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +bbq = LightevalTaskConfig( + name="bbq", + suite=["lighteval"], + prompt_function=prompt.bbq, + hf_repo="lighteval/bbq_helm", + hf_subset="all", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +bbq_Age = LightevalTaskConfig( + name="bbq:Age", + suite=["lighteval"], + prompt_function=prompt.bbq, + hf_repo="lighteval/bbq_helm", + hf_subset="Age", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +bbq_Disability_status = LightevalTaskConfig( + name="bbq:Disability_status", + suite=["lighteval"], + prompt_function=prompt.bbq, + hf_repo="lighteval/bbq_helm", + hf_subset="Disability_status", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +bbq_Gender_identity = LightevalTaskConfig( + name="bbq:Gender_identity", + suite=["lighteval"], + prompt_function=prompt.bbq, + hf_repo="lighteval/bbq_helm", + hf_subset="Gender_identity", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +bbq_Nationality = LightevalTaskConfig( + name="bbq:Nationality", + suite=["lighteval"], + prompt_function=prompt.bbq, + hf_repo="lighteval/bbq_helm", + hf_subset="Nationality", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +bbq_Physical_appearance = LightevalTaskConfig( + name="bbq:Physical_appearance", + suite=["lighteval"], + prompt_function=prompt.bbq, + hf_repo="lighteval/bbq_helm", + hf_subset="Physical_appearance", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +bbq_Race_ethnicity = LightevalTaskConfig( + name="bbq:Race_ethnicity", + suite=["lighteval"], + prompt_function=prompt.bbq, + hf_repo="lighteval/bbq_helm", + hf_subset="Race_ethnicity", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +bbq_Race_x_SES = LightevalTaskConfig( + name="bbq:Race_x_SES", + suite=["lighteval"], + prompt_function=prompt.bbq, + hf_repo="lighteval/bbq_helm", + hf_subset="Race_x_SES", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +bbq_Race_x_gender = LightevalTaskConfig( + name="bbq:Race_x_gender", + suite=["lighteval"], + prompt_function=prompt.bbq, + 
hf_repo="lighteval/bbq_helm", + hf_subset="Race_x_gender", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +bbq_Religion = LightevalTaskConfig( + name="bbq:Religion", + suite=["lighteval"], + prompt_function=prompt.bbq, + hf_repo="lighteval/bbq_helm", + hf_subset="Religion", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +bbq_SES = LightevalTaskConfig( + name="bbq:SES", + suite=["lighteval"], + prompt_function=prompt.bbq, + hf_repo="lighteval/bbq_helm", + hf_subset="SES", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +bbq_Sexual_orientation = LightevalTaskConfig( + name="bbq:Sexual_orientation", + suite=["lighteval"], + prompt_function=prompt.bbq, + hf_repo="lighteval/bbq_helm", + hf_subset="Sexual_orientation", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + bbq, + bbq_Age, + bbq_Disability_status, + bbq_Gender_identity, + bbq_Nationality, + bbq_Physical_appearance, + bbq_Race_ethnicity, + bbq_Race_x_SES, + bbq_Race_x_gender, + bbq_Religion, + bbq_SES, + bbq_Sexual_orientation, +] diff --git a/src/lighteval/tasks/tasks/bigbench.py b/src/lighteval/tasks/tasks/bigbench.py new file mode 100644 index 000000000..8d3c62d26 --- /dev/null +++ b/src/lighteval/tasks/tasks/bigbench.py @@ -0,0 +1,2746 @@ +""" +name: +Bigbench + +dataset: +tasksource/bigbench + +abstract: +Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models +166 tasks from bigbench benchmark. 
+ +languages: +english + +tags: +reasoning + +paper: +https://arxiv.org/abs/2206.04615 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +abstract_narrative_understanding = LightevalTaskConfig( + name="bigbench:abstract_narrative_understanding", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="abstract_narrative_understanding", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +anachronisms = LightevalTaskConfig( + name="bigbench:anachronisms", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="anachronisms", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +analogical_similarity = LightevalTaskConfig( + name="bigbench:analogical_similarity", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="analogical_similarity", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +analytic_entailment = LightevalTaskConfig( + name="bigbench:analytic_entailment", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="analytic_entailment", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +arithmetic_bb = LightevalTaskConfig( + name="bigbench:arithmetic_bb", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="arithmetic", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +ascii_word_recognition = LightevalTaskConfig( + name="bigbench:ascii_word_recognition", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="ascii_word_recognition", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +authorship_verification = LightevalTaskConfig( + name="bigbench:authorship_verification", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="authorship_verification", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +auto_categorization = LightevalTaskConfig( + 
name="bigbench:auto_categorization", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="auto_categorization", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.bleu], + stop_sequence=["\n"], + version=0, +) + +auto_debugging = LightevalTaskConfig( + name="bigbench:auto_debugging", + suite=["lighteval"], + prompt_function=prompt.bigbench_linefeed_before_and_after_query, + hf_repo="tasksource/bigbench", + hf_subset="auto_debugging", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=None, + version=0, +) + +bbq_lite_json = LightevalTaskConfig( + name="bigbench:bbq_lite_json", + suite=["lighteval"], + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="bbq_lite_json", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +bridging_anaphora_resolution_barqa = LightevalTaskConfig( + name="bigbench:bridging_anaphora_resolution_barqa", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="bridging_anaphora_resolution_barqa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +causal_judgment = LightevalTaskConfig( + name="bigbench:causal_judgment", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="causal_judgment", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +cause_and_effect = LightevalTaskConfig( + name="bigbench:cause_and_effect", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="cause_and_effect", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +checkmate_in_one = LightevalTaskConfig( + name="bigbench:checkmate_in_one", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="checkmate_in_one", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +chess_state_tracking = LightevalTaskConfig( + name="bigbench:chess_state_tracking", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="chess_state_tracking", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +chinese_remainder_theorem = LightevalTaskConfig( + name="bigbench:chinese_remainder_theorem", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="chinese_remainder_theorem", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +cifar10_classification = LightevalTaskConfig( + name="bigbench:cifar10_classification", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="cifar10_classification", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +code_line_description = LightevalTaskConfig( + name="bigbench:code_line_description", + suite=["lighteval"], + prompt_function=prompt.bigbench_linefeed_before_and_after_query, + hf_repo="tasksource/bigbench", + hf_subset="code_line_description", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +codenames = LightevalTaskConfig( + name="bigbench:codenames", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="codenames", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.rouge_t5, Metrics.bleu], + stop_sequence=["\n"], + version=0, +) + +color = LightevalTaskConfig( + name="bigbench:color", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="color", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[ + Metrics.rouge_t5, + Metrics.bleu, + Metrics.loglikelihood_acc, + Metrics.exact_match(sample_params={"strip_strings": False}), + ], + stop_sequence=["\n"], + version=0, +) + +common_morpheme = LightevalTaskConfig( + name="bigbench:common_morpheme", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="common_morpheme", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +conceptual_combinations = LightevalTaskConfig( + name="bigbench:conceptual_combinations", + suite=["lighteval"], + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="conceptual_combinations", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +conlang_translation = LightevalTaskConfig( + 
name="bigbench:conlang_translation", + suite=["lighteval"], + prompt_function=prompt.bigbench_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="conlang_translation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.rouge_t5, Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=[".", ";", "!", "?"], + version=0, +) + +contextual_parametric_knowledge_conflicts = LightevalTaskConfig( + name="bigbench:contextual_parametric_knowledge_conflicts", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="contextual_parametric_knowledge_conflicts", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.rouge_t5, Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +crash_blossom = LightevalTaskConfig( + name="bigbench:crash_blossom", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="crash_blossom", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +crass_ai = LightevalTaskConfig( + name="bigbench:crass_ai", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="crass_ai", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +cryobiology_spanish = LightevalTaskConfig( + name="bigbench:cryobiology_spanish", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="cryobiology_spanish", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +cryptonite = LightevalTaskConfig( + name="bigbench:cryptonite", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="cryptonite", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +cs_algorithms = LightevalTaskConfig( + name="bigbench:cs_algorithms", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="cs_algorithms", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +dark_humor_detection = LightevalTaskConfig( + name="bigbench:dark_humor_detection", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="dark_humor_detection", + hf_avail_splits=["default", "train", "validation"], + 
evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +date_understanding = LightevalTaskConfig( + name="bigbench:date_understanding", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="date_understanding", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +disambiguation_qa = LightevalTaskConfig( + name="bigbench:disambiguation_qa", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="disambiguation_qa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +discourse_marker_prediction = LightevalTaskConfig( + name="bigbench:discourse_marker_prediction", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="discourse_marker_prediction", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +disfl_qa = LightevalTaskConfig( + name="bigbench:disfl_qa", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="disfl_qa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +dyck_languages = LightevalTaskConfig( + name="bigbench:dyck_languages", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="dyck_languages", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +elementary_math_qa = LightevalTaskConfig( + name="bigbench:elementary_math_qa", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="elementary_math_qa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +emoji_movie = LightevalTaskConfig( + name="bigbench:emoji_movie", + suite=["lighteval"], + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="emoji_movie", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[ + Metrics.rouge_t5, + Metrics.bleu, + Metrics.loglikelihood_acc, + Metrics.exact_match(sample_params={"strip_strings": False}), + ], + stop_sequence=["\n"], + version=0, +) + +emojis_emotion_prediction = LightevalTaskConfig( + name="bigbench:emojis_emotion_prediction", + suite=["lighteval"], + 
prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="emojis_emotion_prediction", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +empirical_judgments = LightevalTaskConfig( + name="bigbench:empirical_judgments", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="empirical_judgments", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +english_proverbs = LightevalTaskConfig( + name="bigbench:english_proverbs", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="english_proverbs", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +english_russian_proverbs = LightevalTaskConfig( + name="bigbench:english_russian_proverbs", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="english_russian_proverbs", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +entailed_polarity = LightevalTaskConfig( + name="bigbench:entailed_polarity", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="entailed_polarity", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +entailed_polarity_hindi = LightevalTaskConfig( + name="bigbench:entailed_polarity_hindi", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="entailed_polarity_hindi", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +epistemic_reasoning = LightevalTaskConfig( + name="bigbench:epistemic_reasoning", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="epistemic_reasoning", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +evaluating_information_essentiality = LightevalTaskConfig( + name="bigbench:evaluating_information_essentiality", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="evaluating_information_essentiality", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +fact_checker = 
LightevalTaskConfig( + name="bigbench:fact_checker", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="fact_checker", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +fantasy_reasoning = LightevalTaskConfig( + name="bigbench:fantasy_reasoning", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="fantasy_reasoning", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +few_shot_nlg = LightevalTaskConfig( + name="bigbench:few_shot_nlg", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="few_shot_nlg", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.bleu, Metrics.bleurt], + stop_sequence=["\n"], + version=0, +) + +figure_of_speech_detection = LightevalTaskConfig( + name="bigbench:figure_of_speech_detection", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="figure_of_speech_detection", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +formal_fallacies_syllogisms_negation = LightevalTaskConfig( + name="bigbench:formal_fallacies_syllogisms_negation", + suite=["lighteval"], + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="formal_fallacies_syllogisms_negation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +gem = LightevalTaskConfig( + name="bigbench:gem", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="gem", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.bleu, Metrics.rouge_t5], + stop_sequence=["\n"], + version=0, +) + +gender_inclusive_sentences_german = LightevalTaskConfig( + name="bigbench:gender_inclusive_sentences_german", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="gender_inclusive_sentences_german", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +general_knowledge = LightevalTaskConfig( + name="bigbench:general_knowledge", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="general_knowledge", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + 
generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +geometric_shapes = LightevalTaskConfig( + name="bigbench:geometric_shapes", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="geometric_shapes", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[ + Metrics.rouge_t5, + Metrics.bleu, + Metrics.loglikelihood_acc, + Metrics.exact_match(sample_params={"strip_strings": False}), + ], + stop_sequence=["\n"], + version=0, +) + +goal_step_wikihow = LightevalTaskConfig( + name="bigbench:goal_step_wikihow", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="goal_step_wikihow", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +gre_reading_comprehension = LightevalTaskConfig( + name="bigbench:gre_reading_comprehension", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="gre_reading_comprehension", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +hhh_alignment = LightevalTaskConfig( + name="bigbench:hhh_alignment", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="hhh_alignment", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +hindi_question_answering = LightevalTaskConfig( + name="bigbench:hindi_question_answering", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="hindi_question_answering", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +hindu_knowledge = LightevalTaskConfig( + name="bigbench:hindu_knowledge", + suite=["lighteval"], + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="hindu_knowledge", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +hinglish_toxicity = LightevalTaskConfig( + name="bigbench:hinglish_toxicity", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="hinglish_toxicity", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +human_organs_senses = LightevalTaskConfig( + name="bigbench:human_organs_senses", + suite=["lighteval"], + 
prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="human_organs_senses", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +hyperbaton = LightevalTaskConfig( + name="bigbench:hyperbaton", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="hyperbaton", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +identify_math_theorems = LightevalTaskConfig( + name="bigbench:identify_math_theorems", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="identify_math_theorems", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +identify_odd_metaphor = LightevalTaskConfig( + name="bigbench:identify_odd_metaphor", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="identify_odd_metaphor", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +implicatures = LightevalTaskConfig( + name="bigbench:implicatures", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="implicatures", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +implicit_relations = LightevalTaskConfig( + name="bigbench:implicit_relations", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="implicit_relations", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +intent_recognition = LightevalTaskConfig( + name="bigbench:intent_recognition", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="intent_recognition", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +international_phonetic_alphabet_nli = LightevalTaskConfig( + name="bigbench:international_phonetic_alphabet_nli", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="international_phonetic_alphabet_nli", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +international_phonetic_alphabet_transliterate = LightevalTaskConfig( + 
name="bigbench:international_phonetic_alphabet_transliterate", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="international_phonetic_alphabet_transliterate", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +intersect_geometry = LightevalTaskConfig( + name="bigbench:intersect_geometry", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="intersect_geometry", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +irony_identification = LightevalTaskConfig( + name="bigbench:irony_identification", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="irony_identification", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +kanji_ascii = LightevalTaskConfig( + name="bigbench:kanji_ascii", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="kanji_ascii", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +kannada = LightevalTaskConfig( + name="bigbench:kannada", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="kannada", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +key_value_maps = LightevalTaskConfig( + name="bigbench:key_value_maps", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="key_value_maps", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +known_unknowns = LightevalTaskConfig( + name="bigbench:known_unknowns", + suite=["lighteval"], + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="known_unknowns", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +language_games = LightevalTaskConfig( + name="bigbench:language_games", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="language_games", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.bleu, Metrics.rouge_t5, 
Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +language_identification = LightevalTaskConfig( + name="bigbench:language_identification", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="language_identification", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +linguistic_mappings = LightevalTaskConfig( + name="bigbench:linguistic_mappings", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="linguistic_mappings", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +linguistics_puzzles = LightevalTaskConfig( + name="bigbench:linguistics_puzzles", + suite=["lighteval"], + prompt_function=prompt.bigbench_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="linguistics_puzzles", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=None, + version=0, +) + +logic_grid_puzzle = LightevalTaskConfig( + name="bigbench:logic_grid_puzzle", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="logic_grid_puzzle", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +logical_args = LightevalTaskConfig( + name="bigbench:logical_args", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="logical_args", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +logical_deduction = LightevalTaskConfig( + name="bigbench:logical_deduction", + suite=["lighteval"], + prompt_function=prompt.bigbench_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="logical_deduction", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +logical_fallacy_detection = LightevalTaskConfig( + name="bigbench:logical_fallacy_detection", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="logical_fallacy_detection", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +logical_sequence = LightevalTaskConfig( + name="bigbench:logical_sequence", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + 
hf_subset="logical_sequence", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +mathematical_induction = LightevalTaskConfig( + name="bigbench:mathematical_induction", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="mathematical_induction", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +matrixshapes = LightevalTaskConfig( + name="bigbench:matrixshapes", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="matrixshapes", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +metaphor_boolean = LightevalTaskConfig( + name="bigbench:metaphor_boolean", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="metaphor_boolean", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +metaphor_understanding = LightevalTaskConfig( + name="bigbench:metaphor_understanding", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="metaphor_understanding", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +minute_mysteries_qa = LightevalTaskConfig( + name="bigbench:minute_mysteries_qa", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="minute_mysteries_qa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc, Metrics.rouge_t5], + stop_sequence=["\n"], + version=0, +) + +misconceptions = LightevalTaskConfig( + name="bigbench:misconceptions", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="misconceptions", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +misconceptions_russian = LightevalTaskConfig( + name="bigbench:misconceptions_russian", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="misconceptions_russian", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +mnist_ascii = LightevalTaskConfig( + name="bigbench:mnist_ascii", + suite=["lighteval"], + 
prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="mnist_ascii", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +modified_arithmetic = LightevalTaskConfig( + name="bigbench:modified_arithmetic", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="modified_arithmetic", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +moral_permissibility = LightevalTaskConfig( + name="bigbench:moral_permissibility", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="moral_permissibility", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +movie_dialog_same_or_different = LightevalTaskConfig( + name="bigbench:movie_dialog_same_or_different", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="movie_dialog_same_or_different", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +movie_recommendation = LightevalTaskConfig( + name="bigbench:movie_recommendation", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="movie_recommendation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +mult_data_wrangling = LightevalTaskConfig( + name="bigbench:mult_data_wrangling", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="mult_data_wrangling", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +navigate = LightevalTaskConfig( + name="bigbench:navigate", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="navigate", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +nonsense_words_grammar = LightevalTaskConfig( + name="bigbench:nonsense_words_grammar", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="nonsense_words_grammar", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + 
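# Note on scoring: the subsets around here that use Metrics.loglikelihood_acc are
# treated as multiple choice. The prompt function turns a dataset row into a Doc
# whose `choices` are the candidate answers and whose `gold_index` marks the
# correct one; the metric then keeps the choice the model assigns the highest
# log-probability. The sketch below is purely illustrative: the column names
# ("inputs", "multiple_choice_targets", "multiple_choice_scores") are assumptions
# about the tasksource/bigbench schema, and prompt.bigbench may format the query
# differently.
from lighteval.tasks.requests import Doc


def _bigbench_multiple_choice_sketch(line, task_name: str = None):
    # Assumed schema: a 1 in multiple_choice_scores flags the gold answer.
    gold_index = line["multiple_choice_scores"].index(1)
    return Doc(
        task_name=task_name,
        query=line["inputs"],
        choices=line["multiple_choice_targets"],
        gold_index=gold_index,
    )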
+novel_concepts = LightevalTaskConfig( + name="bigbench:novel_concepts", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="novel_concepts", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +object_counting = LightevalTaskConfig( + name="bigbench:object_counting", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="object_counting", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +odd_one_out = LightevalTaskConfig( + name="bigbench:odd_one_out", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="odd_one_out", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +operators = LightevalTaskConfig( + name="bigbench:operators", + suite=["lighteval"], + prompt_function=prompt.bigbench_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="operators", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +paragraph_segmentation = LightevalTaskConfig( + name="bigbench:paragraph_segmentation", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="paragraph_segmentation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +parsinlu_qa = LightevalTaskConfig( + name="bigbench:parsinlu_qa", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="parsinlu_qa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +parsinlu_reading_comprehension = LightevalTaskConfig( + name="bigbench:parsinlu_reading_comprehension", + suite=["lighteval"], + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="parsinlu_reading_comprehension", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=None, + version=0, +) + +penguins_in_a_table = LightevalTaskConfig( + name="bigbench:penguins_in_a_table", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="penguins_in_a_table", + hf_avail_splits=["default", "train", "validation"], + 
evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc, Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +periodic_elements = LightevalTaskConfig( + name="bigbench:periodic_elements", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="periodic_elements", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +persian_idioms = LightevalTaskConfig( + name="bigbench:persian_idioms", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="persian_idioms", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +phrase_relatedness = LightevalTaskConfig( + name="bigbench:phrase_relatedness", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="phrase_relatedness", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +physical_intuition = LightevalTaskConfig( + name="bigbench:physical_intuition", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="physical_intuition", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +physics = LightevalTaskConfig( + name="bigbench:physics", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="physics", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +physics_questions = LightevalTaskConfig( + name="bigbench:physics_questions", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="physics_questions", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +play_dialog_same_or_different = LightevalTaskConfig( + name="bigbench:play_dialog_same_or_different", + suite=["lighteval"], + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="play_dialog_same_or_different", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +polish_sequence_labeling = LightevalTaskConfig( + name="bigbench:polish_sequence_labeling", + 
suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="polish_sequence_labeling", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.f1_score], + stop_sequence=["\n"], + version=0, +) + +presuppositions_as_nli = LightevalTaskConfig( + name="bigbench:presuppositions_as_nli", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="presuppositions_as_nli", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +qa_wikidata = LightevalTaskConfig( + name="bigbench:qa_wikidata", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="qa_wikidata", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[ + Metrics.bleurt, + Metrics.bleu, + Metrics.rouge_t5, + Metrics.exact_match(sample_params={"strip_strings": False}), + ], + stop_sequence=["\n"], + version=0, +) + +question_selection = LightevalTaskConfig( + name="bigbench:question_selection", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="question_selection", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +real_or_fake_text = LightevalTaskConfig( + name="bigbench:real_or_fake_text", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="real_or_fake_text", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +reasoning_about_colored_objects = LightevalTaskConfig( + name="bigbench:reasoning_about_colored_objects", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="reasoning_about_colored_objects", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +repeat_copy_logic = LightevalTaskConfig( + name="bigbench:repeat_copy_logic", + suite=["lighteval"], + prompt_function=prompt.bigbench_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="repeat_copy_logic", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +rephrase = LightevalTaskConfig( + name="bigbench:rephrase", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="rephrase", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[ + 
Metrics.rouge_t5, + Metrics.bleu, + Metrics.loglikelihood_acc, + Metrics.exact_match(sample_params={"strip_strings": False}), + ], + stop_sequence=["\n"], + version=0, +) + +rhyming = LightevalTaskConfig( + name="bigbench:rhyming", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="rhyming", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +riddle_sense = LightevalTaskConfig( + name="bigbench:riddle_sense", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="riddle_sense", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +ruin_names = LightevalTaskConfig( + name="bigbench:ruin_names", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="ruin_names", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +salient_translation_error_detection = LightevalTaskConfig( + name="bigbench:salient_translation_error_detection", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="salient_translation_error_detection", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +scientific_press_release = LightevalTaskConfig( + name="bigbench:scientific_press_release", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="scientific_press_release", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +semantic_parsing_in_context_sparc = LightevalTaskConfig( + name="bigbench:semantic_parsing_in_context_sparc", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="semantic_parsing_in_context_sparc", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +semantic_parsing_spider = LightevalTaskConfig( + name="bigbench:semantic_parsing_spider", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="semantic_parsing_spider", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +sentence_ambiguity = 
LightevalTaskConfig( + name="bigbench:sentence_ambiguity", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="sentence_ambiguity", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +similarities_abstraction = LightevalTaskConfig( + name="bigbench:similarities_abstraction", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="similarities_abstraction", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +simp_turing_concept = LightevalTaskConfig( + name="bigbench:simp_turing_concept", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="simp_turing_concept", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +simple_arithmetic_json = LightevalTaskConfig( + name="bigbench:simple_arithmetic_json", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="simple_arithmetic_json", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +simple_arithmetic_json_multiple_choice = LightevalTaskConfig( + name="bigbench:simple_arithmetic_json_multiple_choice", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="simple_arithmetic_json_multiple_choice", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +simple_arithmetic_json_subtasks = LightevalTaskConfig( + name="bigbench:simple_arithmetic_json_subtasks", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="simple_arithmetic_json_subtasks", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +simple_arithmetic_multiple_targets_json = LightevalTaskConfig( + name="bigbench:simple_arithmetic_multiple_targets_json", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="simple_arithmetic_multiple_targets_json", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.bleu, Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +simple_ethical_questions = LightevalTaskConfig( + 
name="bigbench:simple_ethical_questions", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="simple_ethical_questions", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +simple_text_editing = LightevalTaskConfig( + name="bigbench:simple_text_editing", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="simple_text_editing", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +snarks = LightevalTaskConfig( + name="bigbench:snarks", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="snarks", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +social_iqa = LightevalTaskConfig( + name="bigbench:social_iqa", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="social_iqa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +social_support = LightevalTaskConfig( + name="bigbench:social_support", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="social_support", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.f1_score_macro], + stop_sequence=["\n"], + version=0, +) + +sports_understanding = LightevalTaskConfig( + name="bigbench:sports_understanding", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="sports_understanding", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +strange_stories = LightevalTaskConfig( + name="bigbench:strange_stories", + suite=["lighteval"], + prompt_function=prompt.bigbench_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="strange_stories", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +strategyqa = LightevalTaskConfig( + name="bigbench:strategyqa", + suite=["lighteval"], + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="strategyqa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + 
+sufficient_information = LightevalTaskConfig( + name="bigbench:sufficient_information", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="sufficient_information", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +suicide_risk = LightevalTaskConfig( + name="bigbench:suicide_risk", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="suicide_risk", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +swahili_english_proverbs = LightevalTaskConfig( + name="bigbench:swahili_english_proverbs", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="swahili_english_proverbs", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +swedish_to_german_proverbs = LightevalTaskConfig( + name="bigbench:swedish_to_german_proverbs", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="swedish_to_german_proverbs", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +symbol_interpretation = LightevalTaskConfig( + name="bigbench:symbol_interpretation", + suite=["lighteval"], + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="symbol_interpretation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +tellmewhy = LightevalTaskConfig( + name="bigbench:tellmewhy", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="tellmewhy", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.bleu, Metrics.rouge_t5], + stop_sequence=["\n"], + version=0, +) + +temporal_sequences = LightevalTaskConfig( + name="bigbench:temporal_sequences", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="temporal_sequences", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +tense = LightevalTaskConfig( + name="bigbench:tense", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="tense", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + 
metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +timedial = LightevalTaskConfig( + name="bigbench:timedial", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="timedial", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +topical_chat = LightevalTaskConfig( + name="bigbench:topical_chat", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="topical_chat", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.bleu, Metrics.rouge_t5, Metrics.loglikelihood_acc, Metrics.bleurt], + stop_sequence=["\n"], + version=0, +) + +tracking_shuffled_objects = LightevalTaskConfig( + name="bigbench:tracking_shuffled_objects", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="tracking_shuffled_objects", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +understanding_fables = LightevalTaskConfig( + name="bigbench:understanding_fables", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="understanding_fables", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +undo_permutation = LightevalTaskConfig( + name="bigbench:undo_permutation", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="undo_permutation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +unit_conversion = LightevalTaskConfig( + name="bigbench:unit_conversion", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="unit_conversion", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +unit_interpretation = LightevalTaskConfig( + name="bigbench:unit_interpretation", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="unit_interpretation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +unnatural_in_context_learning = LightevalTaskConfig( + name="bigbench:unnatural_in_context_learning", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="unnatural_in_context_learning", + hf_avail_splits=["default", "train", "validation"], + 
evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +vitaminc_fact_verification = LightevalTaskConfig( + name="bigbench:vitaminc_fact_verification", + suite=["lighteval"], + prompt_function=prompt.bigbench_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="vitaminc_fact_verification", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +what_is_the_tao = LightevalTaskConfig( + name="bigbench:what_is_the_tao", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="what_is_the_tao", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +which_wiki_edit = LightevalTaskConfig( + name="bigbench:which_wiki_edit", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="which_wiki_edit", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +winowhy = LightevalTaskConfig( + name="bigbench:winowhy", + suite=["lighteval"], + prompt_function=prompt.bigbench_whitespace_after_query, + hf_repo="tasksource/bigbench", + hf_subset="winowhy", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +word_sorting = LightevalTaskConfig( + name="bigbench:word_sorting", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="word_sorting", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +word_unscrambling = LightevalTaskConfig( + name="bigbench:word_unscrambling", + suite=["lighteval"], + prompt_function=prompt.bigbench, + hf_repo="tasksource/bigbench", + hf_subset="word_unscrambling", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + abstract_narrative_understanding, + anachronisms, + analogical_similarity, + moral_permissibility, + movie_dialog_same_or_different, + movie_recommendation, + mult_data_wrangling, + simple_ethical_questions, + simple_text_editing, + snarks, + social_iqa, + social_support, + sports_understanding, + strange_stories, + strategyqa, + sufficient_information, + suicide_risk, + swahili_english_proverbs, + swedish_to_german_proverbs, + symbol_interpretation, + tellmewhy, + temporal_sequences, + tense, + timedial, + topical_chat, + tracking_shuffled_objects, + understanding_fables, + 
undo_permutation, + unit_conversion, + unit_interpretation, + unnatural_in_context_learning, + vitaminc_fact_verification, + what_is_the_tao, + which_wiki_edit, + winowhy, + word_sorting, + word_unscrambling, +] diff --git a/src/lighteval/tasks/tasks/bigbench_hard.py b/src/lighteval/tasks/tasks/bigbench_hard.py new file mode 100644 index 000000000..f17781c2b --- /dev/null +++ b/src/lighteval/tasks/tasks/bigbench_hard.py @@ -0,0 +1,330 @@ +""" +name: +Bigbench Hard + +dataset: +lighteval/bbh + +abstract: + +languages: + +tags: +reasoning + +paper: +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +causal_judgment = LightevalTaskConfig( + name="bigbench_hard:causal_judgment", + suite=["lighteval"], + prompt_function=prompt.bbh_lighteval, + hf_repo="lighteval/bbh", + hf_subset="causal_judgement", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +date_understanding = LightevalTaskConfig( + name="bigbench_hard:date_understanding", + suite=["lighteval"], + prompt_function=prompt.bbh_lighteval, + hf_repo="lighteval/bbh", + hf_subset="date_understanding", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +disambiguation_qa = LightevalTaskConfig( + name="bigbench_hard:disambiguation_qa", + suite=["lighteval"], + prompt_function=prompt.bbh_lighteval, + hf_repo="lighteval/bbh", + hf_subset="disambiguation_qa", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +geometric_shapes = LightevalTaskConfig( + name="bigbench_hard:geometric_shapes", + suite=["lighteval"], + prompt_function=prompt.bbh_lighteval, + hf_repo="lighteval/bbh", + hf_subset="geometric_shapes", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +logical_deduction_five_objects = LightevalTaskConfig( + name="bigbench_hard:logical_deduction_five_objects", + suite=["lighteval"], + prompt_function=prompt.bbh_lighteval, + hf_repo="lighteval/bbh", + hf_subset="logical_deduction_five_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +logical_deduction_seven_objects = LightevalTaskConfig( + name="bigbench_hard:logical_deduction_seven_objects", + suite=["lighteval"], + prompt_function=prompt.bbh_lighteval, + hf_repo="lighteval/bbh", + hf_subset="logical_deduction_seven_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +logical_deduction_three_objects = LightevalTaskConfig( + name="bigbench_hard:logical_deduction_three_objects", + suite=["lighteval"], + 
prompt_function=prompt.bbh_lighteval, + hf_repo="lighteval/bbh", + hf_subset="logical_deduction_three_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +movie_recommendation = LightevalTaskConfig( + name="bigbench_hard:movie_recommendation", + suite=["lighteval"], + prompt_function=prompt.bbh_lighteval, + hf_repo="lighteval/bbh", + hf_subset="movie_recommendation", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +navigate = LightevalTaskConfig( + name="bigbench_hard:navigate", + suite=["lighteval"], + prompt_function=prompt.bbh_lighteval, + hf_repo="lighteval/bbh", + hf_subset="navigate", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +reasoning_about_colored_objects = LightevalTaskConfig( + name="bigbench_hard:reasoning_about_colored_objects", + suite=["lighteval"], + prompt_function=prompt.bbh_lighteval, + hf_repo="lighteval/bbh", + hf_subset="reasoning_about_colored_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +ruin_names = LightevalTaskConfig( + name="bigbench_hard:ruin_names", + suite=["lighteval"], + prompt_function=prompt.bbh_lighteval, + hf_repo="lighteval/bbh", + hf_subset="ruin_names", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +salient_translation_error_detection = LightevalTaskConfig( + name="bigbench_hard:salient_translation_error_detection", + suite=["lighteval"], + prompt_function=prompt.bbh_lighteval, + hf_repo="lighteval/bbh", + hf_subset="salient_translation_error_detection", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +snarks = LightevalTaskConfig( + name="bigbench_hard:snarks", + suite=["lighteval"], + prompt_function=prompt.bbh_lighteval, + hf_repo="lighteval/bbh", + hf_subset="snarks", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +sports_understanding = LightevalTaskConfig( + name="bigbench_hard:sports_understanding", + suite=["lighteval"], + prompt_function=prompt.bbh_lighteval, + hf_repo="lighteval/bbh", + hf_subset="sports_understanding", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +temporal_sequences = LightevalTaskConfig( + name="bigbench_hard:temporal_sequences", + suite=["lighteval"], + prompt_function=prompt.bbh_lighteval, + 
hf_repo="lighteval/bbh", + hf_subset="temporal_sequences", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +tracking_shuffled_objects_five_objects = LightevalTaskConfig( + name="bigbench_hard:tracking_shuffled_objects_five_objects", + suite=["lighteval"], + prompt_function=prompt.bbh_lighteval, + hf_repo="lighteval/bbh", + hf_subset="tracking_shuffled_objects_five_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +tracking_shuffled_objects_seven_objects = LightevalTaskConfig( + name="bigbench_hard:tracking_shuffled_objects_seven_objects", + suite=["lighteval"], + prompt_function=prompt.bbh_lighteval, + hf_repo="lighteval/bbh", + hf_subset="tracking_shuffled_objects_seven_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +tracking_shuffled_objects_three_objects = LightevalTaskConfig( + name="bigbench_hard:tracking_shuffled_objects_three_objects", + suite=["lighteval"], + prompt_function=prompt.bbh_lighteval, + hf_repo="lighteval/bbh", + hf_subset="tracking_shuffled_objects_three_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["", "Q=", "\n\n"], + version=0, +) + +TASKS_TABLE = [ + causal_judgment, + date_understanding, + disambiguation_qa, + geometric_shapes, + logical_deduction_five_objects, + logical_deduction_seven_objects, + logical_deduction_three_objects, + movie_recommendation, + navigate, + reasoning_about_colored_objects, + ruin_names, + salient_translation_error_detection, + snarks, + sports_understanding, + temporal_sequences, + tracking_shuffled_objects_five_objects, + tracking_shuffled_objects_seven_objects, + tracking_shuffled_objects_three_objects, +] diff --git a/src/lighteval/tasks/tasks/blimp.py b/src/lighteval/tasks/tasks/blimp.py new file mode 100644 index 000000000..822122bda --- /dev/null +++ b/src/lighteval/tasks/tasks/blimp.py @@ -0,0 +1,1141 @@ +""" +name: +Blimp + +dataset: +nyu-mll/blimp + +abstract: +BLiMP is a challenge set for evaluating what language models (LMs) know +about major grammatical phenomena in English. BLiMP consists of 67 +sub-datasets, each containing 1000 minimal pairs isolating specific +contrasts in syntax, morphology, or semantics. The data is automatically +generated according to expert-crafted grammars. 
+ +languages: +english + +tags: +language-modeling + +paper: +https://arxiv.org/abs/1912.00582 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +blimp_adjunct_island = LightevalTaskConfig( + name="blimp:adjunct_island", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="adjunct_island", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_anaphor_gender_agreement = LightevalTaskConfig( + name="blimp:anaphor_gender_agreement", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="anaphor_gender_agreement", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_anaphor_number_agreement = LightevalTaskConfig( + name="blimp:anaphor_number_agreement", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="anaphor_number_agreement", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_animate_subject_passive = LightevalTaskConfig( + name="blimp:animate_subject_passive", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="animate_subject_passive", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_animate_subject_trans = LightevalTaskConfig( + name="blimp:animate_subject_trans", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="animate_subject_trans", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_causative = LightevalTaskConfig( + name="blimp:causative", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="causative", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_complex_NP_island = LightevalTaskConfig( + name="blimp:complex_NP_island", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="complex_NP_island", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_coordinate_structure_constraint_complex_left_branch = LightevalTaskConfig( + name="blimp:coordinate_structure_constraint_complex_left_branch", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="coordinate_structure_constraint_complex_left_branch", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_coordinate_structure_constraint_object_extraction = LightevalTaskConfig( + name="blimp:coordinate_structure_constraint_object_extraction", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="coordinate_structure_constraint_object_extraction", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_determiner_noun_agreement_1 = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_1", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="determiner_noun_agreement_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_determiner_noun_agreement_2 = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_2", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="determiner_noun_agreement_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_determiner_noun_agreement_irregular_1 = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_irregular_1", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="determiner_noun_agreement_irregular_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_determiner_noun_agreement_irregular_2 = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_irregular_2", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="determiner_noun_agreement_irregular_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_determiner_noun_agreement_with_adj_2 = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adj_2", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="determiner_noun_agreement_with_adj_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_determiner_noun_agreement_with_adj_irregular_1 = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adj_irregular_1", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="determiner_noun_agreement_with_adj_irregular_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_determiner_noun_agreement_with_adj_irregular_2 = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adj_irregular_2", + 
suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="determiner_noun_agreement_with_adj_irregular_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_determiner_noun_agreement_with_adjective_1 = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adjective_1", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="determiner_noun_agreement_with_adjective_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_distractor_agreement_relational_noun = LightevalTaskConfig( + name="blimp:distractor_agreement_relational_noun", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="distractor_agreement_relational_noun", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_distractor_agreement_relative_clause = LightevalTaskConfig( + name="blimp:distractor_agreement_relative_clause", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="distractor_agreement_relative_clause", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_drop_argument = LightevalTaskConfig( + name="blimp:drop_argument", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="drop_argument", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_ellipsis_n_bar_1 = LightevalTaskConfig( + name="blimp:ellipsis_n_bar_1", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="ellipsis_n_bar_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_ellipsis_n_bar_2 = LightevalTaskConfig( + name="blimp:ellipsis_n_bar_2", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="ellipsis_n_bar_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_existential_there_object_raising = LightevalTaskConfig( + name="blimp:existential_there_object_raising", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="existential_there_object_raising", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_existential_there_quantifiers_1 = LightevalTaskConfig( + name="blimp:existential_there_quantifiers_1", + 
suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="existential_there_quantifiers_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_existential_there_quantifiers_2 = LightevalTaskConfig( + name="blimp:existential_there_quantifiers_2", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="existential_there_quantifiers_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_existential_there_subject_raising = LightevalTaskConfig( + name="blimp:existential_there_subject_raising", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="existential_there_subject_raising", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_expletive_it_object_raising = LightevalTaskConfig( + name="blimp:expletive_it_object_raising", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="expletive_it_object_raising", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_inchoative = LightevalTaskConfig( + name="blimp:inchoative", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="inchoative", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_intransitive = LightevalTaskConfig( + name="blimp:intransitive", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="intransitive", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_irregular_past_participle_adjectives = LightevalTaskConfig( + name="blimp:irregular_past_participle_adjectives", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="irregular_past_participle_adjectives", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_irregular_past_participle_verbs = LightevalTaskConfig( + name="blimp:irregular_past_participle_verbs", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="irregular_past_participle_verbs", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_irregular_plural_subject_verb_agreement_1 = LightevalTaskConfig( + name="blimp:irregular_plural_subject_verb_agreement_1", + suite=["lighteval"], + 
prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="irregular_plural_subject_verb_agreement_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_irregular_plural_subject_verb_agreement_2 = LightevalTaskConfig( + name="blimp:irregular_plural_subject_verb_agreement_2", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="irregular_plural_subject_verb_agreement_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_left_branch_island_echo_question = LightevalTaskConfig( + name="blimp:left_branch_island_echo_question", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="left_branch_island_echo_question", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_left_branch_island_simple_question = LightevalTaskConfig( + name="blimp:left_branch_island_simple_question", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="left_branch_island_simple_question", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_matrix_question_npi_licensor_present = LightevalTaskConfig( + name="blimp:matrix_question_npi_licensor_present", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="matrix_question_npi_licensor_present", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_npi_present_1 = LightevalTaskConfig( + name="blimp:npi_present_1", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="npi_present_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_npi_present_2 = LightevalTaskConfig( + name="blimp:npi_present_2", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="npi_present_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_only_npi_licensor_present = LightevalTaskConfig( + name="blimp:only_npi_licensor_present", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="only_npi_licensor_present", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_only_npi_scope = LightevalTaskConfig( + name="blimp:only_npi_scope", + suite=["lighteval"], + prompt_function=prompt.blimp, + 
hf_repo="nyu-mll/blimp", + hf_subset="only_npi_scope", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_passive_1 = LightevalTaskConfig( + name="blimp:passive_1", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="passive_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_passive_2 = LightevalTaskConfig( + name="blimp:passive_2", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="passive_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_principle_A_c_command = LightevalTaskConfig( + name="blimp:principle_A_c_command", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="principle_A_c_command", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_principle_A_case_1 = LightevalTaskConfig( + name="blimp:principle_A_case_1", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="principle_A_case_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_principle_A_case_2 = LightevalTaskConfig( + name="blimp:principle_A_case_2", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="principle_A_case_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_principle_A_domain_1 = LightevalTaskConfig( + name="blimp:principle_A_domain_1", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="principle_A_domain_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_principle_A_domain_2 = LightevalTaskConfig( + name="blimp:principle_A_domain_2", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="principle_A_domain_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_principle_A_domain_3 = LightevalTaskConfig( + name="blimp:principle_A_domain_3", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="principle_A_domain_3", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + 
+blimp_principle_A_reconstruction = LightevalTaskConfig( + name="blimp:principle_A_reconstruction", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="principle_A_reconstruction", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_regular_plural_subject_verb_agreement_1 = LightevalTaskConfig( + name="blimp:regular_plural_subject_verb_agreement_1", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="regular_plural_subject_verb_agreement_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_regular_plural_subject_verb_agreement_2 = LightevalTaskConfig( + name="blimp:regular_plural_subject_verb_agreement_2", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="regular_plural_subject_verb_agreement_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_sentential_negation_npi_licensor_present = LightevalTaskConfig( + name="blimp:sentential_negation_npi_licensor_present", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="sentential_negation_npi_licensor_present", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_sentential_negation_npi_scope = LightevalTaskConfig( + name="blimp:sentential_negation_npi_scope", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="sentential_negation_npi_scope", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_sentential_subject_island = LightevalTaskConfig( + name="blimp:sentential_subject_island", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="sentential_subject_island", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_superlative_quantifiers_1 = LightevalTaskConfig( + name="blimp:superlative_quantifiers_1", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="superlative_quantifiers_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_superlative_quantifiers_2 = LightevalTaskConfig( + name="blimp:superlative_quantifiers_2", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="superlative_quantifiers_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + 
metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_tough_vs_raising_1 = LightevalTaskConfig( + name="blimp:tough_vs_raising_1", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="tough_vs_raising_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_tough_vs_raising_2 = LightevalTaskConfig( + name="blimp:tough_vs_raising_2", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="tough_vs_raising_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_transitive = LightevalTaskConfig( + name="blimp:transitive", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="transitive", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_wh_island = LightevalTaskConfig( + name="blimp:wh_island", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="wh_island", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_wh_questions_object_gap = LightevalTaskConfig( + name="blimp:wh_questions_object_gap", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="wh_questions_object_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_wh_questions_subject_gap = LightevalTaskConfig( + name="blimp:wh_questions_subject_gap", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="wh_questions_subject_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_wh_questions_subject_gap_long_distance = LightevalTaskConfig( + name="blimp:wh_questions_subject_gap_long_distance", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="wh_questions_subject_gap_long_distance", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_wh_vs_that_no_gap = LightevalTaskConfig( + name="blimp:wh_vs_that_no_gap", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="wh_vs_that_no_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_wh_vs_that_no_gap_long_distance = LightevalTaskConfig( + name="blimp:wh_vs_that_no_gap_long_distance", + suite=["lighteval"], + 
prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="wh_vs_that_no_gap_long_distance", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_wh_vs_that_with_gap = LightevalTaskConfig( + name="blimp:wh_vs_that_with_gap", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="wh_vs_that_with_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +blimp_wh_vs_that_with_gap_long_distance = LightevalTaskConfig( + name="blimp:wh_vs_that_with_gap_long_distance", + suite=["lighteval"], + prompt_function=prompt.blimp, + hf_repo="nyu-mll/blimp", + hf_subset="wh_vs_that_with_gap_long_distance", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + blimp_adjunct_island, + blimp_anaphor_gender_agreement, + blimp_anaphor_number_agreement, + blimp_animate_subject_passive, + blimp_animate_subject_trans, + blimp_causative, + blimp_complex_NP_island, + blimp_drop_argument, + blimp_ellipsis_n_bar_1, + blimp_ellipsis_n_bar_2, + blimp_existential_there_object_raising, + blimp_inchoative, + blimp_intransitive, + blimp_irregular_past_participle_adjectives, + blimp_irregular_past_participle_verbs, + blimp_only_npi_scope, + blimp_passive_1, + blimp_passive_2, + blimp_principle_A_c_command, + blimp_principle_A_reconstruction, + blimp_regular_plural_subject_verb_agreement_1, + blimp_regular_plural_subject_verb_agreement_2, + blimp_sentential_negation_npi_licensor_present, + blimp_sentential_negation_npi_scope, + blimp_sentential_subject_island, + blimp_superlative_quantifiers_1, + blimp_superlative_quantifiers_2, + blimp_tough_vs_raising_1, + blimp_tough_vs_raising_2, + blimp_transitive, + blimp_wh_island, + blimp_wh_questions_object_gap, + blimp_wh_questions_subject_gap, + blimp_wh_questions_subject_gap_long_distance, + blimp_wh_vs_that_no_gap, + blimp_wh_vs_that_no_gap_long_distance, + blimp_wh_vs_that_with_gap, + blimp_wh_vs_that_with_gap_long_distance, +] diff --git a/src/lighteval/tasks/tasks/bold.py b/src/lighteval/tasks/tasks/bold.py new file mode 100644 index 000000000..f1345a533 --- /dev/null +++ b/src/lighteval/tasks/tasks/bold.py @@ -0,0 +1,130 @@ +""" +name: +Bold + +dataset: +lighteval/bold_helm + +abstract: +The Bias in Open-Ended Language Generation Dataset (BOLD) for measuring biases +and toxicity in open-ended language generation. 
+ +languages: +english + +tags: +bias, generation + +paper: +https://dl.acm.org/doi/10.1145/3442188.3445924 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +bold = LightevalTaskConfig( + name="bold", + suite=["lighteval"], + prompt_function=prompt.bold, + hf_repo="lighteval/bold_helm", + hf_subset="all", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.prediction_perplexity], + stop_sequence=["\n"], + version=0, +) + +bold_gender = LightevalTaskConfig( + name="bold:gender", + suite=["lighteval"], + prompt_function=prompt.bold, + hf_repo="lighteval/bold_helm", + hf_subset="gender", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.prediction_perplexity], + stop_sequence=["\n"], + version=0, +) + +bold_political_ideology = LightevalTaskConfig( + name="bold:political_ideology", + suite=["lighteval"], + prompt_function=prompt.bold, + hf_repo="lighteval/bold_helm", + hf_subset="political_ideology", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.prediction_perplexity], + stop_sequence=["\n"], + version=0, +) + +bold_profession = LightevalTaskConfig( + name="bold:profession", + suite=["lighteval"], + prompt_function=prompt.bold, + hf_repo="lighteval/bold_helm", + hf_subset="profession", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.prediction_perplexity], + stop_sequence=["\n"], + version=0, +) + +bold_race = LightevalTaskConfig( + name="bold:race", + suite=["lighteval"], + prompt_function=prompt.bold, + hf_repo="lighteval/bold_helm", + hf_subset="race", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.prediction_perplexity], + stop_sequence=["\n"], + version=0, +) + +bold_religious_ideology = LightevalTaskConfig( + name="bold:religious_ideology", + suite=["lighteval"], + prompt_function=prompt.bold, + hf_repo="lighteval/bold_helm", + hf_subset="religious_ideology", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.prediction_perplexity], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + bold, + bold_gender, + bold_political_ideology, + bold_profession, + bold_race, + bold_religious_ideology, +] diff --git a/src/lighteval/tasks/tasks/boolq.py b/src/lighteval/tasks/tasks/boolq.py new file mode 100644 index 000000000..b086ab1cb --- /dev/null +++ b/src/lighteval/tasks/tasks/boolq.py @@ -0,0 +1,66 @@ +""" +name: +Boolq + +dataset: +lighteval/boolq_helm + +abstract: +The BoolQ benchmark for binary (yes/no) question answering. 
+ +languages: +english + +tags: +qa + +paper: +https://arxiv.org/abs/1905.10044 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +boolq = LightevalTaskConfig( + name="boolq", + suite=["lighteval"], + prompt_function=prompt.boolq_helm, + hf_repo="lighteval/boolq_helm", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + + +boolq_contrastset = LightevalTaskConfig( + name="boolq:contrastset", + suite=["lighteval"], + prompt_function=prompt.boolq_helm_contrastset, + hf_repo="lighteval/boolq_helm", + hf_subset="default", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + boolq, + boolq_contrastset, +] diff --git a/src/lighteval/tasks/tasks/civil_comments.py b/src/lighteval/tasks/tasks/civil_comments.py new file mode 100644 index 000000000..608ab097c --- /dev/null +++ b/src/lighteval/tasks/tasks/civil_comments.py @@ -0,0 +1,180 @@ +""" +name: +Civil Comments + +dataset: +lighteval/civil_comments_helm + +abstract: +The CivilComments benchmark for toxicity detection. + +languages: +english + +tags: +bias, classification + +paper: +https://arxiv.org/abs/1903.04561 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +civil_comments = LightevalTaskConfig( + name="civil_comments", + suite=["lighteval"], + prompt_function=prompt.civil_comments, + hf_repo="lighteval/civil_comments_helm", + hf_subset="all", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +civil_comments_LGBTQ = LightevalTaskConfig( + name="civil_comments:LGBTQ", + suite=["lighteval"], + prompt_function=prompt.civil_comments, + hf_repo="lighteval/civil_comments_helm", + hf_subset="LGBTQ", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +civil_comments_black = LightevalTaskConfig( + name="civil_comments:black", + suite=["lighteval"], + prompt_function=prompt.civil_comments, + hf_repo="lighteval/civil_comments_helm", + hf_subset="black", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +civil_comments_christian = LightevalTaskConfig( + name="civil_comments:christian", + suite=["lighteval"], + prompt_function=prompt.civil_comments, + hf_repo="lighteval/civil_comments_helm", + hf_subset="christian", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +civil_comments_female = LightevalTaskConfig( + name="civil_comments:female", + suite=["lighteval"], + 
prompt_function=prompt.civil_comments, + hf_repo="lighteval/civil_comments_helm", + hf_subset="female", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +civil_comments_male = LightevalTaskConfig( + name="civil_comments:male", + suite=["lighteval"], + prompt_function=prompt.civil_comments, + hf_repo="lighteval/civil_comments_helm", + hf_subset="male", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +civil_comments_muslim = LightevalTaskConfig( + name="civil_comments:muslim", + suite=["lighteval"], + prompt_function=prompt.civil_comments, + hf_repo="lighteval/civil_comments_helm", + hf_subset="muslim", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +civil_comments_other_religions = LightevalTaskConfig( + name="civil_comments:other_religions", + suite=["lighteval"], + prompt_function=prompt.civil_comments, + hf_repo="lighteval/civil_comments_helm", + hf_subset="other_religions", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +civil_comments_white = LightevalTaskConfig( + name="civil_comments:white", + suite=["lighteval"], + prompt_function=prompt.civil_comments, + hf_repo="lighteval/civil_comments_helm", + hf_subset="white", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + civil_comments, + civil_comments_LGBTQ, + civil_comments_black, + civil_comments_christian, + civil_comments_female, + civil_comments_male, + civil_comments_muslim, + civil_comments_other_religions, + civil_comments_white, +] diff --git a/src/lighteval/tasks/tasks/commonsenseqa.py b/src/lighteval/tasks/tasks/commonsenseqa.py new file mode 100644 index 000000000..8c6f6c6de --- /dev/null +++ b/src/lighteval/tasks/tasks/commonsenseqa.py @@ -0,0 +1,49 @@ +""" +name: +Commonsenseqa + +dataset: +tau/commonsense_qa + +abstract: +CommonsenseQA is a new multiple-choice question answering dataset that requires +different types of commonsense knowledge to predict the correct answers . It +contains 12,102 questions with one correct answer and four distractor answers. +The dataset is provided in two major training/validation/testing set splits: +"Random split" which is the main evaluation split, and "Question token split", +see paper for details. 
+ +languages: +english + +tags: +commonsense, multiple-choice, qa + +paper: +https://arxiv.org/abs/1811.00937 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +commonsenseqa = LightevalTaskConfig( + name="commonsenseqa", + suite=["lighteval"], + prompt_function=prompt.commonsense_qa, + hf_repo="tau/commonsense_qa", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + commonsenseqa, +] diff --git a/src/lighteval/tasks/tasks/coqa.py b/src/lighteval/tasks/tasks/coqa.py new file mode 100644 index 000000000..a11b6a7a1 --- /dev/null +++ b/src/lighteval/tasks/tasks/coqa.py @@ -0,0 +1,45 @@ +""" +name: +Coqa + +dataset: +stanfordnlp/coqa + +abstract: +CoQA is a large-scale dataset for building Conversational Question Answering +systems. The goal of the CoQA challenge is to measure the ability of machines to +understand a text passage and answer a series of interconnected questions that +appear in a conversation. + +languages: +english + +tags: +dialog, qa + +paper: +https://arxiv.org/abs/1808.07042 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +coqa_first_question = LightevalTaskConfig( + name="coqa", + prompt_function=prompt.coqa, + suite=["lighteval"], + hf_repo="stanfordnlp/coqa", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + stop_sequence=["\n", "Question:", "question:"], + generation_size=100, + version=1, + metrics=[Metrics.exact_match], +) + +TASKS_TABLE = [ + coqa_first_question, +] diff --git a/src/lighteval/tasks/tasks/covid_dialogue.py b/src/lighteval/tasks/tasks/covid_dialogue.py new file mode 100644 index 000000000..bce5e17ce --- /dev/null +++ b/src/lighteval/tasks/tasks/covid_dialogue.py @@ -0,0 +1,45 @@ +""" +name: +Covid Dialogue + +dataset: +lighteval/covid_dialogue + +abstract: +The COVID-19 Dialogue dataset is a collection of 500+ dialogues between +doctors and patients during the COVID-19 pandemic. 
+ +languages: +english + +tags: +dialog, medical + +paper: +https://arxiv.org/abs/2004.06561 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +covid_dialogue = LightevalTaskConfig( + name="covid_dialogue", + suite=["lighteval"], + prompt_function=prompt.covid_dialogue, + hf_repo="lighteval/covid_dialogue", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + covid_dialogue, +] diff --git a/community_tasks/custom_task_classification_grammar_task.py b/src/lighteval/tasks/tasks/custom_task_classification_grammar_task.py similarity index 86% rename from community_tasks/custom_task_classification_grammar_task.py rename to src/lighteval/tasks/tasks/custom_task_classification_grammar_task.py index 5b248093b..04a715149 100644 --- a/community_tasks/custom_task_classification_grammar_task.py +++ b/src/lighteval/tasks/tasks/custom_task_classification_grammar_task.py @@ -1,59 +1,21 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -# ruff: noqa: F405, F403, F401 -"""Emotion Classification Task with Grammar Constraints using LightEval - -This module demonstrates how to create a classification task in LightEval with JSON grammar-constrained generation for structured responses. - - -The task performs emotion classification on the 'emotion' dataset from HuggingFace Hub, -classifying text into one of six emotion categories: sadness, joy, love, anger, fear, surprise. - -Example usage: - TGI endpoint evaluation: - ```bash - uv run --active --extra litellm --extra tgi lighteval endpoint tgi examples/model_configs/tgi_model.yaml "custom|emotion_classification|0" - --custom-tasks examples/custom_tasks_templates/custom_task_classification_grammar_task.py - --output-dir results - --save-details - --no-public-run - ``` - -Dataset: - The task uses the 'emotion' dataset from HuggingFace Hub, which contains - English Twitter messages labeled with one of six emotions. 
The dataset - includes train/validation/test splits with the following distribution: - - Total samples: ~416k (train: ~16k, validation: ~2k, test: ~2k) - - Labels: sadness, joy, love, anger, fear, surprise - - Text format: Short social media posts in English - -Customization: - To adapt this task for other classification problems: - 1. Update EMOTION_LABELS with your target labels - 2. Modify prompt_emotion_classification() for your use case - 3. Update the grammar schema in get_emotion_classification_grammar() - 4. Adjust the HuggingFace dataset reference in EMOTION_CLASSIFICATION_TASK - 5. Update metric calculations in emotion_classification_metric() if needed +""" +name: +Emotion Classification + +dataset: +dair-ai/emotion + +abstract: +This task performs emotion classification, assigning each text to one of six +emotion categories: sadness, joy, love, anger, fear, surprise. + +languages: +english + +tags: +emotion, classification, multiple-choice + +paper: """ import json diff --git a/src/lighteval/tasks/tasks/drop_qa.py b/src/lighteval/tasks/tasks/drop_qa.py new file mode 100644 index 000000000..9e4b23bd7 --- /dev/null +++ b/src/lighteval/tasks/tasks/drop_qa.py @@ -0,0 +1,68 @@ +""" +name: +Drop Qa + +dataset: +lighteval/drop_harness + +abstract: +The DROP dataset is a question-answering benchmark designed to evaluate the +ability of language models to answer questions that require discrete reasoning +over paragraphs, such as counting, sorting, and arithmetic. + +languages: +english + +tags: +math, qa, reasoning + +paper: +https://arxiv.org/abs/1903.00161 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +drop_qa = LightevalTaskConfig( + name="drop", + prompt_function=get_qa_prompt_function( + Language.ENGLISH, + lambda line: { + "context": line["passage"], + "question": line["question"], + "choices": list( + filter( + lambda x: x, + [line["answer"].get("number")] + + line["answer"]["spans"] + + [prompt.get_drop_date(line["answer"].get("date"))], + ) + ), + }, + ), + suite=("lighteval",), + hf_repo="lighteval/drop_harness", + hf_subset="default", + hf_filter=lambda line: list( + filter( + lambda x: x, + [line["answer"].get("number")] + + line["answer"]["spans"] + + [prompt.get_drop_date(line["answer"].get("date"))], + ) + ), + evaluation_splits=("validation",), + few_shots_split="train", + generation_size=250, + stop_sequence=["Question:", "question:", "\n"], + metrics=[Metrics.exact_match], + version=1, +) + +TASKS_TABLE = [ + drop_qa, +] diff --git a/src/lighteval/tasks/tasks/dyck_language.py b/src/lighteval/tasks/tasks/dyck_language.py new file mode 100644 index 000000000..ff2e536ea --- /dev/null +++ b/src/lighteval/tasks/tasks/dyck_language.py @@ -0,0 +1,80 @@ +""" +name: +Dyck Language + +dataset: +lighteval/DyckLanguage + +abstract: +Scenario testing hierarchical reasoning through the Dyck formal languages. 
+ +languages: +english + +tags: +reasoning + +paper: +https://aclanthology.org/W19-3905/ +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +dyck_language_2 = LightevalTaskConfig( + name="dyck_language:2", + suite=["lighteval"], + prompt_function=prompt.dyck_language, + hf_repo="lighteval/DyckLanguage", + hf_subset="2", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + + +dyck_language_3 = LightevalTaskConfig( + name="dyck_language:3", + suite=["lighteval"], + prompt_function=prompt.dyck_language, + hf_repo="lighteval/DyckLanguage", + hf_subset="3", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + + +dyck_language_4 = LightevalTaskConfig( + name="dyck_language:4", + suite=["lighteval"], + prompt_function=prompt.dyck_language, + hf_repo="lighteval/DyckLanguage", + hf_subset="4", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + dyck_language_2, + dyck_language_3, + dyck_language_4, +] diff --git a/src/lighteval/tasks/tasks/entity_data_imputation.py b/src/lighteval/tasks/tasks/entity_data_imputation.py new file mode 100644 index 000000000..309e0585d --- /dev/null +++ b/src/lighteval/tasks/tasks/entity_data_imputation.py @@ -0,0 +1,66 @@ +""" +name: +Entity Data Imputation + +dataset: +lighteval/Buy, lighteval/Restaurant + +abstract: +Scenario that tests the ability to impute missing entities in a data table. + +languages: +english + +tags: +reasoning + +paper: +https://ieeexplore.ieee.org/document/9458712 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +entity_data_imputation_Buy = LightevalTaskConfig( + name="entity_data_imputation:Buy", + suite=["lighteval"], + prompt_function=prompt.entity_data_imputation, + hf_repo="lighteval/Buy", + hf_subset="default", + hf_avail_splits=["train", "test", "valid"], + evaluation_splits=["valid", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + + +entity_data_imputation_Restaurant = LightevalTaskConfig( + name="entity_data_imputation:Restaurant", + suite=["lighteval"], + prompt_function=prompt.entity_data_imputation, + hf_repo="lighteval/Restaurant", + hf_subset="default", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + entity_data_imputation_Buy, + entity_data_imputation_Restaurant, +] diff --git a/src/lighteval/tasks/tasks/entitymatching.py b/src/lighteval/tasks/tasks/entitymatching.py new file mode 100644 index 000000000..c251244b2 --- /dev/null +++ b/src/lighteval/tasks/tasks/entitymatching.py @@ -0,0 +1,248 @@ +""" +name: +Entitymatching + +dataset: +lighteval/EntityMatching + +abstract: +Simple entity matching benchmark. 
+ +languages: +english + +tags: +classification, reasoning + +paper: +https://dl.acm.org/doi/10.14778/3007263.3007314 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +entity_matching_Abt_Buy = LightevalTaskConfig( + name="entity_matching:Abt_Buy", + suite=["lighteval"], + prompt_function=prompt.entity_matching, + hf_repo="lighteval/EntityMatching", + hf_subset="Abt_Buy", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +entity_matching_Amazon_Google = LightevalTaskConfig( + name="entity_matching:Amazon_Google", + suite=["lighteval"], + prompt_function=prompt.entity_matching, + hf_repo="lighteval/EntityMatching", + hf_subset="Amazon_Google", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +entity_matching_Beer = LightevalTaskConfig( + name="entity_matching:Beer", + suite=["lighteval"], + prompt_function=prompt.entity_matching, + hf_repo="lighteval/EntityMatching", + hf_subset="Beer", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +entity_matching_Company = LightevalTaskConfig( + name="entity_matching:Company", + suite=["lighteval"], + prompt_function=prompt.entity_matching, + hf_repo="lighteval/EntityMatching", + hf_subset="Company", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +entity_matching_DBLP_ACM = LightevalTaskConfig( + name="entity_matching:DBLP_ACM", + suite=["lighteval"], + prompt_function=prompt.entity_matching, + hf_repo="lighteval/EntityMatching", + hf_subset="DBLP_ACM", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +entity_matching_DBLP_GoogleScholar = LightevalTaskConfig( + name="entity_matching:DBLP_GoogleScholar", + suite=["lighteval"], + prompt_function=prompt.entity_matching, + hf_repo="lighteval/EntityMatching", + hf_subset="DBLP_GoogleScholar", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +entity_matching_Dirty_DBLP_ACM = LightevalTaskConfig( + name="entity_matching:Dirty_DBLP_ACM", + suite=["lighteval"], + prompt_function=prompt.entity_matching, + hf_repo="lighteval/EntityMatching", + hf_subset="Dirty_DBLP_ACM", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +entity_matching_Dirty_DBLP_GoogleScholar = 
LightevalTaskConfig( + name="entity_matching:Dirty_DBLP_GoogleScholar", + suite=["lighteval"], + prompt_function=prompt.entity_matching, + hf_repo="lighteval/EntityMatching", + hf_subset="Dirty_DBLP_GoogleScholar", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +entity_matching_Dirty_Walmart_Amazon = LightevalTaskConfig( + name="entity_matching:Dirty_Walmart_Amazon", + suite=["lighteval"], + prompt_function=prompt.entity_matching, + hf_repo="lighteval/EntityMatching", + hf_subset="Dirty_Walmart_Amazon", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +entity_matching_Dirty_iTunes_Amazon = LightevalTaskConfig( + name="entity_matching:Dirty_iTunes_Amazon", + suite=["lighteval"], + prompt_function=prompt.entity_matching, + hf_repo="lighteval/EntityMatching", + hf_subset="Dirty_iTunes_Amazon", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +entity_matching_Fodors_Zagats = LightevalTaskConfig( + name="entity_matching:Fodors_Zagats", + suite=["lighteval"], + prompt_function=prompt.entity_matching, + hf_repo="lighteval/EntityMatching", + hf_subset="Fodors_Zagats", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +entity_matching_Walmart_Amazon = LightevalTaskConfig( + name="entity_matching:Walmart_Amazon", + suite=["lighteval"], + prompt_function=prompt.entity_matching, + hf_repo="lighteval/EntityMatching", + hf_subset="Walmart_Amazon", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +entity_matching_iTunes_Amazon = LightevalTaskConfig( + name="entity_matching:iTunes_Amazon", + suite=["lighteval"], + prompt_function=prompt.entity_matching, + hf_repo="lighteval/EntityMatching", + hf_subset="iTunes_Amazon", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + entity_matching_Abt_Buy, + entity_matching_Amazon_Google, + entity_matching_Beer, + entity_matching_Company, + entity_matching_DBLP_ACM, + entity_matching_DBLP_GoogleScholar, + entity_matching_Dirty_DBLP_ACM, + entity_matching_Dirty_DBLP_GoogleScholar, + entity_matching_Dirty_Walmart_Amazon, + entity_matching_Dirty_iTunes_Amazon, + entity_matching_Fodors_Zagats, + entity_matching_Walmart_Amazon, + entity_matching_iTunes_Amazon, +] diff --git a/src/lighteval/tasks/tasks/ethics.py b/src/lighteval/tasks/tasks/ethics.py new file mode 100644 index 000000000..bb45a2f2e --- /dev/null +++ b/src/lighteval/tasks/tasks/ethics.py @@ -0,0 +1,113 @@ +""" +name: +Ethics + +dataset: +lighteval/hendrycks_ethics + 
+abstract: +The Ethics benchmark for evaluating the ability of language models to reason about +ethical issues. + +languages: +english + +tags: +classification, ethics, justice, morality, utilitarianism, virtue + +paper: +https://arxiv.org/abs/2008.02275 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +ethics_commonsense = LightevalTaskConfig( + name="ethics:commonsense", + suite=["lighteval"], + prompt_function=prompt.ethics_commonsense, + hf_repo="lighteval/hendrycks_ethics", + hf_subset="commonsense", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +ethics_deontology = LightevalTaskConfig( + name="ethics:deontology", + suite=["lighteval"], + prompt_function=prompt.ethics_deontology, + hf_repo="lighteval/hendrycks_ethics", + hf_subset="deontology", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +ethics_justice = LightevalTaskConfig( + name="ethics:justice", + suite=["lighteval"], + prompt_function=prompt.ethics_justice, + hf_repo="lighteval/hendrycks_ethics", + hf_subset="justice", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +ethics_utilitarianism = LightevalTaskConfig( + name="ethics:utilitarianism", + suite=["lighteval"], + prompt_function=prompt.ethics_utilitarianism, + hf_repo="lighteval/hendrycks_ethics", + hf_subset="utilitarianism", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +ethics_virtue = LightevalTaskConfig( + name="ethics:virtue", + suite=["lighteval"], + prompt_function=prompt.ethics_virtue, + hf_repo="lighteval/hendrycks_ethics", + hf_subset="virtue", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + ethics_commonsense, + ethics_deontology, + ethics_justice, + ethics_utilitarianism, + ethics_virtue, +] diff --git a/src/lighteval/tasks/tasks/glue.py b/src/lighteval/tasks/tasks/glue.py new file mode 100644 index 000000000..69b9c0dc3 --- /dev/null +++ b/src/lighteval/tasks/tasks/glue.py @@ -0,0 +1,317 @@ +""" +name: +GLUE + +dataset: +nyu-mll/glue, aps/super_glue + +abstract: +The General Language Understanding Evaluation (GLUE) benchmark is a collection +of resources for training, evaluating, and analyzing natural language +understanding systems. 
+ +languages: +english + +tags: +classification, language-understanding + +paper: +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +glue_cola = LightevalTaskConfig( + name="glue:cola", + suite=["lighteval"], + prompt_function=prompt.cola, + hf_repo="nyu-mll/glue", + hf_subset="cola", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc, Metrics.mcc], + stop_sequence=["\n"], + version=0, +) + +glue_mnli = LightevalTaskConfig( + name="glue:mnli", + suite=["lighteval"], + prompt_function=prompt.mnli, + hf_repo="nyu-mll/glue", + hf_subset="mnli_matched", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +glue_mnli_mismatched = LightevalTaskConfig( + name="glue:mnli_mismatched", + suite=["lighteval"], + prompt_function=prompt.mnli, + hf_repo="nyu-mll/glue", + hf_subset="mnli_mismatched", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +glue_mrpc = LightevalTaskConfig( + name="glue:mrpc", + suite=["lighteval"], + prompt_function=prompt.mrpc, + hf_repo="nyu-mll/glue", + hf_subset="mrpc", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_f1], + stop_sequence=["\n"], + version=0, +) + +glue_qnli = LightevalTaskConfig( + name="glue:qnli", + suite=["lighteval"], + prompt_function=prompt.qnli, + hf_repo="nyu-mll/glue", + hf_subset="qnli", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +glue_qqp = LightevalTaskConfig( + name="glue:qqp", + suite=["lighteval"], + prompt_function=prompt.qqp, + hf_repo="nyu-mll/glue", + hf_subset="qqp", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc, Metrics.loglikelihood_f1], + stop_sequence=["\n"], + version=0, +) + +glue_rte = LightevalTaskConfig( + name="glue:rte", + suite=["lighteval"], + prompt_function=prompt.rte, + hf_repo="nyu-mll/glue", + hf_subset="rte", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +glue_sst2 = LightevalTaskConfig( + name="glue:sst2", + suite=["lighteval"], + prompt_function=prompt.sst, + hf_repo="nyu-mll/glue", + hf_subset="sst2", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +glue_stsb = LightevalTaskConfig( + name="glue:stsb", + suite=["lighteval"], + 
prompt_function=prompt.stsb, + hf_repo="nyu-mll/glue", + hf_subset="stsb", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +glue_wnli = LightevalTaskConfig( + name="glue:wnli", + suite=["lighteval"], + prompt_function=prompt.wnli, + hf_repo="nyu-mll/glue", + hf_subset="wnli", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +super_glue_boolq = LightevalTaskConfig( + name="super_glue:boolq", + suite=["lighteval"], + prompt_function=prompt.boolq_harness, + hf_repo="aps/super_glue", + hf_subset="boolq", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +super_glue_cb = LightevalTaskConfig( + name="super_glue:cb", + suite=["lighteval"], + prompt_function=prompt.cb, + hf_repo="aps/super_glue", + hf_subset="cb", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc, Metrics.multi_f1_numeric], + stop_sequence=["\n"], + version=0, +) + +super_glue_copa = LightevalTaskConfig( + name="super_glue:copa", + suite=["lighteval"], + prompt_function=prompt.copa, + hf_repo="aps/super_glue", + hf_subset="copa", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +super_glue_rte = LightevalTaskConfig( + name="super_glue:rte", + suite=["lighteval"], + prompt_function=prompt.rte, + hf_repo="aps/super_glue", + hf_subset="rte", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +super_glue_multirc = LightevalTaskConfig( + name="super_glue:multirc", + suite=["lighteval"], + prompt_function=prompt.multirc, + hf_repo="aps/super_glue", + hf_subset="multirc", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +super_glue_wic = LightevalTaskConfig( + name="super_glue:wic", + suite=["lighteval"], + prompt_function=prompt.wic, + hf_repo="aps/super_glue", + hf_subset="wic", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +super_glue_wsc = LightevalTaskConfig( + name="super_glue:wsc", + suite=["lighteval"], + prompt_function=prompt.wsc, + hf_repo="aps/super_glue", + hf_subset="wsc", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + 
stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + glue_cola, + glue_mnli, + glue_mnli_mismatched, + glue_mrpc, + glue_qnli, + glue_qqp, + glue_rte, + glue_sst2, + glue_stsb, + glue_wnli, + super_glue_boolq, + super_glue_cb, + super_glue_copa, + super_glue_rte, + super_glue_multirc, + super_glue_wic, + super_glue_wsc, +] diff --git a/src/lighteval/tasks/tasks/gpqa.py b/src/lighteval/tasks/tasks/gpqa.py new file mode 100644 index 000000000..5d0e67bda --- /dev/null +++ b/src/lighteval/tasks/tasks/gpqa.py @@ -0,0 +1,100 @@ +""" +name: +Gpqa + +dataset: +Idavidrein/gpqa + +abstract: +GPQA is a dataset of 448 expert-written multiple-choice questions in biology, +physics, and chemistry, designed to test graduate-level reasoning. The questions +are extremely difficult—PhD-level experts score about 65%, skilled non-experts +34% (even with web access), and GPT-4 around 39%. GPQA aims to support research +on scalable oversight, helping humans evaluate and trust AI systems that may +exceed human expertise. + +languages: +english + +tags: +biology, chemistry, graduate-level, multiple-choice, physics, qa, reasoning, science + +paper: +https://arxiv.org/abs/2311.12022 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +gpqa = LightevalTaskConfig( + name="gpqa:mc", + suite=["lighteval"], + prompt_function=prompt.gpqa, + hf_repo="Idavidrein/gpqa", + hf_subset="gpqa_main", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +gpqa_diamond_instruct = LightevalTaskConfig( + name="gpqa:diamond", + suite=["lighteval"], + prompt_function=prompt.gpqa_instruct, + hf_repo="Idavidrein/gpqa", + hf_subset="gpqa_diamond", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=32768, # needed for reasoning models like R1 + metrics=[Metrics.gpqa_instruct_pass_at_k(sample_params={"k": 1})], + stop_sequence=[], # no stop sequence, will use eos token + version=1, +) + +gpqa_extended_instruct = LightevalTaskConfig( + name="gpqa:extended", + suite=["lighteval"], + prompt_function=prompt.gpqa_instruct, + hf_repo="Idavidrein/gpqa", + hf_subset="gpqa_extended", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=32768, # needed for reasoning models like R1 + metrics=[Metrics.gpqa_instruct_metric], + stop_sequence=[], # no stop sequence, will use eos token + version=0, +) + +gpqa_main_instruct = LightevalTaskConfig( + name="gpqa:main", + suite=["lighteval"], + prompt_function=prompt.gpqa_instruct, + hf_repo="Idavidrein/gpqa", + hf_subset="gpqa_main", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=32768, # needed for reasoning models like R1 + metrics=[Metrics.gpqa_instruct_metric], + stop_sequence=[], # no stop sequence, will use eos token + version=0, +) + +TASKS_TABLE = [ + gpqa, + gpqa_diamond_instruct, + gpqa_extended_instruct, + gpqa_main_instruct, +] diff --git a/src/lighteval/tasks/tasks/gsm8k.py b/src/lighteval/tasks/tasks/gsm8k.py new file mode 100644 index 000000000..c4b5a51a6 --- /dev/null +++ b/src/lighteval/tasks/tasks/gsm8k.py @@ -0,0 +1,46 @@ +""" +name: +Gsm8K + +dataset: +openai/gsm8k + 
+abstract:
+GSM8K is a dataset of 8.5K high-quality, linguistically diverse grade school math word problems that require multi-step reasoning to solve.
+
+languages:
+english
+
+tags:
+math, reasoning
+
+paper:
+https://arxiv.org/abs/2110.14168
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+gsm8k = LightevalTaskConfig(
+    name="gsm8k",
+    suite=["lighteval"],
+    prompt_function=prompt.gsm8k,
+    hf_repo="openai/gsm8k",
+    hf_subset="main",
+    hf_avail_splits=["train", "test"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select="random_sampling_from_train",
+    generation_size=256,
+    metrics=[
+        Metrics.expr_gold_metric,
+    ],
+    stop_sequence=["Question:"],
+    version=0,
+)
+
+TASKS_TABLE = [
+    gsm8k,
+]
diff --git a/src/lighteval/tasks/tasks/gsm_plus.py b/src/lighteval/tasks/tasks/gsm_plus.py
new file mode 100644
index 000000000..65afadef2
--- /dev/null
+++ b/src/lighteval/tasks/tasks/gsm_plus.py
@@ -0,0 +1,46 @@
+"""
+name:
+Gsm Plus
+
+dataset:
+qintongli/GSM-Plus
+
+abstract:
+GSM-Plus is an adversarial extension of GSM8K that tests the robustness of LLMs'
+mathematical reasoning by introducing varied perturbations to grade-school math
+problems.
+
+languages:
+english
+
+tags:
+math, reasoning
+
+paper:
+https://arxiv.org/abs/2402.19255
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+gsm_plus = LightevalTaskConfig(
+    name="gsm_plus",
+    suite=["lighteval"],
+    prompt_function=prompt.gsm_plus,
+    hf_repo="qintongli/GSM-Plus",
+    hf_subset="default",
+    hf_avail_splits=["test", "testmini"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=None,
+    metrics=[Metrics.expr_gold_metric],
+    stop_sequence=None,
+    version=0,
+)
+
+TASKS_TABLE = [
+    gsm_plus,
+]
diff --git a/src/lighteval/tasks/tasks/headqa.py b/src/lighteval/tasks/tasks/headqa.py
new file mode 100644
index 000000000..2d7eb36ea
--- /dev/null
+++ b/src/lighteval/tasks/tasks/headqa.py
@@ -0,0 +1,70 @@
+"""
+name:
+Headqa
+
+dataset:
+lighteval/headqa_harness
+
+abstract:
+HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to
+access a specialized position in the Spanish healthcare system, and are
+challenging even for highly specialized humans. They are designed by the
+Ministerio de Sanidad, Consumo y Bienestar Social, who also provides direct
+access to the exams of the last 5 years.
+ +languages: +english, spanish + +tags: +health, medical, multiple-choice, qa + +paper: +https://arxiv.org/abs/1906.04701 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +headqa_en = LightevalTaskConfig( + name="headqa:en", + suite=["lighteval"], + prompt_function=prompt.headqa, + hf_repo="lighteval/headqa_harness", + hf_subset="en", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=["\n"], + version=0, +) + + +headqa_es = LightevalTaskConfig( + name="headqa:es", + suite=["lighteval"], + prompt_function=prompt.headqa, + hf_repo="lighteval/headqa_harness", + hf_subset="es", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + headqa_en, + headqa_es, +] diff --git a/src/lighteval/tasks/tasks/hellaswag.py b/src/lighteval/tasks/tasks/hellaswag.py new file mode 100644 index 000000000..76e02fee0 --- /dev/null +++ b/src/lighteval/tasks/tasks/hellaswag.py @@ -0,0 +1,47 @@ +""" +name: +Hellaswag + +dataset: +Rowan/hellaswag + +abstract: +HellaSwag is a commonsense inference benchmark designed to challenge language +models with adversarially filtered multiple-choice questions. + +languages: +english + +tags: +multiple-choice, narrative, reasoning + +paper: +https://arxiv.org/abs/1905.07830 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +hellaswag = LightevalTaskConfig( + name="hellaswag", + suite=["lighteval"], + prompt_function=prompt.hellaswag_generative, + hf_repo="Rowan/hellaswag", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + hellaswag, +] diff --git a/src/lighteval/tasks/extended/hle/main.py b/src/lighteval/tasks/tasks/hle/main.py similarity index 85% rename from src/lighteval/tasks/extended/hle/main.py rename to src/lighteval/tasks/tasks/hle/main.py index 1e2540984..c22dcaf72 100644 --- a/src/lighteval/tasks/extended/hle/main.py +++ b/src/lighteval/tasks/tasks/hle/main.py @@ -1,25 +1,25 @@ -# MIT License +""" +name: +Humanity's Last Exam -# Copyright (c) 2024 The HuggingFace Team +dataset: +cais/hle -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: +abstract: +Humanity's Last Exam (HLE) is a global collaborative effort, with questions from +nearly 1,000 subject expert contributors affiliated with over 500 institutions +across 50 countries - comprised mostly of professors, researchers, and graduate +degree holders. 
-# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. +languages: +english -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. +tags: +qa, reasoning, general-knowledge +paper: +https://arxiv.org/abs/2501.14249 +""" import logging import math @@ -47,8 +47,7 @@ class ExtractedAnswer(BaseModel): strict: Literal[True] # 100% reliability -"""Adaptation from https://github.com/centerforaisafety/hle/blob/main/hle_eval/run_judge_results.py -""" +# Adaptation from https://github.com/centerforaisafety/hle/blob/main/hle_eval/run_judge_results.py def get_judge_prompt(question: str, answer: str, gold: str, **kwargs): diff --git a/src/lighteval/tasks/extended/ifbench/evaluation_lib.py b/src/lighteval/tasks/tasks/ifbench/evaluation_lib.py similarity index 98% rename from src/lighteval/tasks/extended/ifbench/evaluation_lib.py rename to src/lighteval/tasks/tasks/ifbench/evaluation_lib.py index 493362866..2c4b761e8 100644 --- a/src/lighteval/tasks/extended/ifbench/evaluation_lib.py +++ b/src/lighteval/tasks/tasks/ifbench/evaluation_lib.py @@ -20,7 +20,7 @@ import json from typing import Dict, Optional, Union -import lighteval.tasks.extended.ifbench.instructions_registry as instructions_registry +import lighteval.tasks.tasks.ifbench.instructions_registry as instructions_registry @dataclasses.dataclass diff --git a/src/lighteval/tasks/extended/ifbench/instructions.py b/src/lighteval/tasks/tasks/ifbench/instructions.py similarity index 99% rename from src/lighteval/tasks/extended/ifbench/instructions.py rename to src/lighteval/tasks/tasks/ifbench/instructions.py index 0c4f0a9a0..f691a26f8 100644 --- a/src/lighteval/tasks/extended/ifbench/instructions.py +++ b/src/lighteval/tasks/tasks/ifbench/instructions.py @@ -23,7 +23,6 @@ import unicodedata from collections import Counter -import emoji import nltk from lighteval.utils.imports import is_package_available, requires @@ -35,7 +34,10 @@ if is_package_available("spacy"): import spacy -import lighteval.tasks.extended.ifeval.instructions_utils as instructions_util +if is_package_available("emoji"): + import emoji + +import lighteval.tasks.tasks.ifeval.instructions_utils as instructions_util logger = logging.getLogger(__name__) diff --git a/src/lighteval/tasks/extended/ifbench/instructions_registry.py b/src/lighteval/tasks/tasks/ifbench/instructions_registry.py similarity index 98% rename from src/lighteval/tasks/extended/ifbench/instructions_registry.py rename to src/lighteval/tasks/tasks/ifbench/instructions_registry.py index b47494dd2..b146bd06d 100644 --- a/src/lighteval/tasks/extended/ifbench/instructions_registry.py +++ b/src/lighteval/tasks/tasks/ifbench/instructions_registry.py @@ -14,7 +14,7 @@ """Registry of all instructions.""" -import lighteval.tasks.extended.ifbench.instructions as instructions +import lighteval.tasks.tasks.ifbench.instructions as instructions INSTRUCTION_DICT = { diff --git a/src/lighteval/tasks/extended/ifbench/main.py b/src/lighteval/tasks/tasks/ifbench/main.py similarity index 75% rename from 
src/lighteval/tasks/extended/ifbench/main.py rename to src/lighteval/tasks/tasks/ifbench/main.py index 6f948203a..419c86600 100644 --- a/src/lighteval/tasks/extended/ifbench/main.py +++ b/src/lighteval/tasks/tasks/ifbench/main.py @@ -1,25 +1,22 @@ -# MIT License +""" +name: +IFBench -# Copyright (c) 2024 The HuggingFace Team +dataset: +allenai/IFBench_test, allenai/IFBench_multi-turn -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: +abstract: +Challenging benchmark for precise instruction following. -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. +languages: +english -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. +tags: +instruction-following +paper: +https://arxiv.org/abs/2507.02833 +""" import numpy as np from aenum import extend_enum @@ -30,9 +27,9 @@ SampleLevelMetricGrouping, ) from lighteval.models.model_output import ModelResponse -from lighteval.tasks.extended.ifbench import evaluation_lib from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc, SamplingMethod +from lighteval.tasks.tasks.ifbench import evaluation_lib def ifbench_prompt(line, task_name: str = ""): @@ -104,7 +101,7 @@ def agg_inst_level_acc(items): ifbench_test = LightevalTaskConfig( name="ifbench_test", prompt_function=ifbench_prompt, - suite=["extended"], + suite=["lighteval"], hf_repo="allenai/IFBench_test", hf_subset="default", metrics=[ifbench_metrics], @@ -121,7 +118,7 @@ def agg_inst_level_acc(items): ifbench_multiturn = LightevalTaskConfig( name="ifbench_multiturn", prompt_function=ifbench_prompt, - suite=["extended"], + suite=["lighteval"], hf_repo="allenai/IFBench_multi-turn", hf_subset="default", metrics=[ifbench_metrics], diff --git a/src/lighteval/tasks/extended/ifeval/instructions.py b/src/lighteval/tasks/tasks/ifeval/instructions.py similarity index 99% rename from src/lighteval/tasks/extended/ifeval/instructions.py rename to src/lighteval/tasks/tasks/ifeval/instructions.py index 06b7cf85c..70a87e893 100644 --- a/src/lighteval/tasks/extended/ifeval/instructions.py +++ b/src/lighteval/tasks/tasks/ifeval/instructions.py @@ -27,7 +27,7 @@ if is_package_available("langdetect"): import langdetect -import lighteval.tasks.extended.ifeval.instructions_utils as instructions_util +import lighteval.tasks.tasks.ifeval.instructions_utils as instructions_util logger = logging.getLogger(__name__) diff --git a/src/lighteval/tasks/extended/ifeval/instructions_registry.py b/src/lighteval/tasks/tasks/ifeval/instructions_registry.py similarity index 99% rename from src/lighteval/tasks/extended/ifeval/instructions_registry.py rename to 
src/lighteval/tasks/tasks/ifeval/instructions_registry.py
index 62becfbaa..4dada73d4 100644
--- a/src/lighteval/tasks/extended/ifeval/instructions_registry.py
+++ b/src/lighteval/tasks/tasks/ifeval/instructions_registry.py
@@ -14,7 +14,7 @@ """Registry of all instructions."""
-import lighteval.tasks.extended.ifeval.instructions as instructions
+import lighteval.tasks.tasks.ifeval.instructions as instructions
 _KEYWORD = "keywords:"
diff --git a/src/lighteval/tasks/extended/ifeval/instructions_utils.py b/src/lighteval/tasks/tasks/ifeval/instructions_utils.py
similarity index 100%
rename from src/lighteval/tasks/extended/ifeval/instructions_utils.py
rename to src/lighteval/tasks/tasks/ifeval/instructions_utils.py
diff --git a/src/lighteval/tasks/extended/ifeval/main.py b/src/lighteval/tasks/tasks/ifeval/main.py
similarity index 79%
rename from src/lighteval/tasks/extended/ifeval/main.py
rename to src/lighteval/tasks/tasks/ifeval/main.py
index ae7d42809..2922e5fb6 100644
--- a/src/lighteval/tasks/extended/ifeval/main.py
+++ b/src/lighteval/tasks/tasks/ifeval/main.py
@@ -1,29 +1,27 @@
-# MIT License
+"""
+name:
+IFEval
-# Copyright (c) 2024 The HuggingFace Team
+dataset:
+google/IFEval
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
+abstract:
+IFEval is an instruction-following benchmark: model outputs are not scored against a
+reference answer, but checked for whether they obey the verifiable formatting rules stated in the prompt.
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
+languages:
+english
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
+tags: +instruction-following +paper: +https://arxiv.org/abs/2311.07911 +""" import numpy as np -import lighteval.tasks.extended.ifeval.instructions_registry as instructions_registry +import lighteval.tasks.tasks.ifeval.instructions_registry as instructions_registry from lighteval.metrics.metrics_sample import SampleLevelComputation from lighteval.metrics.utils.metric_utils import ( SampleLevelMetricGrouping, @@ -149,7 +147,7 @@ def agg_inst_level_acc(items): ifeval = LightevalTaskConfig( name="ifeval", prompt_function=ifeval_prompt, - suite=["extended"], + suite=["lighteval"], hf_repo="google/IFEval", hf_subset="default", metrics=[ifeval_metrics], diff --git a/src/lighteval/tasks/tasks/imdb.py b/src/lighteval/tasks/tasks/imdb.py new file mode 100644 index 000000000..e7073699e --- /dev/null +++ b/src/lighteval/tasks/tasks/imdb.py @@ -0,0 +1,67 @@ +""" +name: +Imdb + +dataset: +lighteval/IMDB_helm + +abstract: +The IMDB benchmark for sentiment analysis in movie review, from: +Learning Word Vectors for Sentiment Analysis + +languages: +english + +tags: +classification + +paper: +https://aclanthology.org/P11-1015/ +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +imdb = LightevalTaskConfig( + name="imdb", + suite=["lighteval"], + prompt_function=prompt.imdb, + hf_repo="lighteval/IMDB_helm", + hf_subset="default", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + + +imdb_contrastset = LightevalTaskConfig( + name="imdb:contrastset", + suite=["lighteval"], + prompt_function=prompt.imdb_contrastset, + hf_repo="lighteval/IMDB_helm", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + imdb, + imdb_contrastset, +] diff --git a/src/lighteval/tasks/tasks/jeopardy.py b/src/lighteval/tasks/tasks/jeopardy.py new file mode 100644 index 000000000..5044602fe --- /dev/null +++ b/src/lighteval/tasks/tasks/jeopardy.py @@ -0,0 +1,48 @@ +""" +name: +Jeopardy + +dataset: +openaccess-ai-collective/jeopardy + +abstract: +Jeopardy is a dataset of questions and answers from the Jeopardy game show. 
+ +languages: +english + +tags: +knowledge, qa + +paper: +""" + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +jeopardy = LightevalTaskConfig( + name="jeopardy", + prompt_function=get_qa_prompt_function( + Language.ENGLISH, + lambda line: { + "question": line["question"], + "choices": [line["answer"]], + }, + ), + suite=("lighteval",), + hf_repo="openaccess-ai-collective/jeopardy", + hf_subset="default", + evaluation_splits=("train",), + few_shots_split="train", + generation_size=250, + stop_sequence=["\n", "Question:", "question:"], + metrics=[Metrics.exact_match], + version=1, +) + +TASKS_TABLE = [ + jeopardy, +] diff --git a/src/lighteval/tasks/tasks/lambada.py b/src/lighteval/tasks/tasks/lambada.py new file mode 100644 index 000000000..3a7292a3f --- /dev/null +++ b/src/lighteval/tasks/tasks/lambada.py @@ -0,0 +1,65 @@ +""" +name: +Lambada + +dataset: +cimec/lambada + +abstract: +LAMBADA is a benchmark for testing language models’ ability to understand broad +narrative context. Each passage requires predicting its final word—easy for +humans given the full passage but impossible from just the last sentence. +Success demands long-range discourse comprehension. + +languages: +english + +tags: +language-modeling + +paper: +https://arxiv.org/abs/1606.06031 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +lambada_standard = LightevalTaskConfig( + name="lambada:standard", + suite=["lighteval"], + prompt_function=prompt.lambada, + hf_repo="cimec/lambada", + hf_subset="plain_text", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metrics=[Metrics.target_perplexity], + stop_sequence=["\n"], + version=0, +) + + +lambada_standard_cloze = LightevalTaskConfig( + name="lambada:standard_cloze", + suite=["lighteval"], + prompt_function=prompt.lambada_cloze, + hf_repo="cimec/lambada", + hf_subset="plain_text", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metrics=[Metrics.target_perplexity], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + lambada_standard, + lambada_standard_cloze, +] diff --git a/src/lighteval/tasks/extended/lcb/codegen_metrics.py b/src/lighteval/tasks/tasks/lcb/codegen_metrics.py similarity index 94% rename from src/lighteval/tasks/extended/lcb/codegen_metrics.py rename to src/lighteval/tasks/tasks/lcb/codegen_metrics.py index 08246806a..e2617ed44 100644 --- a/src/lighteval/tasks/extended/lcb/codegen_metrics.py +++ b/src/lighteval/tasks/tasks/lcb/codegen_metrics.py @@ -1,28 +1,16 @@ -# MIT License - -# Copyright (c) 2025 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in 
all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -"""This module contains helper functions copied and modified from -https://github.com/LiveCodeBench/LiveCodeBench -and -https://github.com/QwenLM/Qwen2.5-Coder/tree/main/qwencoder-eval/instruct/livecode_bench +""" +name: +Codegen Metrics + +dataset: + +abstract: + +languages: + +tags: + +paper: """ import ast diff --git a/src/lighteval/tasks/extended/lcb/main.py b/src/lighteval/tasks/tasks/lcb/main.py similarity index 75% rename from src/lighteval/tasks/extended/lcb/main.py rename to src/lighteval/tasks/tasks/lcb/main.py index 299ae9073..0f2f5d52e 100644 --- a/src/lighteval/tasks/extended/lcb/main.py +++ b/src/lighteval/tasks/tasks/lcb/main.py @@ -1,32 +1,24 @@ -# MIT License - -# Copyright (c) 2025 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -"""Usage: -lighteval vllm \ - "pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,dtype=bfloat16,data_parallel_size=8,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={temperature:0.6,top_p:0.95}" \ - "extended|lcb:codegeneration|0" - -lighteval vllm \ - "pretrained=Qwen/Qwen2.5-Coder-3B-Instruct,dtype=bfloat16,data_parallel_size=8,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={temperature:0.2,top_p:0.95}" \ - "extended|lcb:codegeneration|0" +""" +name: +Live Code Bench + +dataset: +lighteval/code_generation_lite + +abstract: +LiveCodeBench collects problems from periodic contests on LeetCode, AtCoder, and +Codeforces platforms and uses them for constructing a holistic benchmark for +evaluating Code LLMs across variety of code-related scenarios continuously over +time. 
+ +languages: +english + +tags: +code-generation + +paper: +https://livecodebench.github.io/ """ import json @@ -38,13 +30,13 @@ from lighteval.metrics.metrics import Metrics, SampleLevelMetric from lighteval.metrics.metrics_sample import SampleLevelComputation from lighteval.models.model_output import ModelResponse -from lighteval.tasks.extended.lcb.codegen_metrics import ( +from lighteval.tasks.lighteval_task import Doc, LightevalTaskConfig +from lighteval.tasks.requests import SamplingMethod +from lighteval.tasks.tasks.lcb.codegen_metrics import ( codegen_metrics, extract_code, translate_private_test_cases, ) -from lighteval.tasks.lighteval_task import Doc, LightevalTaskConfig -from lighteval.tasks.requests import SamplingMethod def prepare_prompt(line: dict[str, Any]) -> str: @@ -154,7 +146,7 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> dict: name = "lcb:codegeneration" if subset == "v4_v5" else f"lcb:codegeneration_{subset}" task = LightevalTaskConfig( name=name, - suite=["extended"], + suite=["lighteval"], prompt_function=lcb_codegeneration_prompt_fn, hf_repo="lighteval/code_generation_lite", hf_subset=subset, # https://github.com/LiveCodeBench/LiveCodeBench/tree/main?tab=readme-ov-file#dataset-versions diff --git a/src/lighteval/tasks/tasks/legal_summarization.py b/src/lighteval/tasks/tasks/legal_summarization.py new file mode 100644 index 000000000..3e31b67ba --- /dev/null +++ b/src/lighteval/tasks/tasks/legal_summarization.py @@ -0,0 +1,102 @@ +""" +name: +Legal Summarization + +dataset: +lighteval/legal_summarization + +abstract: +LegalSummarization is a dataset for legal summarization. + +languages: +english + +tags: +legal, summarization + +paper: +https://arxiv.org/abs/2210.13448 +https://arxiv.org/abs/2210.13448 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +legal_summarization_billsum = LightevalTaskConfig( + name="legal_summarization:billsum", + suite=["lighteval"], + prompt_function=prompt.legal_summarization, + hf_repo="lighteval/legal_summarization", + hf_subset="BillSum", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1024, + metrics=[ + Metrics.rouge1, + Metrics.rouge2, + Metrics.rougeL, + Metrics.faithfulness, + Metrics.extractiveness, + Metrics.bert_score, + ], + stop_sequence=["\n"], + version=0, +) + + +legal_summarization_eurlexsum = LightevalTaskConfig( + name="legal_summarization:eurlexsum", + suite=["lighteval"], + prompt_function=prompt.legal_summarization, + hf_repo="lighteval/legal_summarization", + hf_subset="EurLexSum", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metrics=[ + Metrics.rouge1, + Metrics.rouge2, + Metrics.rougeL, + Metrics.faithfulness, + Metrics.extractiveness, + Metrics.bert_score, + ], + stop_sequence=["\n"], + version=0, +) + + +legal_summarization_multilexsum = LightevalTaskConfig( + name="legal_summarization:multilexsum", + suite=["lighteval"], + prompt_function=prompt.multilexsum, + hf_repo="lighteval/legal_summarization", + hf_subset="MultiLexSum", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=256, + metrics=[ + Metrics.rouge1, + Metrics.rouge2, + 
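+        # Descriptive note: the summarization tasks in this file pair n-gram
+        # overlap (ROUGE-1/2/L) with faithfulness, extractiveness and BERTScore,
+        # so both surface overlap and semantic similarity to the reference
+        # summary are reported.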
Metrics.rougeL, + Metrics.faithfulness, + Metrics.extractiveness, + Metrics.bert_score, + ], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + legal_summarization_billsum, + legal_summarization_eurlexsum, + legal_summarization_multilexsum, +] diff --git a/src/lighteval/tasks/tasks/legalsupport.py b/src/lighteval/tasks/tasks/legalsupport.py new file mode 100644 index 000000000..82ea8c864 --- /dev/null +++ b/src/lighteval/tasks/tasks/legalsupport.py @@ -0,0 +1,43 @@ +""" +name: +Legalsupport + +dataset: +lighteval/LegalSupport + +abstract: +Measures fine-grained legal reasoning through reverse entailment. + +languages: +english + +tags: +legal + +paper: +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +legalsupport = LightevalTaskConfig( + name="legalsupport", + suite=["lighteval"], + prompt_function=prompt.legal_support, + hf_repo="lighteval/LegalSupport", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + legalsupport, +] diff --git a/src/lighteval/tasks/tasks/lexglue.py b/src/lighteval/tasks/tasks/lexglue.py new file mode 100644 index 000000000..4206225a3 --- /dev/null +++ b/src/lighteval/tasks/tasks/lexglue.py @@ -0,0 +1,146 @@ +""" +name: +Lexglue + +dataset: +lighteval/lexglue + +abstract: +LexGLUE: A Benchmark Dataset for Legal Language Understanding in English + +languages: +english + +tags: +classification, legal + +paper: +https://arxiv.org/abs/2110.00976 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +lexglue_case_hold = LightevalTaskConfig( + name="lexglue:case_hold", + suite=["lighteval"], + prompt_function=prompt.lex_glue_case_hold, + hf_repo="lighteval/lexglue", + hf_subset="case_hold", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lexglue_ecthr_a = LightevalTaskConfig( + name="lexglue:ecthr_a", + suite=["lighteval"], + prompt_function=prompt.lex_glue_ecthr_a, + hf_repo="lighteval/lexglue", + hf_subset="ecthr_a", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lexglue_ecthr_b = LightevalTaskConfig( + name="lexglue:ecthr_b", + suite=["lighteval"], + prompt_function=prompt.lex_glue_ecthr_b, + hf_repo="lighteval/lexglue", + hf_subset="ecthr_b", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lexglue_eurlex = LightevalTaskConfig( + name="lexglue:eurlex", + suite=["lighteval"], + prompt_function=prompt.lex_glue_eurlex, + hf_repo="lighteval/lexglue", + hf_subset="eurlex", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + 
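+    # Descriptive note: generation_size caps the tokens generated per sample for
+    # generative metrics such as exact_match. Label-style LexGLUE tasks use small
+    # budgets (5-20 tokens) because only a short class name is expected, while
+    # free-form tasks elsewhere in the suite use hundreds or thousands; -1/None
+    # in other configs means no generation budget is needed (e.g. for pure
+    # loglikelihood scoring).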
generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lexglue_ledgar = LightevalTaskConfig( + name="lexglue:ledgar", + suite=["lighteval"], + prompt_function=prompt.lex_glue_ledgar, + hf_repo="lighteval/lexglue", + hf_subset="ledgar", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lexglue_scotus = LightevalTaskConfig( + name="lexglue:scotus", + suite=["lighteval"], + prompt_function=prompt.lex_glue_scotus, + hf_repo="lighteval/lexglue", + hf_subset="scotus", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lexglue_unfair_tos = LightevalTaskConfig( + name="lexglue:unfair_tos", + suite=["lighteval"], + prompt_function=prompt.lex_glue_unfair_tos, + hf_repo="lighteval/lexglue", + hf_subset="unfair_tos", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + lexglue_case_hold, + lexglue_ecthr_a, + lexglue_ecthr_b, + lexglue_eurlex, + lexglue_ledgar, + lexglue_scotus, + lexglue_unfair_tos, +] diff --git a/src/lighteval/tasks/tasks/lextreme.py b/src/lighteval/tasks/tasks/lextreme.py new file mode 100644 index 000000000..7ba9df453 --- /dev/null +++ b/src/lighteval/tasks/tasks/lextreme.py @@ -0,0 +1,333 @@ +""" +name: +Lextreme + +dataset: +lighteval/lextreme + +abstract: +LEXTREME: A Multi-Lingual and Multi-Task Benchmark for the Legal Domain + +languages: +bulgarian, czech, danish, german, greek, english, spanish, estonian, finnish, french, ga, croatian, hungarian, italian, lithuanian, latvian, mt, dutch, polish, portuguese, romanian, slovak, slovenian, swedish + +tags: +classification, legal + +paper: +https://arxiv.org/abs/2301.13126 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +lextreme_brazilian_court_decisions_judgment = LightevalTaskConfig( + name="lextreme:brazilian_court_decisions_judgment", + suite=["lighteval"], + prompt_function=prompt.lextreme_brazilian_court_decisions_judgment, + hf_repo="lighteval/lextreme", + hf_subset="brazilian_court_decisions_judgment", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lextreme_brazilian_court_decisions_unanimity = LightevalTaskConfig( + name="lextreme:brazilian_court_decisions_unanimity", + suite=["lighteval"], + prompt_function=prompt.lextreme_brazilian_court_decisions_unanimity, + hf_repo="lighteval/lextreme", + hf_subset="brazilian_court_decisions_unanimity", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lextreme_covid19_emergency_event = LightevalTaskConfig( + name="lextreme:covid19_emergency_event", + 
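+    # Task names follow a "family:subset" scheme; on the command line a task is
+    # addressed as suite|name|few-shot count. Assumed invocation, mirroring the
+    # usage pattern shown elsewhere in this repository (model arguments are
+    # placeholders):
+    #   lighteval vllm "pretrained=<model-id>" \
+    #       "lighteval|lextreme:covid19_emergency_event|0"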
suite=["lighteval"], + prompt_function=prompt.lextreme_covid19_emergency_event, + hf_repo="lighteval/lextreme", + hf_subset="covid19_emergency_event", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lextreme_german_argument_mining = LightevalTaskConfig( + name="lextreme:german_argument_mining", + suite=["lighteval"], + prompt_function=prompt.lextreme_german_argument_mining, + hf_repo="lighteval/lextreme", + hf_subset="german_argument_mining", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lextreme_greek_legal_code_chapter = LightevalTaskConfig( + name="lextreme:greek_legal_code_chapter", + suite=["lighteval"], + prompt_function=prompt.lextreme_greek_legal_code_chapter, + hf_repo="lighteval/lextreme", + hf_subset="greek_legal_code_chapter", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lextreme_greek_legal_code_subject = LightevalTaskConfig( + name="lextreme:greek_legal_code_subject", + suite=["lighteval"], + prompt_function=prompt.lextreme_greek_legal_code_subject, + hf_repo="lighteval/lextreme", + hf_subset="greek_legal_code_subject", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lextreme_greek_legal_code_volume = LightevalTaskConfig( + name="lextreme:greek_legal_code_volume", + suite=["lighteval"], + prompt_function=prompt.lextreme_greek_legal_code_volume, + hf_repo="lighteval/lextreme", + hf_subset="greek_legal_code_volume", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lextreme_greek_legal_ner = LightevalTaskConfig( + name="lextreme:greek_legal_ner", + suite=["lighteval"], + prompt_function=prompt.lextreme_greek_legal_ner, + hf_repo="lighteval/lextreme", + hf_subset="greek_legal_ner", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=430, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lextreme_legalnero = LightevalTaskConfig( + name="lextreme:legalnero", + suite=["lighteval"], + prompt_function=prompt.lextreme_legalnero, + hf_repo="lighteval/lextreme", + hf_subset="legalnero", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=788, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lextreme_lener_br = LightevalTaskConfig( + name="lextreme:lener_br", + suite=["lighteval"], + prompt_function=prompt.lextreme_lener_br, + hf_repo="lighteval/lextreme", + hf_subset="lener_br", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", 
"test"], + few_shots_split=None, + few_shots_select=None, + generation_size=338, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lextreme_mapa_coarse = LightevalTaskConfig( + name="lextreme:mapa_coarse", + suite=["lighteval"], + prompt_function=prompt.lextreme_mapa_coarse, + hf_repo="lighteval/lextreme", + hf_subset="mapa_coarse", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=274, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lextreme_mapa_fine = LightevalTaskConfig( + name="lextreme:mapa_fine", + suite=["lighteval"], + prompt_function=prompt.lextreme_mapa_fine, + hf_repo="lighteval/lextreme", + hf_subset="mapa_fine", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=274, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lextreme_multi_eurlex_level_1 = LightevalTaskConfig( + name="lextreme:multi_eurlex_level_1", + suite=["lighteval"], + prompt_function=prompt.lextreme_multi_eurlex_level_1, + hf_repo="lighteval/lextreme", + hf_subset="multi_eurlex_level_1", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lextreme_multi_eurlex_level_2 = LightevalTaskConfig( + name="lextreme:multi_eurlex_level_2", + suite=["lighteval"], + prompt_function=prompt.lextreme_multi_eurlex_level_2, + hf_repo="lighteval/lextreme", + hf_subset="multi_eurlex_level_2", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lextreme_multi_eurlex_level_3 = LightevalTaskConfig( + name="lextreme:multi_eurlex_level_3", + suite=["lighteval"], + prompt_function=prompt.lextreme_multi_eurlex_level_3, + hf_repo="lighteval/lextreme", + hf_subset="multi_eurlex_level_3", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lextreme_online_terms_of_service_clause_topics = LightevalTaskConfig( + name="lextreme:online_terms_of_service_clause_topics", + suite=["lighteval"], + prompt_function=prompt.lextreme_online_terms_of_service_clause_topics, + hf_repo="lighteval/lextreme", + hf_subset="online_terms_of_service_clause_topics", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lextreme_online_terms_of_service_unfairness_levels = LightevalTaskConfig( + name="lextreme:online_terms_of_service_unfairness_levels", + suite=["lighteval"], + prompt_function=prompt.lextreme_online_terms_of_service_unfairness_levels, + hf_repo="lighteval/lextreme", + hf_subset="online_terms_of_service_unfairness_levels", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + 
metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lextreme_swiss_judgment_prediction = LightevalTaskConfig( + name="lextreme:swiss_judgment_prediction", + suite=["lighteval"], + prompt_function=prompt.lextreme_swiss_judgment_prediction, + hf_repo="lighteval/lextreme", + hf_subset="swiss_judgment_prediction", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + lextreme_brazilian_court_decisions_judgment, + lextreme_brazilian_court_decisions_unanimity, + lextreme_covid19_emergency_event, + lextreme_german_argument_mining, + lextreme_greek_legal_code_chapter, + lextreme_greek_legal_code_subject, + lextreme_greek_legal_code_volume, + lextreme_greek_legal_ner, + lextreme_legalnero, + lextreme_lener_br, + lextreme_mapa_coarse, + lextreme_mapa_fine, + lextreme_multi_eurlex_level_1, + lextreme_multi_eurlex_level_2, + lextreme_multi_eurlex_level_3, + lextreme_online_terms_of_service_clause_topics, + lextreme_online_terms_of_service_unfairness_levels, + lextreme_swiss_judgment_prediction, +] diff --git a/src/lighteval/tasks/tasks/logiqa.py b/src/lighteval/tasks/tasks/logiqa.py new file mode 100644 index 000000000..2439ddf69 --- /dev/null +++ b/src/lighteval/tasks/tasks/logiqa.py @@ -0,0 +1,48 @@ +""" +name: +Logiqa + +dataset: +lighteval/logiqa_harness + +abstract: +LogiQA is a machine reading comprehension dataset focused on testing logical +reasoning abilities. It contains 8,678 expert-written multiple-choice questions +covering various types of deductive reasoning. While humans perform strongly, +state-of-the-art models lag far behind, making LogiQA a benchmark for advancing +logical reasoning in NLP systems. + +languages: +english + +tags: +qa + +paper: +https://arxiv.org/abs/2007.08124 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +logiqa = LightevalTaskConfig( + name="logiqa", + suite=["lighteval"], + prompt_function=prompt.logiqa, + hf_repo="lighteval/logiqa_harness", + hf_subset="logiqa", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + logiqa, +] diff --git a/src/lighteval/tasks/tasks/lsat_qa.py b/src/lighteval/tasks/tasks/lsat_qa.py new file mode 100644 index 000000000..8d14fb86b --- /dev/null +++ b/src/lighteval/tasks/tasks/lsat_qa.py @@ -0,0 +1,111 @@ +""" +name: +Lsat Qa + +dataset: +lighteval/lsat_qa + +abstract: +Questions from law school admission tests. 
+ +languages: +english + +tags: +legal, qa + +paper: +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +lsat_qa = LightevalTaskConfig( + name="lsat_qa", + suite=["lighteval"], + prompt_function=prompt.lsat_qa, + hf_repo="lighteval/lsat_qa", + hf_subset="all", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lsat_qa_assignment = LightevalTaskConfig( + name="lsat_qa:assignment", + suite=["lighteval"], + prompt_function=prompt.lsat_qa, + hf_repo="lighteval/lsat_qa", + hf_subset="assignment", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lsat_qa_grouping = LightevalTaskConfig( + name="lsat_qa:grouping", + suite=["lighteval"], + prompt_function=prompt.lsat_qa, + hf_repo="lighteval/lsat_qa", + hf_subset="grouping", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lsat_qa_miscellaneous = LightevalTaskConfig( + name="lsat_qa:miscellaneous", + suite=["lighteval"], + prompt_function=prompt.lsat_qa, + hf_repo="lighteval/lsat_qa", + hf_subset="miscellaneous", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +lsat_qa_ordering = LightevalTaskConfig( + name="lsat_qa:ordering", + suite=["lighteval"], + prompt_function=prompt.lsat_qa, + hf_repo="lighteval/lsat_qa", + hf_subset="ordering", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + lsat_qa, + lsat_qa_assignment, + lsat_qa_grouping, + lsat_qa_miscellaneous, + lsat_qa_ordering, +] diff --git a/src/lighteval/tasks/tasks/math.py b/src/lighteval/tasks/tasks/math.py new file mode 100644 index 000000000..8ae7bd243 --- /dev/null +++ b/src/lighteval/tasks/tasks/math.py @@ -0,0 +1,209 @@ +""" +name: +Math + +dataset: +DigitalLearningGmbH/MATH-lighteval + +abstract: + +languages: +english + +tags: +math, reasoning + +paper: +https://arxiv.org/abs/2305.20050 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.metrics.normalizations import math_normalizer +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +math_algebra = LightevalTaskConfig( + name="math:algebra", + suite=["lighteval"], + prompt_function=prompt.math, + hf_repo="DigitalLearningGmbH/MATH-lighteval", + hf_subset="algebra", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metrics=[ + Metrics.maj_at_n( + sample_params={ + "n": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], + 
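+    # Descriptive note: maj_at_n implements majority voting (self-consistency).
+    # Here n=4 answers are sampled per problem, predictions and gold are both
+    # passed through math_normalizer with strip_strings=True, and the most
+    # frequent normalized answer is compared to the gold; the exact aggregation
+    # lives in lighteval.metrics.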
stop_sequence=["\n"], + version=1, +) + +math_counting_and_probability = LightevalTaskConfig( + name="math:counting_and_probability", + suite=["lighteval"], + prompt_function=prompt.math, + hf_repo="DigitalLearningGmbH/MATH-lighteval", + hf_subset="counting_and_probability", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metrics=[ + Metrics.maj_at_n( + sample_params={ + "n": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], + stop_sequence=["\n"], + version=1, +) + +math_geometry = LightevalTaskConfig( + name="math:geometry", + suite=["lighteval"], + prompt_function=prompt.math, + hf_repo="DigitalLearningGmbH/MATH-lighteval", + hf_subset="geometry", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metrics=[ + Metrics.maj_at_n( + sample_params={ + "n": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], + stop_sequence=["\n"], + version=1, +) + +math_intermediate_algebra = LightevalTaskConfig( + name="math:intermediate_algebra", + suite=["lighteval"], + prompt_function=prompt.math, + hf_repo="DigitalLearningGmbH/MATH-lighteval", + hf_subset="intermediate_algebra", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metrics=[ + Metrics.maj_at_n( + sample_params={ + "n": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], + stop_sequence=["\n"], + version=1, +) + +math_number_theory = LightevalTaskConfig( + name="math:number_theory", + suite=["lighteval"], + prompt_function=prompt.math, + hf_repo="DigitalLearningGmbH/MATH-lighteval", + hf_subset="number_theory", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metrics=[ + Metrics.maj_at_n( + sample_params={ + "n": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], + stop_sequence=["\n"], + version=1, +) + +math_prealgebra = LightevalTaskConfig( + name="math:prealgebra", + suite=["lighteval"], + prompt_function=prompt.math, + hf_repo="DigitalLearningGmbH/MATH-lighteval", + hf_subset="prealgebra", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metrics=[ + Metrics.maj_at_n( + sample_params={ + "n": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], + stop_sequence=["\n"], + version=1, +) + +math_precalculus = LightevalTaskConfig( + name="math:precalculus", + suite=["lighteval"], + prompt_function=prompt.math, + hf_repo="DigitalLearningGmbH/MATH-lighteval", + hf_subset="precalculus", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metrics=[ + Metrics.maj_at_n( + sample_params={ + "n": 4, + "strip_strings": True, + "normalize_pred": math_normalizer, + "normalize_gold": math_normalizer, + } + ), + ], + stop_sequence=["\n"], + version=1, +) + +TASKS_TABLE = [ + math_algebra, + math_counting_and_probability, + math_geometry, + math_intermediate_algebra, + math_number_theory, + 
math_prealgebra, + math_precalculus, +] diff --git a/src/lighteval/tasks/tasks/math_500.py b/src/lighteval/tasks/tasks/math_500.py new file mode 100644 index 000000000..961250b5d --- /dev/null +++ b/src/lighteval/tasks/tasks/math_500.py @@ -0,0 +1,46 @@ +""" +name: +Math 500 + +dataset: +HuggingFaceH4/MATH-500 + +abstract: +This dataset contains a subset of 500 problems from the MATH benchmark that +OpenAI created in their Let's Verify Step by Step paper. + +languages: +english + +tags: +math, reasoning + +paper: +https://arxiv.org/abs/2305.20050 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +math_500 = LightevalTaskConfig( + name="math_500", + suite=["lighteval"], + prompt_function=prompt.math_500, + hf_repo="HuggingFaceH4/MATH-500", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=32768, + metrics=[ + Metrics.pass_at_k_math(sample_params={"k": 1, "n": 1}), + ], + version=2, +) + +TASKS_TABLE = [ + math_500, +] diff --git a/src/lighteval/tasks/tasks/mathqa.py b/src/lighteval/tasks/tasks/mathqa.py new file mode 100644 index 000000000..4eccd9a75 --- /dev/null +++ b/src/lighteval/tasks/tasks/mathqa.py @@ -0,0 +1,47 @@ +""" +name: +Mathqa + +dataset: +allenai/math_qa + +abstract: +large-scale dataset of math word problems. Our dataset is gathered by using a +new representation language to annotate over the AQuA-RAT dataset with +fully-specified operational programs. AQuA-RAT has provided the questions, +options, rationale, and the correct options. + +languages: +english + +tags: +math, qa, reasoning + +paper: +https://arxiv.org/abs/1905.13319 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +mathqa = LightevalTaskConfig( + name="mathqa", + suite=["lighteval"], + prompt_function=prompt.mathqa, + hf_repo="allenai/math_qa", + hf_subset="default", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + mathqa, +] diff --git a/src/lighteval/tasks/tasks/med.py b/src/lighteval/tasks/tasks/med.py new file mode 100644 index 000000000..49496dae3 --- /dev/null +++ b/src/lighteval/tasks/tasks/med.py @@ -0,0 +1,86 @@ +""" +name: +Med + +dataset: +lighteval/med_mcqa, lighteval/med_paragraph_simplification, bigbio/med_qa + +abstract: +A Large-scale Multi-Subject Multi-Choice Dataset for Medical domain Question Answering + +languages: +english + +tags: +health, medical + +paper: +https://medmcqa.github.io/ +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +med_mcqa = LightevalTaskConfig( + name="med_mcqa", + suite=["lighteval"], + prompt_function=prompt.med_mcqa, + hf_repo="lighteval/med_mcqa", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + + +med_paragraph_simplification = LightevalTaskConfig( + name="med_paragraph_simplification", + 
suite=["lighteval"], + prompt_function=prompt.med_paragraph_simplification, + hf_repo="lighteval/med_paragraph_simplification", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=512, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + + +med_qa = LightevalTaskConfig( + name="med_qa", + suite=["lighteval"], + prompt_function=prompt.med_qa, + hf_repo="bigbio/med_qa", + hf_subset="med_qa_en_source", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + med_mcqa, + med_paragraph_simplification, + med_qa, +] diff --git a/src/lighteval/tasks/tasks/med_dialog.py b/src/lighteval/tasks/tasks/med_dialog.py new file mode 100644 index 000000000..70a7c08ee --- /dev/null +++ b/src/lighteval/tasks/tasks/med_dialog.py @@ -0,0 +1,65 @@ +""" +name: +Med Dialog + +dataset: +lighteval/med_dialog + +abstract: +A collection of medical dialogue datasets. + +languages: +english + +tags: +dialog, health, medical + +paper: +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +med_dialog_healthcaremagic = LightevalTaskConfig( + name="med_dialog:healthcaremagic", + suite=["lighteval"], + prompt_function=prompt.med_dialog, + hf_repo="lighteval/med_dialog", + hf_subset="healthcaremagic", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + + +med_dialog_icliniq = LightevalTaskConfig( + name="med_dialog:icliniq", + suite=["lighteval"], + prompt_function=prompt.med_dialog, + hf_repo="lighteval/med_dialog", + hf_subset="icliniq", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + med_dialog_healthcaremagic, + med_dialog_icliniq, +] diff --git a/src/lighteval/tasks/tasks/mgsm.py b/src/lighteval/tasks/tasks/mgsm.py new file mode 100644 index 000000000..e6391ec01 --- /dev/null +++ b/src/lighteval/tasks/tasks/mgsm.py @@ -0,0 +1,217 @@ +""" +name: +Mgsm + +dataset: +juletxara/mgsm + +abstract: +Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school +math problems. +The same 250 problems from GSM8K are each translated via human annotators in 10 +languages. 
+ +languages: +english, spanish, french, german, russian, chinese, japanese, thai, swahili, bengali, telugu + +tags: +math, multilingual, reasoning + +paper: +https://arxiv.org/abs/2210.03057 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +mgsm_en = LightevalTaskConfig( + name="mgsm:en", + suite=["lighteval"], + prompt_function=prompt.mgsm_en, + hf_repo="juletxara/mgsm", + hf_subset="en", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.exact_match], + stop_sequence=None, + version=0, +) + +mgsm_es = LightevalTaskConfig( + name="mgsm:es", + suite=["lighteval"], + prompt_function=prompt.mgsm_es, + hf_repo="juletxara/mgsm", + hf_subset="es", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.exact_match], + stop_sequence=None, + version=0, +) + +mgsm_fr = LightevalTaskConfig( + name="mgsm:fr", + suite=["lighteval"], + prompt_function=prompt.mgsm_fr, + hf_repo="juletxara/mgsm", + hf_subset="fr", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.exact_match], + stop_sequence=None, + version=0, +) + +mgsm_de = LightevalTaskConfig( + name="mgsm:de", + suite=["lighteval"], + prompt_function=prompt.mgsm_de, + hf_repo="juletxara/mgsm", + hf_subset="de", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.exact_match], + stop_sequence=None, + version=0, +) + +mgsm_ru = LightevalTaskConfig( + name="mgsm:ru", + suite=["lighteval"], + prompt_function=prompt.mgsm_ru, + hf_repo="juletxara/mgsm", + hf_subset="ru", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.exact_match], + stop_sequence=None, + version=0, +) + +mgsm_zh = LightevalTaskConfig( + name="mgsm:zh", + suite=["lighteval"], + prompt_function=prompt.mgsm_zh, + hf_repo="juletxara/mgsm", + hf_subset="zh", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.exact_match], + stop_sequence=None, + version=0, +) + +mgsm_ja = LightevalTaskConfig( + name="mgsm:ja", + suite=["lighteval"], + prompt_function=prompt.mgsm_ja, + hf_repo="juletxara/mgsm", + hf_subset="ja", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.exact_match], + stop_sequence=None, + version=0, +) + +mgsm_th = LightevalTaskConfig( + name="mgsm:th", + suite=["lighteval"], + prompt_function=prompt.mgsm_th, + hf_repo="juletxara/mgsm", + hf_subset="th", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.exact_match], + stop_sequence=None, + version=0, +) + +mgsm_sw = LightevalTaskConfig( + name="mgsm:sw", + suite=["lighteval"], + prompt_function=prompt.mgsm_sw, + hf_repo="juletxara/mgsm", + hf_subset="sw", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=None, + metrics=[Metrics.exact_match], + stop_sequence=None, + version=0, +) + +mgsm_bn = LightevalTaskConfig( + name="mgsm:bn", + suite=["lighteval"], + prompt_function=prompt.mgsm_bn, + hf_repo="juletxara/mgsm", + hf_subset="bn", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.exact_match], + stop_sequence=None, + version=0, +) + +mgsm_te = LightevalTaskConfig( + name="mgsm:te", + suite=["lighteval"], + prompt_function=prompt.mgsm_te, + hf_repo="juletxara/mgsm", + hf_subset="te", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.exact_match], + stop_sequence=None, + version=0, +) + +TASKS_TABLE = [ + mgsm_en, + mgsm_es, + mgsm_fr, + mgsm_de, + mgsm_ru, + mgsm_zh, + mgsm_ja, + mgsm_th, + mgsm_sw, + mgsm_bn, + mgsm_te, +] diff --git a/src/lighteval/tasks/extended/mix_eval/judge_prompts.py b/src/lighteval/tasks/tasks/mix_eval/judge_prompts.py similarity index 91% rename from src/lighteval/tasks/extended/mix_eval/judge_prompts.py rename to src/lighteval/tasks/tasks/mix_eval/judge_prompts.py index ab2a03405..48850b820 100644 --- a/src/lighteval/tasks/extended/mix_eval/judge_prompts.py +++ b/src/lighteval/tasks/tasks/mix_eval/judge_prompts.py @@ -1,26 +1,4 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
- -from lighteval.tasks.extended.mix_eval.prompts import parse_options +from lighteval.tasks.tasks.mix_eval.prompts import parse_options def flow_judge_for_freeform_template(question, options, answer, gold): diff --git a/src/lighteval/tasks/extended/mix_eval/main.py b/src/lighteval/tasks/tasks/mix_eval/main.py similarity index 83% rename from src/lighteval/tasks/extended/mix_eval/main.py rename to src/lighteval/tasks/tasks/mix_eval/main.py index e57faa1bd..2b65ab817 100644 --- a/src/lighteval/tasks/extended/mix_eval/main.py +++ b/src/lighteval/tasks/tasks/mix_eval/main.py @@ -1,24 +1,26 @@ -# MIT License +""" +name: +Mix Eval -# Copyright (c) 2024 The HuggingFace Team +dataset: +MixEval/MixEval -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: +abstract: +Ground-truth-based dynamic benchmark derived from off-the-shelf benchmark +mixtures, which evaluates LLMs with a highly capable model ranking (i.e., 0.96 +correlation with Chatbot Arena) while running locally and quickly (6% the time +and cost of running MMLU), with its queries being stably and effortlessly +updated every month to avoid contamination. -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. +languages: +english -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
+tags: +general-knowledge, reasoning, qa + +paper: +https://mixeval.github.io/ +""" import logging import re @@ -27,15 +29,15 @@ from lighteval.metrics.metrics_sample import JudgeLLMMixEval from lighteval.metrics.utils.metric_utils import SampleLevelMetricGrouping -from lighteval.tasks.extended.mix_eval.judge_prompts import ( +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc, SamplingMethod +from lighteval.tasks.tasks.mix_eval.judge_prompts import ( flow_judge_for_freeform_template, flow_judge_for_multichoice_template, gpt_judge_for_closeended_freeform, gpt_judge_for_closeended_multiplechoice, ) -from lighteval.tasks.extended.mix_eval.prompts import construct_prompt_freeform, construct_prompt_multichoice -from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.requests import Doc, SamplingMethod +from lighteval.tasks.tasks.mix_eval.prompts import construct_prompt_freeform, construct_prompt_multichoice logger = logging.getLogger(__name__) @@ -178,7 +180,7 @@ def mean_dv_5(x): mixeval_freeform_easy = LightevalTaskConfig( name="mixeval_easy:freeform", prompt_function=mixeval_freeform_prompt, - suite=["extended"], + suite=["lighteval"], hf_repo="MixEval/MixEval", hf_subset="MixEval", metrics=[llm_judge_mixeval_freeform_flow_judge, llm_judge_mixeval_freeform_gpt_judge], @@ -195,7 +197,7 @@ def mean_dv_5(x): mixeval_multichoice_easy = LightevalTaskConfig( name="mixeval_easy:multichoice", prompt_function=mixeval_multichoice_prompt, - suite=["extended"], + suite=["lighteval"], hf_repo="MixEval/MixEval", hf_subset="MixEval", metrics=[llm_judge_mixeval_multichoice_flow_judge, llm_judge_mixeval_multichoice_gpt_judge], @@ -211,7 +213,7 @@ def mean_dv_5(x): mixeval_freeform_hard = LightevalTaskConfig( name="mixeval_hard:freeform", prompt_function=mixeval_freeform_prompt, - suite=["extended"], + suite=["lighteval"], hf_repo="MixEval/MixEval", hf_subset="MixEval_Hard", metrics=[llm_judge_mixeval_freeform_flow_judge, llm_judge_mixeval_freeform_gpt_judge], @@ -228,7 +230,7 @@ def mean_dv_5(x): mixeval_multichoice_hard = LightevalTaskConfig( name="mixeval_hard:multichoice", prompt_function=mixeval_multichoice_prompt, - suite=["extended"], + suite=["lighteval"], hf_repo="MixEval/MixEval", hf_subset="MixEval_Hard", metrics=[llm_judge_mixeval_multichoice_flow_judge, llm_judge_mixeval_multichoice_gpt_judge], diff --git a/src/lighteval/tasks/extended/mix_eval/prompts.py b/src/lighteval/tasks/tasks/mix_eval/prompts.py similarity index 88% rename from src/lighteval/tasks/extended/mix_eval/prompts.py rename to src/lighteval/tasks/tasks/mix_eval/prompts.py index d5cb2f06b..bd859a967 100644 --- a/src/lighteval/tasks/extended/mix_eval/prompts.py +++ b/src/lighteval/tasks/tasks/mix_eval/prompts.py @@ -1,25 +1,3 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team and MixEval team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. 
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
 MULTI_CHOICE_PROMPT = "Answer with the option letter from the given choices directly."
 FREE_FORM_PROMPT = "Answer the question shortly."
 # FREE_FORM_PROMPT_QUAC = "Answer the question using a short excerpt (span) from the given text."
diff --git a/src/lighteval/tasks/tasks/mmlu.py b/src/lighteval/tasks/tasks/mmlu.py
new file mode 100644
index 000000000..2791b6e4c
--- /dev/null
+++ b/src/lighteval/tasks/tasks/mmlu.py
@@ -0,0 +1,996 @@
+"""
+name:
+Mmlu

+dataset:
+lighteval/mmlu
+
+abstract:
+MMLU is a benchmark of general knowledge and English-language understanding across 57 subjects.
+
+languages:
+english
+
+tags:
+general-knowledge, knowledge, multiple-choice
+
+paper:
+https://arxiv.org/abs/2009.03300
+"""
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+mmlu_abstract_algebra = LightevalTaskConfig(
+    name="mmlu:abstract_algebra",
+    suite=["lighteval"],
+    prompt_function=prompt.mmlu_helm,
+    hf_repo="lighteval/mmlu",
+    hf_subset="abstract_algebra",
+    hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+    evaluation_splits=["test"],
+    few_shots_split="dev",
+    few_shots_select=None,
+    generation_size=5,
+    metrics=[Metrics.exact_match],
+    stop_sequence=["\n"],
+    version=0,
+)
+
+mmlu_anatomy = LightevalTaskConfig(
+    name="mmlu:anatomy",
+    suite=["lighteval"],
+    prompt_function=prompt.mmlu_helm,
+    hf_repo="lighteval/mmlu",
+    hf_subset="anatomy",
+    hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+    evaluation_splits=["test"],
+    few_shots_split="dev",
+    few_shots_select=None,
+    generation_size=5,
+    metrics=[Metrics.exact_match],
+    stop_sequence=["\n"],
+    version=0,
+)
+
+mmlu_astronomy = LightevalTaskConfig(
+    name="mmlu:astronomy",
+    suite=["lighteval"],
+    prompt_function=prompt.mmlu_helm,
+    hf_repo="lighteval/mmlu",
+    hf_subset="astronomy",
+    hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+    evaluation_splits=["test"],
+    few_shots_split="dev",
+    few_shots_select=None,
+    generation_size=5,
+    metrics=[Metrics.exact_match],
+    stop_sequence=["\n"],
+    version=0,
+)
+
+mmlu_business_ethics = LightevalTaskConfig(
+    name="mmlu:business_ethics",
+    suite=["lighteval"],
+    prompt_function=prompt.mmlu_helm,
+    hf_repo="lighteval/mmlu",
+    hf_subset="business_ethics",
+    hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+    evaluation_splits=["test"],
+    few_shots_split="dev",
+    few_shots_select=None,
+    generation_size=5,
+    metrics=[Metrics.exact_match],
+    stop_sequence=["\n"],
+    version=0,
+)
+
+mmlu_clinical_knowledge = LightevalTaskConfig(
+    name="mmlu:clinical_knowledge",
+    suite=["lighteval"],
+    prompt_function=prompt.mmlu_helm,
+    hf_repo="lighteval/mmlu",
+    hf_subset="clinical_knowledge",
+    hf_avail_splits=["auxiliary_train", "test", "validation", "dev"],
+    evaluation_splits=["test"],
+    few_shots_split="dev",
+    few_shots_select=None,
+    generation_size=5,
+    metrics=[Metrics.exact_match],
+    stop_sequence=["\n"],
+    version=0,
+)
+
+mmlu_college_biology =
LightevalTaskConfig( + name="mmlu:college_biology", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="college_biology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_college_chemistry = LightevalTaskConfig( + name="mmlu:college_chemistry", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="college_chemistry", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_college_computer_science = LightevalTaskConfig( + name="mmlu:college_computer_science", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="college_computer_science", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_college_mathematics = LightevalTaskConfig( + name="mmlu:college_mathematics", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="college_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_college_medicine = LightevalTaskConfig( + name="mmlu:college_medicine", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="college_medicine", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_college_physics = LightevalTaskConfig( + name="mmlu:college_physics", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="college_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_computer_security = LightevalTaskConfig( + name="mmlu:computer_security", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="computer_security", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_conceptual_physics = LightevalTaskConfig( + name="mmlu:conceptual_physics", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="conceptual_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + 
+mmlu_econometrics = LightevalTaskConfig( + name="mmlu:econometrics", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="econometrics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_electrical_engineering = LightevalTaskConfig( + name="mmlu:electrical_engineering", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="electrical_engineering", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_elementary_mathematics = LightevalTaskConfig( + name="mmlu:elementary_mathematics", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="elementary_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_formal_logic = LightevalTaskConfig( + name="mmlu:formal_logic", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="formal_logic", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_global_facts = LightevalTaskConfig( + name="mmlu:global_facts", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="global_facts", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_high_school_biology = LightevalTaskConfig( + name="mmlu:high_school_biology", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="high_school_biology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_high_school_chemistry = LightevalTaskConfig( + name="mmlu:high_school_chemistry", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="high_school_chemistry", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_high_school_computer_science = LightevalTaskConfig( + name="mmlu:high_school_computer_science", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="high_school_computer_science", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + 
stop_sequence=["\n"], + version=0, +) + +mmlu_high_school_european_history = LightevalTaskConfig( + name="mmlu:high_school_european_history", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="high_school_european_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_high_school_geography = LightevalTaskConfig( + name="mmlu:high_school_geography", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="high_school_geography", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_high_school_government_and_politics = LightevalTaskConfig( + name="mmlu:high_school_government_and_politics", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="high_school_government_and_politics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_high_school_macroeconomics = LightevalTaskConfig( + name="mmlu:high_school_macroeconomics", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="high_school_macroeconomics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_high_school_mathematics = LightevalTaskConfig( + name="mmlu:high_school_mathematics", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="high_school_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_high_school_microeconomics = LightevalTaskConfig( + name="mmlu:high_school_microeconomics", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="high_school_microeconomics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_high_school_physics = LightevalTaskConfig( + name="mmlu:high_school_physics", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="high_school_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_high_school_psychology = LightevalTaskConfig( + name="mmlu:high_school_psychology", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="high_school_psychology", + 
hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_high_school_statistics = LightevalTaskConfig( + name="mmlu:high_school_statistics", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="high_school_statistics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_high_school_us_history = LightevalTaskConfig( + name="mmlu:high_school_us_history", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="high_school_us_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_high_school_world_history = LightevalTaskConfig( + name="mmlu:high_school_world_history", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="high_school_world_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_human_aging = LightevalTaskConfig( + name="mmlu:human_aging", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="human_aging", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_human_sexuality = LightevalTaskConfig( + name="mmlu:human_sexuality", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="human_sexuality", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_international_law = LightevalTaskConfig( + name="mmlu:international_law", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="international_law", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_jurisprudence = LightevalTaskConfig( + name="mmlu:jurisprudence", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="jurisprudence", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_logical_fallacies = LightevalTaskConfig( + name="mmlu:logical_fallacies", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + 
hf_subset="logical_fallacies", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_machine_learning = LightevalTaskConfig( + name="mmlu:machine_learning", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="machine_learning", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_management = LightevalTaskConfig( + name="mmlu:management", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="management", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_marketing = LightevalTaskConfig( + name="mmlu:marketing", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="marketing", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_medical_genetics = LightevalTaskConfig( + name="mmlu:medical_genetics", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="medical_genetics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_miscellaneous = LightevalTaskConfig( + name="mmlu:miscellaneous", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="miscellaneous", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_moral_disputes = LightevalTaskConfig( + name="mmlu:moral_disputes", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="moral_disputes", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_moral_scenarios = LightevalTaskConfig( + name="mmlu:moral_scenarios", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="moral_scenarios", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_nutrition = LightevalTaskConfig( + name="mmlu:nutrition", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="nutrition", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + 
evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_philosophy = LightevalTaskConfig( + name="mmlu:philosophy", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="philosophy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_prehistory = LightevalTaskConfig( + name="mmlu:prehistory", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="prehistory", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_professional_accounting = LightevalTaskConfig( + name="mmlu:professional_accounting", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="professional_accounting", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_professional_law = LightevalTaskConfig( + name="mmlu:professional_law", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="professional_law", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_professional_medicine = LightevalTaskConfig( + name="mmlu:professional_medicine", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="professional_medicine", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_professional_psychology = LightevalTaskConfig( + name="mmlu:professional_psychology", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="professional_psychology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_public_relations = LightevalTaskConfig( + name="mmlu:public_relations", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="public_relations", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_security_studies = LightevalTaskConfig( + name="mmlu:security_studies", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="security_studies", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + 
evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_sociology = LightevalTaskConfig( + name="mmlu:sociology", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="sociology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_us_foreign_policy = LightevalTaskConfig( + name="mmlu:us_foreign_policy", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="us_foreign_policy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_virology = LightevalTaskConfig( + name="mmlu:virology", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="virology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +mmlu_world_religions = LightevalTaskConfig( + name="mmlu:world_religions", + suite=["lighteval"], + prompt_function=prompt.mmlu_helm, + hf_repo="lighteval/mmlu", + hf_subset="world_religions", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + mmlu_abstract_algebra, + mmlu_anatomy, + mmlu_astronomy, + mmlu_business_ethics, + mmlu_clinical_knowledge, + mmlu_college_biology, + mmlu_college_chemistry, + mmlu_college_computer_science, + mmlu_college_mathematics, + mmlu_college_medicine, + mmlu_college_physics, + mmlu_computer_security, + mmlu_conceptual_physics, + mmlu_econometrics, + mmlu_electrical_engineering, + mmlu_elementary_mathematics, + mmlu_formal_logic, + mmlu_global_facts, + mmlu_high_school_biology, + mmlu_high_school_chemistry, + mmlu_high_school_computer_science, + mmlu_high_school_european_history, + mmlu_high_school_geography, + mmlu_high_school_government_and_politics, + mmlu_high_school_macroeconomics, + mmlu_high_school_mathematics, + mmlu_high_school_microeconomics, + mmlu_high_school_physics, + mmlu_high_school_psychology, + mmlu_high_school_statistics, + mmlu_high_school_us_history, + mmlu_high_school_world_history, + mmlu_human_aging, + mmlu_human_sexuality, + mmlu_international_law, + mmlu_jurisprudence, + mmlu_logical_fallacies, + mmlu_machine_learning, + mmlu_management, + mmlu_marketing, + mmlu_medical_genetics, + mmlu_miscellaneous, + mmlu_moral_disputes, + mmlu_moral_scenarios, + mmlu_nutrition, + mmlu_philosophy, + mmlu_prehistory, + mmlu_professional_accounting, + mmlu_professional_law, + mmlu_professional_medicine, + mmlu_professional_psychology, + mmlu_public_relations, + mmlu_security_studies, + mmlu_sociology, + mmlu_us_foreign_policy, + mmlu_virology, + mmlu_world_religions, +] diff --git a/src/lighteval/tasks/tasks/mmlu_redux.py b/src/lighteval/tasks/tasks/mmlu_redux.py new file mode 100644 index 000000000..2a29afd12 --- 
/dev/null +++ b/src/lighteval/tasks/tasks/mmlu_redux.py @@ -0,0 +1,107 @@ +""" +name: +Mmlu Redux + +dataset: +edinburgh-dawg/mmlu-redux-2.0 + +abstract: +MMLU-Redux is a subset of 5,700 manually re-annotated questions across 57 MMLU subjects. + +languages: +english + +tags: +general-knowledge, knowledge, multiple-choice + +paper: +https://arxiv.org/abs/2406.04127 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +_MMLU_REDUX_2_SUBSETS = [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions", +] + + +TASKS_TABLE = [ + LightevalTaskConfig( + name=f"mmlu_redux_2:{subset}", + suite=["lighteval"], + prompt_function=lambda line, task_name=None, s=subset: prompt.mmlu_redux_2(line, s, task_name), + hf_repo="edinburgh-dawg/mmlu-redux-2.0", + hf_subset=subset, + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[ + Metrics.loglikelihood_acc, + Metrics.pass_at_k_letters(sample_params={"k": 1}), + ], + stop_sequence=["\n"], + version=0, + ) + for subset in _MMLU_REDUX_2_SUBSETS +] diff --git a/src/lighteval/tasks/tasks/mmmu_pro.py b/src/lighteval/tasks/tasks/mmmu_pro.py new file mode 100644 index 000000000..3a71a9061 --- /dev/null +++ b/src/lighteval/tasks/tasks/mmmu_pro.py @@ -0,0 +1,80 @@ +""" +name: +Mmmu Pro + +dataset: +MMMU/MMMU_pro + +abstract: + +languages: +english + +tags: +general-knowledge, knowledge, multimodal, multiple-choice + +paper: +https://arxiv.org/abs/2409.02813 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +mmmu_pro_standard_4_options = LightevalTaskConfig( + name="mmmu_pro:standard-4", + suite=["lighteval"], + prompt_function=prompt.mmmu_pro, + hf_repo="MMMU/MMMU_pro", + hf_subset="standard (4 options)", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, # expected an answer in a format 'Answer: B' + metrics=[Metrics.gpqa_instruct_metric], + stop_sequence=None, + version=0, +) + + +mmmu_pro_standard_10_options = 
LightevalTaskConfig( + name="mmmu_pro:standard-10", + suite=["lighteval"], + prompt_function=prompt.mmmu_pro, + hf_repo="MMMU/MMMU_pro", + hf_subset="standard (10 options)", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, # expected an answer in a format 'Answer: B' + metrics=[Metrics.gpqa_instruct_metric], + stop_sequence=None, + version=0, +) + + +mmmu_pro_vision = LightevalTaskConfig( + name="mmmu_pro:vision", + suite=["lighteval"], + prompt_function=prompt.mmmu_pro_vision, + hf_repo="MMMU/MMMU_pro", + hf_subset="vision", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, # expected an answer in a format 'Answer: B' + metrics=[Metrics.gpqa_instruct_metric], + stop_sequence=None, + version=0, +) + + +TASKS_TABLE = [ + mmmu_pro_standard_4_options, + mmmu_pro_standard_10_options, + mmmu_pro_vision, +] diff --git a/src/lighteval/tasks/extended/mt_bench/judge_prompt_templates.py b/src/lighteval/tasks/tasks/mt_bench/judge_prompt_templates.py similarity index 82% rename from src/lighteval/tasks/extended/mt_bench/judge_prompt_templates.py rename to src/lighteval/tasks/tasks/mt_bench/judge_prompt_templates.py index ea3ca41f4..e76de1b2d 100644 --- a/src/lighteval/tasks/extended/mt_bench/judge_prompt_templates.py +++ b/src/lighteval/tasks/tasks/mt_bench/judge_prompt_templates.py @@ -1,26 +1,3 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
- - def flow_judge_prompt_mt_bench_without_ref(question, options, answer, gold): return [ { diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/tasks/mt_bench/main.py similarity index 64% rename from src/lighteval/tasks/extended/mt_bench/main.py rename to src/lighteval/tasks/tasks/mt_bench/main.py index e32194747..bed7239dd 100644 --- a/src/lighteval/tasks/extended/mt_bench/main.py +++ b/src/lighteval/tasks/tasks/mt_bench/main.py @@ -1,36 +1,38 @@ -# MIT License +""" +name: +Mt Bench -# Copyright (c) 2024 The HuggingFace Team +dataset: +lighteval/mt-bench -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: +abstract: +MT-Bench is a multi-turn conversational benchmark for evaluating language +models. It consists of 80 high-quality multi-turn questions across 8 common +categories (writing, roleplay, reasoning, math, coding, extraction, STEM, +humanities). Model responses are evaluated by a judge LLM. -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. +languages: +english -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. +tags: +conversational, generation, multi-turn + +paper: +https://arxiv.org/abs/2402.14762 +""" + +import re + +import numpy as np -# ruff: noqa: F405, F403, F401, I001 -from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.requests import Doc, SamplingMethod from lighteval.metrics.metrics_sample import JudgeLLMMTBench from lighteval.metrics.utils.metric_utils import SampleLevelMetricGrouping -from lighteval.tasks.extended.mt_bench.judge_prompt_templates import ( +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc, SamplingMethod +from lighteval.tasks.tasks.mt_bench.judge_prompt_templates import ( flow_judge_prompt_mt_bench_with_ref, flow_judge_prompt_mt_bench_without_ref, ) -import re -import numpy as np def mt_bench_prompt(line, task_name: str = ""): @@ -80,7 +82,7 @@ def flow_judge_mt_bench_prompt(question, answer, options, gold): task = LightevalTaskConfig( name="mt_bench", prompt_function=mt_bench_prompt, # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py - suite=["extended"], + suite=["lighteval"], hf_repo="lighteval/mt-bench", hf_subset="default", hf_avail_splits=["train"], diff --git a/src/lighteval/tasks/tasks/musr.py b/src/lighteval/tasks/tasks/musr.py new file mode 100644 index 000000000..074e0ac6f --- /dev/null +++ b/src/lighteval/tasks/tasks/musr.py @@ -0,0 +1,82 @@ +""" +name: +Musr + +dataset: +TAUR-Lab/MuSR + +abstract: +MuSR is a benchmark for evaluating multistep reasoning in natural language +narratives. 
Built using a neurosymbolic synthetic-to-natural generation process, +it features complex, realistic tasks—such as long-form murder mysteries. + +languages: +english + +tags: +long-context, multiple-choice, reasoning + +paper: +https://arxiv.org/abs/2310.16049 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +musr_murder_mysteries = LightevalTaskConfig( + name="musr:murder_mysteries", + suite=["lighteval"], + prompt_function=prompt.musr, + hf_repo="TAUR-Lab/MuSR", + hf_subset="default", + hf_avail_splits=["murder_mysteries"], + evaluation_splits=["murder_mysteries"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + + +musr_object_placements = LightevalTaskConfig( + name="musr:object_placements", + suite=["lighteval"], + prompt_function=prompt.musr, + hf_repo="TAUR-Lab/MuSR", + hf_subset="default", + hf_avail_splits=["object_placements"], + evaluation_splits=["object_placements"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + + +musr_team_allocation = LightevalTaskConfig( + name="musr:team_allocation", + suite=["lighteval"], + prompt_function=prompt.musr, + hf_repo="TAUR-Lab/MuSR", + hf_subset="default", + hf_avail_splits=["team_allocation"], + evaluation_splits=["team_allocation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + musr_murder_mysteries, + musr_object_placements, + musr_team_allocation, +] diff --git a/src/lighteval/tasks/tasks/narrativeqa.py b/src/lighteval/tasks/tasks/narrativeqa.py new file mode 100644 index 000000000..fbbd8239c --- /dev/null +++ b/src/lighteval/tasks/tasks/narrativeqa.py @@ -0,0 +1,46 @@ +""" +name: +Narrativeqa + +dataset: +lighteval/narrative_qa_helm + +abstract: +NarrativeQA is a reading comprehension benchmark that tests deep understanding +of full narratives—books and movie scripts—rather than shallow text matching. To +answer its questions, models must integrate information across entire stories. + +languages: +english + +tags: +qa, reading-comprehension + +paper: +https://aclanthology.org/Q18-1023/ +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +narrativeqa = LightevalTaskConfig( + name="narrativeqa", + suite=["lighteval"], + prompt_function=prompt.narrativeqa, + hf_repo="lighteval/narrative_qa_helm", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + narrativeqa, +] diff --git a/src/lighteval/tasks/tasks/natural_questions.py b/src/lighteval/tasks/tasks/natural_questions.py new file mode 100644 index 000000000..47bbb4b3b --- /dev/null +++ b/src/lighteval/tasks/tasks/natural_questions.py @@ -0,0 +1,48 @@ +""" +name: +Natural Questions + +dataset: +lighteval/small_natural_questions + +abstract: +This dataset is a collection of question-answer pairs from the Natural Questions +dataset. See Natural Questions for additional information. 
This dataset can be +used directly with Sentence Transformers to train embedding models. + +languages: +english + +tags: +general-knowledge, qa + +paper: +https://ai.google.com/research/NaturalQuestions +""" + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +natural_questions = LightevalTaskConfig( + name="natural_questions", + prompt_function=get_qa_prompt_function( + Language.ENGLISH, + lambda line: {"question": line["question"], "choices": [line["answer"]]}, + ), + suite=("lighteval",), + hf_repo="lighteval/small_natural_questions", + hf_subset="default", + evaluation_splits=("test",), + few_shots_split="few_shot", + generation_size=250, + stop_sequence=["\n", "Question:", "question:"], + metrics=[Metrics.exact_match], + version=1, +) + +TASKS_TABLE = [ + natural_questions, +] diff --git a/src/lighteval/tasks/tasks/numeracy.py b/src/lighteval/tasks/tasks/numeracy.py new file mode 100644 index 000000000..9a80d0b66 --- /dev/null +++ b/src/lighteval/tasks/tasks/numeracy.py @@ -0,0 +1,162 @@ +""" +name: +Numeracy + +dataset: +lighteval/numeracy + +abstract: +Numeracy is a benchmark for evaluating the ability of language models to reason about mathematics. + +languages: +english + +tags: +math, reasoning + +paper: +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +numeracy_linear_example = LightevalTaskConfig( + name="numeracy:linear_example", + suite=["lighteval"], + prompt_function=prompt.numeracy, + hf_repo="lighteval/numeracy", + hf_subset="linear_example", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +numeracy_linear_standard = LightevalTaskConfig( + name="numeracy:linear_standard", + suite=["lighteval"], + prompt_function=prompt.numeracy, + hf_repo="lighteval/numeracy", + hf_subset="linear_standard", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +numeracy_parabola_example = LightevalTaskConfig( + name="numeracy:parabola_example", + suite=["lighteval"], + prompt_function=prompt.numeracy, + hf_repo="lighteval/numeracy", + hf_subset="parabola_example", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +numeracy_parabola_standard = LightevalTaskConfig( + name="numeracy:parabola_standard", + suite=["lighteval"], + prompt_function=prompt.numeracy, + hf_repo="lighteval/numeracy", + hf_subset="parabola_standard", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +numeracy_paraboloid_example = LightevalTaskConfig( + name="numeracy:paraboloid_example", + suite=["lighteval"], + prompt_function=prompt.numeracy, + hf_repo="lighteval/numeracy", + hf_subset="paraboloid_example", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +numeracy_paraboloid_standard = LightevalTaskConfig( + name="numeracy:paraboloid_standard", + suite=["lighteval"], + prompt_function=prompt.numeracy, + hf_repo="lighteval/numeracy", + hf_subset="paraboloid_standard", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +numeracy_plane_example = LightevalTaskConfig( + name="numeracy:plane_example", + suite=["lighteval"], + prompt_function=prompt.numeracy, + hf_repo="lighteval/numeracy", + hf_subset="plane_example", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +numeracy_plane_standard = LightevalTaskConfig( + name="numeracy:plane_standard", + suite=["lighteval"], + prompt_function=prompt.numeracy, + hf_repo="lighteval/numeracy", + hf_subset="plane_standard", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + numeracy_linear_example, + numeracy_linear_standard, + numeracy_parabola_example, + numeracy_parabola_standard, + numeracy_paraboloid_example, + numeracy_paraboloid_standard, + numeracy_plane_example, + numeracy_plane_standard, +] diff --git a/src/lighteval/tasks/extended/olympiade_bench/main.py b/src/lighteval/tasks/tasks/olympiade_bench/main.py similarity index 88% rename from src/lighteval/tasks/extended/olympiade_bench/main.py rename to src/lighteval/tasks/tasks/olympiade_bench/main.py index d9fe0d2bc..bd53d3dcf 100644 --- a/src/lighteval/tasks/extended/olympiade_bench/main.py +++ b/src/lighteval/tasks/tasks/olympiade_bench/main.py @@ -1,25 +1,23 @@ -# MIT License +""" +name: +Olympiade Bench -# Copyright (c) 2024 The HuggingFace Team +dataset: +Hothan/OlympiadBench -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: +abstract: +OlympiadBench is a benchmark for evaluating the performance of language models +on olympiad problems. -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. +languages: +english, chinese -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
+tags: +math, reasoning, language +paper: +https://arxiv.org/abs/2402.14008 +""" import numpy as np @@ -224,7 +222,7 @@ def olympiad_bench_prompt(line, task_name: str = None): LightevalTaskConfig( name="olympiad_bench:" + subset, prompt_function=olympiad_bench_prompt, - suite=["extended"], + suite=["lighteval"], hf_repo="Hothan/OlympiadBench", hf_subset=subset, metrics=[metric], diff --git a/src/lighteval/tasks/tasks/openbookqa.py b/src/lighteval/tasks/tasks/openbookqa.py new file mode 100644 index 000000000..eb0e547dc --- /dev/null +++ b/src/lighteval/tasks/tasks/openbookqa.py @@ -0,0 +1,50 @@ +""" +name: +Openbookqa + +dataset: +allenai/openbookqa + +abstract: +OpenBookQA is a question-answering dataset modeled after open-book exams for +assessing human understanding of a subject. It contains multiple-choice +questions that require combining facts from a given open book with broad common +knowledge. The task tests language models' ability to leverage provided +information and apply common sense reasoning. + +languages: +english + +tags: +multiple-choice, qa + +paper: +https://arxiv.org/abs/1809.02789 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +openbookqa = LightevalTaskConfig( + name="openbookqa", + suite=["lighteval"], + prompt_function=prompt.openbookqa_helm, + hf_repo="allenai/openbookqa", + hf_subset="main", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + openbookqa, +] diff --git a/src/lighteval/tasks/tasks/piqa.py b/src/lighteval/tasks/tasks/piqa.py new file mode 100644 index 000000000..76388fac1 --- /dev/null +++ b/src/lighteval/tasks/tasks/piqa.py @@ -0,0 +1,47 @@ +""" +name: +Piqa + +dataset: +ybisk/piqa + +abstract: +PIQA is a benchmark for testing physical commonsense reasoning. It contains +questions requiring this kind of physical commonsense reasoning. + +languages: +english + +tags: +commonsense, multiple-choice, qa + +paper: +https://arxiv.org/abs/1911.11641 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +piqa = LightevalTaskConfig( + name="piqa", + suite=["lighteval"], + prompt_function=prompt.piqa_helm, + hf_repo="ybisk/piqa", + hf_subset="plain_text", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + piqa, +] diff --git a/src/lighteval/tasks/tasks/prost.py b/src/lighteval/tasks/tasks/prost.py new file mode 100644 index 000000000..92a0ad0ca --- /dev/null +++ b/src/lighteval/tasks/tasks/prost.py @@ -0,0 +1,48 @@ +""" +name: +Prost + +dataset: +lighteval/prost + +abstract: +PROST is a benchmark for testing physical reasoning about objects through space +and time. It includes 18,736 multiple-choice questions covering 10 core physics +concepts, designed to probe models in zero-shot settings. Results show that even +large pretrained models struggle with physical reasoning and are sensitive to +question phrasing, underscoring their limited real-world understanding. 
+ +languages: +english + +tags: +reasoning, qa, physical-commonsense + +paper: +https://arxiv.org/abs/2106.03634 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +prost = LightevalTaskConfig( + name="prost", + suite=["lighteval"], + prompt_function=prompt.prost, + hf_repo="lighteval/prost", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + prost, +] diff --git a/src/lighteval/tasks/tasks/pubmedqa.py b/src/lighteval/tasks/tasks/pubmedqa.py new file mode 100644 index 000000000..5cef802b4 --- /dev/null +++ b/src/lighteval/tasks/tasks/pubmedqa.py @@ -0,0 +1,46 @@ +""" +name: +Pubmedqa + +dataset: +pubmed_qa + +abstract: +PubMedQA is a dataset for biomedical research question answering. + +languages: +english + +tags: +biomedical, health, medical, qa + +paper: +https://pubmedqa.github.io/ +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +pubmedqa = LightevalTaskConfig( + name="pubmedqa", + suite=["lighteval"], + prompt_function=prompt.pubmed_qa_helm, + hf_repo="pubmed_qa", + hf_subset="pqa_labeled", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + pubmedqa, +] diff --git a/src/lighteval/tasks/tasks/qa4mre.py b/src/lighteval/tasks/tasks/qa4mre.py new file mode 100644 index 000000000..9120ae95c --- /dev/null +++ b/src/lighteval/tasks/tasks/qa4mre.py @@ -0,0 +1,90 @@ +""" +name: +Qa4Mre + +dataset: +qa4mre + +abstract: +QA4MRE is a machine reading comprehension benchmark from the CLEF 2011-2013 +challenges. It evaluates systems' ability to answer questions requiring deep +understanding of short texts, supported by external background knowledge. +Covering tasks like modality, negation, biomedical reading, and entrance exams, +QA4MRE tests reasoning beyond surface-level text matching. 
+ +languages: +english + +tags: +biomedical, health, qa + +paper: +https://link.springer.com/chapter/10.1007/978-3-642-40802-1_29 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +qa4mre_2011 = LightevalTaskConfig( + name="qa4mre:2011", + suite=["lighteval"], + prompt_function=prompt.qa4mre, + hf_repo="qa4mre", + hf_subset="2011.main.EN", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=["\n"], + version=0, +) + + +qa4mre_2012 = LightevalTaskConfig( + name="qa4mre:2012", + suite=["lighteval"], + prompt_function=prompt.qa4mre, + hf_repo="qa4mre", + hf_subset="2012.main.EN", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=["\n"], + version=0, +) + + +qa4mre_2013 = LightevalTaskConfig( + name="qa4mre:2013", + suite=["lighteval"], + prompt_function=prompt.qa4mre, + hf_repo="qa4mre", + hf_subset="2013.main.EN", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[ + Metrics.loglikelihood_acc, + ], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + qa4mre_2011, + qa4mre_2012, + qa4mre_2013, +] diff --git a/src/lighteval/tasks/tasks/qasper.py b/src/lighteval/tasks/tasks/qasper.py new file mode 100644 index 000000000..223fb35c8 --- /dev/null +++ b/src/lighteval/tasks/tasks/qasper.py @@ -0,0 +1,49 @@ +""" +name: +Qasper + +dataset: +allenai/qasper + +abstract: +QASPER is a dataset for question answering on scientific research papers. It +consists of 5,049 questions over 1,585 Natural Language Processing papers. Each +question is written by an NLP practitioner who read only the title and abstract +of the corresponding paper, and the question seeks information present in the +full text. The questions are then answered by a separate set of NLP +practitioners who also provide supporting evidence to answers. + +languages: +english + +tags: +qa, scientific + +paper: +https://arxiv.org/abs/2105.03011 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +qasper = LightevalTaskConfig( + name="qasper", + suite=["lighteval"], + prompt_function=prompt.qasper, + hf_repo="allenai/qasper", + hf_subset="qasper", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.f1_score], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + qasper, +] diff --git a/src/lighteval/tasks/tasks/quac.py b/src/lighteval/tasks/tasks/quac.py new file mode 100644 index 000000000..8fd69d116 --- /dev/null +++ b/src/lighteval/tasks/tasks/quac.py @@ -0,0 +1,44 @@ +""" +name: +Quac + +dataset: +lighteval/quac_helm + +abstract: +The QuAC benchmark for question answering in the context of dialogues. 
+ +languages: +english + +tags: +dialog, qa + +paper: +https://aclanthology.org/D18-1241/ +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +quac = LightevalTaskConfig( + name="quac", + suite=["lighteval"], + prompt_function=prompt.quac, + hf_repo="lighteval/quac_helm", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + quac, +] diff --git a/src/lighteval/tasks/tasks/race_high.py b/src/lighteval/tasks/tasks/race_high.py new file mode 100644 index 000000000..4ac7e452a --- /dev/null +++ b/src/lighteval/tasks/tasks/race_high.py @@ -0,0 +1,48 @@ +""" +name: +Race High + +dataset: +EleutherAI/race + +abstract: +RACE is a large-scale reading comprehension dataset with more than 28,000 +passages and nearly 100,000 questions. The dataset is collected from English +examinations in China, which are designed for middle school and high school +students. The dataset can be served as the training and test sets for machine +comprehension. + +languages: +english + +tags: +multiple-choice, reading-comprehension + +paper: +https://aclanthology.org/D17-1082/ +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +race_high = LightevalTaskConfig( + name="race:high", + suite=["lighteval"], + prompt_function=prompt.race, + hf_repo="EleutherAI/race", + hf_subset="high", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + race_high, +] diff --git a/src/lighteval/tasks/tasks/raft.py b/src/lighteval/tasks/tasks/raft.py new file mode 100644 index 000000000..5e1a00553 --- /dev/null +++ b/src/lighteval/tasks/tasks/raft.py @@ -0,0 +1,237 @@ +""" +name: +Raft + +dataset: +ought/raft + +abstract: +The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text +classification tasks. 
+ +languages: +english + +tags: +classification, reasoning + +paper: +https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +raft_ade_corpus_v2 = LightevalTaskConfig( + name="raft:ade_corpus_v2", + suite=["lighteval"], + prompt_function=prompt.raft_ade_corpus_v2, + hf_repo="ought/raft", + hf_subset="ade_corpus_v2", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +raft_banking_77 = LightevalTaskConfig( + name="raft:banking_77", + suite=["lighteval"], + prompt_function=prompt.raft_banking_77, + hf_repo="ought/raft", + hf_subset="banking_77", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +raft_neurips_impact_statement_risks = LightevalTaskConfig( + name="raft:neurips_impact_statement_risks", + suite=["lighteval"], + prompt_function=prompt.raft_neurips_impact_statement_risks, + hf_repo="ought/raft", + hf_subset="neurips_impact_statement_risks", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +raft_one_stop_english = LightevalTaskConfig( + name="raft:one_stop_english", + suite=["lighteval"], + prompt_function=prompt.raft_one_stop_english, + hf_repo="ought/raft", + hf_subset="one_stop_english", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +raft_overruling = LightevalTaskConfig( + name="raft:overruling", + suite=["lighteval"], + prompt_function=prompt.raft_overruling, + hf_repo="ought/raft", + hf_subset="overruling", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +raft_semiconductor_org_types = LightevalTaskConfig( + name="raft:semiconductor_org_types", + suite=["lighteval"], + prompt_function=prompt.raft_semiconductor_org_types, + hf_repo="ought/raft", + hf_subset="semiconductor_org_types", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +raft_systematic_review_inclusion = LightevalTaskConfig( + name="raft:systematic_review_inclusion", + suite=["lighteval"], + prompt_function=prompt.raft_systematic_review_inclusion, + hf_repo="ought/raft", + hf_subset="systematic_review_inclusion", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +raft_tai_safety_research = LightevalTaskConfig( + name="raft:tai_safety_research", + suite=["lighteval"], + 
prompt_function=prompt.raft_tai_safety_research, + hf_repo="ought/raft", + hf_subset="tai_safety_research", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +raft_terms_of_service = LightevalTaskConfig( + name="raft:terms_of_service", + suite=["lighteval"], + prompt_function=prompt.raft_terms_of_service, + hf_repo="ought/raft", + hf_subset="terms_of_service", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +raft_tweet_eval_hate = LightevalTaskConfig( + name="raft:tweet_eval_hate", + suite=["lighteval"], + prompt_function=prompt.raft_tweet_eval_hate, + hf_repo="ought/raft", + hf_subset="tweet_eval_hate", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +raft_twitter_complaints = LightevalTaskConfig( + name="raft:twitter_complaints", + suite=["lighteval"], + prompt_function=prompt.raft_twitter_complaints, + hf_repo="ought/raft", + hf_subset="twitter_complaints", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + raft_ade_corpus_v2, + raft_banking_77, + raft_neurips_impact_statement_risks, + raft_one_stop_english, + raft_overruling, + raft_semiconductor_org_types, + raft_systematic_review_inclusion, + raft_tai_safety_research, + raft_terms_of_service, + raft_tweet_eval_hate, + raft_twitter_complaints, +] diff --git a/src/lighteval/tasks/tasks/real_toxicity_prompts.py b/src/lighteval/tasks/tasks/real_toxicity_prompts.py new file mode 100644 index 000000000..726fda8fe --- /dev/null +++ b/src/lighteval/tasks/tasks/real_toxicity_prompts.py @@ -0,0 +1,44 @@ +""" +name: +Real Toxicity Prompts + +dataset: +allenai/real-toxicity-prompts + +abstract: +The RealToxicityPrompts dataset for measuring toxicity in prompted model generations + +languages: +english + +tags: +generation, safety + +paper: +https://aclanthology.org/2020.findings-emnlp.301/ +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +real_toxicity_prompts = LightevalTaskConfig( + name="real_toxicity_prompts", + suite=["lighteval"], + prompt_function=prompt.real_toxicity_prompts, + hf_repo="allenai/real-toxicity-prompts", + hf_subset="default", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + real_toxicity_prompts, +] diff --git a/src/lighteval/tasks/tasks/sacrebleu.py b/src/lighteval/tasks/tasks/sacrebleu.py new file mode 100644 index 000000000..b6387f2b7 --- /dev/null +++ b/src/lighteval/tasks/tasks/sacrebleu.py @@ -0,0 +1,2928 @@ +""" +name: +Sacrebleu + +dataset: +lighteval/sacrebleu_manual, wmt14, wmt16 + +abstract: +tasks from sacrebleu + +languages: +english, german, french, japanese, korean, chinese, arabic + +tags: +translation + +paper: 
+https://github.com/mjpost/sacrebleu +""" + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks import default_prompts as prompt +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +iwslt17_ar_en = LightevalTaskConfig( + name="iwslt17:ar-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_ar-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +iwslt17_de_en = LightevalTaskConfig( + name="iwslt17:de-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +iwslt17_en_ar = LightevalTaskConfig( + name="iwslt17:en-ar", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_ar-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +iwslt17_en_de = LightevalTaskConfig( + name="iwslt17:en-de", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +iwslt17_en_fr = LightevalTaskConfig( + name="iwslt17:en-fr", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +iwslt17_en_ja = LightevalTaskConfig( + name="iwslt17:en-ja", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_en-ja", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +iwslt17_en_ko = LightevalTaskConfig( + name="iwslt17:en-ko", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_en-ko", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +iwslt17_en_zh = LightevalTaskConfig( + name="iwslt17:en-zh", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_en-zh", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, 
Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +iwslt17_fr_en = LightevalTaskConfig( + name="iwslt17:fr-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +iwslt17_ja_en = LightevalTaskConfig( + name="iwslt17:ja-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_ja-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +iwslt17_ko_en = LightevalTaskConfig( + name="iwslt17:ko-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_ko-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +iwslt17_zh_en = LightevalTaskConfig( + name="iwslt17:zh-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_zh-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +mtnt2019_en_fr = LightevalTaskConfig( + name="mtnt2019:en-fr", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="mtnt2019_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=200, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +mtnt2019_en_ja = LightevalTaskConfig( + name="mtnt2019:en-ja", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="mtnt2019_en-ja", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=200, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +mtnt2019_fr_en = LightevalTaskConfig( + name="mtnt2019:fr-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="mtnt2019_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=200, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +mtnt2019_ja_en = LightevalTaskConfig( + name="mtnt2019:ja-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="mtnt2019_ja-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=200, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt08_cs_en = LightevalTaskConfig( + 
name="wmt08:cs-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt08_de_en = LightevalTaskConfig( + name="wmt08:de-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt08_en_cs = LightevalTaskConfig( + name="wmt08:en-cs", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt08_en_de = LightevalTaskConfig( + name="wmt08:en-de", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt08_en_es = LightevalTaskConfig( + name="wmt08:en-es", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt08_en_fr = LightevalTaskConfig( + name="wmt08:en-fr", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt08_en_hu = LightevalTaskConfig( + name="wmt08:en-hu", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_en-hu", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt08_es_en = LightevalTaskConfig( + name="wmt08:es-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_es-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt08_fr_en = LightevalTaskConfig( + name="wmt08:fr-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_fr-en", + hf_avail_splits=["test"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt08_hu_en = LightevalTaskConfig( + name="wmt08:hu-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_hu-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt09_cs_en = LightevalTaskConfig( + name="wmt09:cs-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt09_de_en = LightevalTaskConfig( + name="wmt09:de-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt09_en_cs = LightevalTaskConfig( + name="wmt09:en-cs", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt09_en_de = LightevalTaskConfig( + name="wmt09:en-de", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt09_en_es = LightevalTaskConfig( + name="wmt09:en-es", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt09_en_fr = LightevalTaskConfig( + name="wmt09:en-fr", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt09_en_hu = LightevalTaskConfig( + name="wmt09:en-hu", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-hu", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) 
+ +wmt09_en_it = LightevalTaskConfig( + name="wmt09:en-it", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-it", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt09_es_en = LightevalTaskConfig( + name="wmt09:es-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_es-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt09_fr_en = LightevalTaskConfig( + name="wmt09:fr-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt09_hu_en = LightevalTaskConfig( + name="wmt09:hu-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_hu-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt09_it_en = LightevalTaskConfig( + name="wmt09:it-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_it-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt10_cs_en = LightevalTaskConfig( + name="wmt10:cs-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt10_de_en = LightevalTaskConfig( + name="wmt10:de-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt10_en_cs = LightevalTaskConfig( + name="wmt10:en-cs", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt10_en_de = LightevalTaskConfig( + name="wmt10:en-de", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + 
hf_subset="wmt10_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt10_en_es = LightevalTaskConfig( + name="wmt10:en-es", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt10_en_fr = LightevalTaskConfig( + name="wmt10:en-fr", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt10_es_en = LightevalTaskConfig( + name="wmt10:es-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_es-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt10_fr_en = LightevalTaskConfig( + name="wmt10:fr-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt11_cs_en = LightevalTaskConfig( + name="wmt11:cs-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt11_de_en = LightevalTaskConfig( + name="wmt11:de-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt11_en_cs = LightevalTaskConfig( + name="wmt11:en-cs", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt11_en_de = LightevalTaskConfig( + name="wmt11:en-de", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, 
Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt11_en_es = LightevalTaskConfig( + name="wmt11:en-es", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt11_en_fr = LightevalTaskConfig( + name="wmt11:en-fr", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt11_es_en = LightevalTaskConfig( + name="wmt11:es-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_es-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt11_fr_en = LightevalTaskConfig( + name="wmt11:fr-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt12_cs_en = LightevalTaskConfig( + name="wmt12:cs-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt12_de_en = LightevalTaskConfig( + name="wmt12:de-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt12_en_cs = LightevalTaskConfig( + name="wmt12:en-cs", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt12_en_de = LightevalTaskConfig( + name="wmt12:en-de", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt12_en_es = LightevalTaskConfig( + name="wmt12:en-es", + suite=["lighteval"], + 
prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt12_en_fr = LightevalTaskConfig( + name="wmt12:en-fr", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt12_es_en = LightevalTaskConfig( + name="wmt12:es-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_es-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt12_fr_en = LightevalTaskConfig( + name="wmt12:fr-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt13_cs_en = LightevalTaskConfig( + name="wmt13:cs-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt13_de_en = LightevalTaskConfig( + name="wmt13:de-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt13_en_cs = LightevalTaskConfig( + name="wmt13:en-cs", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt13_en_de = LightevalTaskConfig( + name="wmt13:en-de", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt13_en_es = LightevalTaskConfig( + name="wmt13:en-es", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt13_en_fr = LightevalTaskConfig( + name="wmt13:en-fr", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt13_en_ru = LightevalTaskConfig( + name="wmt13:en-ru", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt13_es_en = LightevalTaskConfig( + name="wmt13:es-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_es-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt13_fr_en = LightevalTaskConfig( + name="wmt13:fr-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt13_ru_en = LightevalTaskConfig( + name="wmt13:ru-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt14_cs_en = LightevalTaskConfig( + name="wmt14:cs-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt14_de_en = LightevalTaskConfig( + name="wmt14:de-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt14_en_cs = LightevalTaskConfig( + name="wmt14:en-cs", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt14_en_de = 
LightevalTaskConfig( + name="wmt14:en-de", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt14_en_fr = LightevalTaskConfig( + name="wmt14:en-fr", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="wmt14", + hf_subset="fr-en", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt14_en_fr = LightevalTaskConfig( + name="wmt14:en-fr", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt14_en_hi = LightevalTaskConfig( + name="wmt14:en-hi", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_en-hi", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt14_en_ru = LightevalTaskConfig( + name="wmt14:en-ru", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt14_fr_en = LightevalTaskConfig( + name="wmt14:fr-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="wmt14", + hf_subset="fr-en", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt14_fr_en = LightevalTaskConfig( + name="wmt14:fr-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt14_hi_en = LightevalTaskConfig( + name="wmt14:hi-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_hi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt14_ru_en = LightevalTaskConfig( + name="wmt14:ru-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_ru-en", + 
hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt15_cs_en = LightevalTaskConfig( + name="wmt15:cs-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt15_de_en = LightevalTaskConfig( + name="wmt15:de-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt15_en_cs = LightevalTaskConfig( + name="wmt15:en-cs", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt15_en_de = LightevalTaskConfig( + name="wmt15:en-de", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt15_en_fi = LightevalTaskConfig( + name="wmt15:en-fi", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_en-fi", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt15_en_fr = LightevalTaskConfig( + name="wmt15:en-fr", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt15_en_ru = LightevalTaskConfig( + name="wmt15:en-ru", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt15_fi_en = LightevalTaskConfig( + name="wmt15:fi-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_fi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + 
stop_sequence=["\n"], + version=0, +) + +wmt15_fr_en = LightevalTaskConfig( + name="wmt15:fr-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt15_ru_en = LightevalTaskConfig( + name="wmt15:ru-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt16_cs_en = LightevalTaskConfig( + name="wmt16:cs-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt16_de_en = LightevalTaskConfig( + name="wmt16:de-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="wmt16", + hf_subset="de-en", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt16_de_en = LightevalTaskConfig( + name="wmt16:de-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt16_en_cs = LightevalTaskConfig( + name="wmt16:en-cs", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt16_en_de = LightevalTaskConfig( + name="wmt16:en-de", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="wmt16", + hf_subset="de-en", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt16_en_de = LightevalTaskConfig( + name="wmt16:en-de", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt16_en_fi = LightevalTaskConfig( + name="wmt16:en-fi", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + 
hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-fi", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt16_en_ro = LightevalTaskConfig( + name="wmt16:en-ro", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="wmt16", + hf_subset="ro-en", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt16_en_ro = LightevalTaskConfig( + name="wmt16:en-ro", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-ro", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt16_en_ru = LightevalTaskConfig( + name="wmt16:en-ru", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt16_en_tr = LightevalTaskConfig( + name="wmt16:en-tr", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-tr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt16_fi_en = LightevalTaskConfig( + name="wmt16:fi-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_fi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt16_ro_en = LightevalTaskConfig( + name="wmt16:ro-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="wmt16", + hf_subset="ro-en", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt16_ro_en = LightevalTaskConfig( + name="wmt16:ro-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_ro-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt16_ru_en = LightevalTaskConfig( + name="wmt16:ru-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + 
generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt16_tr_en = LightevalTaskConfig( + name="wmt16:tr-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_tr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt17_cs_en = LightevalTaskConfig( + name="wmt17:cs-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt17_de_en = LightevalTaskConfig( + name="wmt17:de-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt17_en_cs = LightevalTaskConfig( + name="wmt17:en-cs", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt17_en_de = LightevalTaskConfig( + name="wmt17:en-de", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt17_en_fi = LightevalTaskConfig( + name="wmt17:en-fi", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-fi", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt17_en_lv = LightevalTaskConfig( + name="wmt17:en-lv", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-lv", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt17_en_ru = LightevalTaskConfig( + name="wmt17:en-ru", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt17_en_tr = LightevalTaskConfig( + name="wmt17:en-tr", + 
suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-tr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt17_en_zh = LightevalTaskConfig( + name="wmt17:en-zh", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-zh", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt17_fi_en = LightevalTaskConfig( + name="wmt17:fi-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_fi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt17_lv_en = LightevalTaskConfig( + name="wmt17:lv-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_lv-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt17_ru_en = LightevalTaskConfig( + name="wmt17:ru-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt17_tr_en = LightevalTaskConfig( + name="wmt17:tr-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_tr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt17_zh_en = LightevalTaskConfig( + name="wmt17:zh-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_zh-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt18_cs_en = LightevalTaskConfig( + name="wmt18:cs-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt18_de_en = LightevalTaskConfig( + name="wmt18:de-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_de-en", + hf_avail_splits=["test"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt18_en_cs = LightevalTaskConfig( + name="wmt18:en-cs", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt18_en_de = LightevalTaskConfig( + name="wmt18:en-de", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt18_en_et = LightevalTaskConfig( + name="wmt18:en-et", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-et", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt18_en_fi = LightevalTaskConfig( + name="wmt18:en-fi", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-fi", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt18_en_ru = LightevalTaskConfig( + name="wmt18:en-ru", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt18_en_tr = LightevalTaskConfig( + name="wmt18:en-tr", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-tr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt18_en_zh = LightevalTaskConfig( + name="wmt18:en-zh", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-zh", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt18_et_en = LightevalTaskConfig( + name="wmt18:et-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_et-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) 
+ +wmt18_fi_en = LightevalTaskConfig( + name="wmt18:fi-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_fi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt18_ru_en = LightevalTaskConfig( + name="wmt18:ru-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt18_tr_en = LightevalTaskConfig( + name="wmt18:tr-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_tr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt18_zh_en = LightevalTaskConfig( + name="wmt18:zh-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_zh-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_cs_de = LightevalTaskConfig( + name="wmt19:cs-de", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_cs-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_de_cs = LightevalTaskConfig( + name="wmt19:de-cs", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_de-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_de_en = LightevalTaskConfig( + name="wmt19:de-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_de_fr = LightevalTaskConfig( + name="wmt19:de-fr", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_de-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_en_cs = LightevalTaskConfig( + name="wmt19:en-cs", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + 
hf_subset="wmt19_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_en_de = LightevalTaskConfig( + name="wmt19:en-de", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_en_fi = LightevalTaskConfig( + name="wmt19:en-fi", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-fi", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_en_gu = LightevalTaskConfig( + name="wmt19:en-gu", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-gu", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_en_kk = LightevalTaskConfig( + name="wmt19:en-kk", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-kk", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_en_lt = LightevalTaskConfig( + name="wmt19:en-lt", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-lt", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_en_ru = LightevalTaskConfig( + name="wmt19:en-ru", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_en_zh = LightevalTaskConfig( + name="wmt19:en-zh", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-zh", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_fi_en = LightevalTaskConfig( + name="wmt19:fi-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_fi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, 
Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_fr_de = LightevalTaskConfig( + name="wmt19:fr-de", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_fr-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_gu_en = LightevalTaskConfig( + name="wmt19:gu-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_gu-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_kk_en = LightevalTaskConfig( + name="wmt19:kk-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_kk-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_lt_en = LightevalTaskConfig( + name="wmt19:lt-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_lt-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_ru_en = LightevalTaskConfig( + name="wmt19:ru-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt19_zh_en = LightevalTaskConfig( + name="wmt19:zh-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_zh-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_cs_en = LightevalTaskConfig( + name="wmt20:cs-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_de_en = LightevalTaskConfig( + name="wmt20:de-en", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_de_fr = LightevalTaskConfig( + name="wmt20:de-fr", + suite=["lighteval"], + 
prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_de-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_en_cs = LightevalTaskConfig( + name="wmt20:en-cs", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_en_de = LightevalTaskConfig( + name="wmt20:en-de", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_en_iu = LightevalTaskConfig( + name="wmt20:en-iu", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-iu", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_en_ja = LightevalTaskConfig( + name="wmt20:en-ja", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-ja", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_en_km = LightevalTaskConfig( + name="wmt20:en-km", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-km", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_en_pl = LightevalTaskConfig( + name="wmt20:en-pl", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-pl", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_en_ps = LightevalTaskConfig( + name="wmt20:en-ps", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-ps", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_en_ru = LightevalTaskConfig( + name="wmt20:en-ru", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_en_ta = LightevalTaskConfig( + name="wmt20:en-ta", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-ta", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_en_zh = LightevalTaskConfig( + name="wmt20:en-zh", + suite=["lighteval"], + prompt_function=prompt.wmt_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-zh", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_fr_de = LightevalTaskConfig( + name="wmt20:fr-de", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_fr-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_iu_en = LightevalTaskConfig( + name="wmt20:iu-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_iu-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_ja_en = LightevalTaskConfig( + name="wmt20:ja-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_ja-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_km_en = LightevalTaskConfig( + name="wmt20:km-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_km-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_pl_en = LightevalTaskConfig( + name="wmt20:pl-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_pl-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_ps_en = LightevalTaskConfig( + name="wmt20:ps-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_ps-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_ru_en = 
LightevalTaskConfig( + name="wmt20:ru-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_ta_en = LightevalTaskConfig( + name="wmt20:ta-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_ta-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +wmt20_zh_en = LightevalTaskConfig( + name="wmt20:zh-en", + suite=["lighteval"], + prompt_function=prompt.wmt_reverse_alphabetical, + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_zh-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metrics=[Metrics.bleu, Metrics.chrf, Metrics.ter], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + wmt14_de_en, + wmt16_en_cs, + wmt19_en_cs, + wmt19_en_de, + wmt19_en_fi, + wmt19_en_gu, + wmt19_en_kk, + wmt19_en_lt, + wmt19_en_ru, + wmt19_en_zh, + wmt19_fi_en, + wmt19_fr_de, + wmt19_gu_en, + wmt19_kk_en, + wmt19_lt_en, + wmt19_ru_en, + wmt19_zh_en, + wmt20_cs_en, + wmt20_de_en, + wmt20_en_de, + wmt20_en_iu, + wmt20_en_ja, + wmt20_en_km, + wmt20_en_pl, + wmt20_en_ps, + wmt20_en_ru, + wmt20_en_ta, + wmt20_en_zh, + wmt20_fr_de, + wmt20_iu_en, + wmt20_ja_en, + wmt20_km_en, + wmt20_pl_en, + wmt20_ps_en, + wmt20_ru_en, + wmt20_ta_en, + wmt20_zh_en, +] diff --git a/src/lighteval/tasks/tasks/sciq.py b/src/lighteval/tasks/tasks/sciq.py new file mode 100644 index 000000000..ed4285101 --- /dev/null +++ b/src/lighteval/tasks/tasks/sciq.py @@ -0,0 +1,48 @@ +""" +name: +Sciq + +dataset: +allenai/sciq + +abstract: +The SciQ dataset contains 13,679 crowdsourced science exam questions about +Physics, Chemistry and Biology, among others. The questions are in +multiple-choice format with 4 answer options each. For the majority of the +questions, an additional paragraph with supporting evidence for the correct +answer is provided. + +languages: +english + +tags: +physics, chemistry, biology, reasoning, multiple-choice, qa + +paper: +https://arxiv.org/abs/1707.06209 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +sciq = LightevalTaskConfig( + name="sciq", + suite=["lighteval"], + prompt_function=prompt.sciq, + hf_repo="allenai/sciq", + hf_subset="default", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + sciq, +] diff --git a/src/lighteval/tasks/tasks/simpleqa.py b/src/lighteval/tasks/tasks/simpleqa.py new file mode 100644 index 000000000..31ab0e369 --- /dev/null +++ b/src/lighteval/tasks/tasks/simpleqa.py @@ -0,0 +1,45 @@ +""" +name: +Simpleqa + +dataset: +lighteval/SimpleQA + +abstract: +A factuality benchmark called SimpleQA that measures the ability for language +models to answer short, fact-seeking questions. 
+ +languages: +english + +tags: +factuality, general-knowledge, qa + +paper: +https://openai.com/index/introducing-simpleqa/ +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +simpleqa = LightevalTaskConfig( + name="simpleqa", + suite=["lighteval"], + prompt_function=prompt.simpleqa, + hf_repo="lighteval/SimpleQA", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="few_shot", + few_shots_select=None, + generation_size=2048, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + simpleqa, +] diff --git a/src/lighteval/tasks/tasks/siqa.py b/src/lighteval/tasks/tasks/siqa.py new file mode 100644 index 000000000..e8e049bbf --- /dev/null +++ b/src/lighteval/tasks/tasks/siqa.py @@ -0,0 +1,54 @@ +""" +name: +Siqa + +dataset: +allenai/social_i_qa + +abstract: +We introduce Social IQa: Social Interaction QA, a new question-answering +benchmark for testing social commonsense intelligence. Contrary to many prior +benchmarks that focus on physical or taxonomic knowledge, Social IQa focuses on +reasoning about people's actions and their social implications. For example, +given an action like "Jesse saw a concert" and a question like "Why did Jesse do +this?", humans can easily infer that Jesse wanted "to see their favorite +performer" or "to enjoy the music", and not "to see what's happening inside" or +"to see if it works". The actions in Social IQa span a wide variety of social +situations, and answer candidates contain both human-curated answers and +adversarially-filtered machine-generated candidates. Social IQa contains over +37,000 QA pairs for evaluating models' abilities to reason about the social +implications of everyday events and situations. 
+ +languages: +english + +tags: +commonsense, multiple-choice, qa + +paper: +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +siqa = LightevalTaskConfig( + name="siqa", + suite=["lighteval"], + prompt_function=prompt.siqa, + hf_repo="allenai/social_i_qa", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + siqa, +] diff --git a/community_tasks/slr_bench_evals.py b/src/lighteval/tasks/tasks/slr_bench.py similarity index 55% rename from community_tasks/slr_bench_evals.py rename to src/lighteval/tasks/tasks/slr_bench.py index b6d60ff43..bad487b57 100644 --- a/community_tasks/slr_bench_evals.py +++ b/src/lighteval/tasks/tasks/slr_bench.py @@ -1,68 +1,63 @@ -# MIT License +""" +name: +SLR-Bench -# Copyright (c) 2025 Lukas Helff +dataset: +AIML-TUDA/SLR-Bench -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: +abstract: +SLR-Bench is a large-scale benchmark for scalable logical reasoning with +language models, comprising 19,000 prompts organized into 20 curriculum levels. -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. +languages: +english -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. +tags: +reasoning, symbolic -""" -SLR-Bench is a large-scale benchmark for scalable logical reasoning with language models, comprising 19,000 prompts organized into 20 curriculum levels. -The tasks progressively increase in relational, arithmetic, and recursive complexity, requiring models to synthesize Prolog rules that classify train compositions. -For more details see: https://huggingface.co/datasets/AIML-TUDA/SLR-Bench -The paper can be found here: https://arxiv.org/abs/2506.15787 -Before using this task, please ensure that SWI-Prolog and evaluate are installed on your system, as they are required for symbolic verification of the generated Prolog programs. 
+paper: +https://arxiv.org/abs/2506.15787 """ import logging -import shutil import numpy as np -from evaluate import load from lighteval.metrics.utils.metric_utils import SampleLevelComputation, SampleLevelMetric from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc, SamplingMethod +from lighteval.utils.imports import is_package_available, requires -logger = logging.getLogger(__name__) - - -# Check for SWI-Prolog installation -if shutil.which("swipl") is None: - raise ImportError( - "SWI-Prolog (swipl) is not installed or not in PATH. " - "Please install SWI-Prolog to use this task. " - "You can install required dependencies with: pip install -r community_tasks/slr_bench_requirements.txt" - ) +if is_package_available("evaluate"): + from evaluate import load +else: + load = None -# Load the symbolic judge for evaluating Prolog programs -symbolic_judge = load("AIML-TUDA/VerifiableRewardsForScalableLogicalReasoning") +logger = logging.getLogger(__name__) +@requires("evaluate") def prompt_fn(line: dict, task_name: str): """Defines how to go from a dataset line to a doc object.""" + # Check for SWI-Prolog installation + import shutil + + if shutil.which("swipl") is None: + raise ImportError( + "SWI-Prolog (swipl) is not installed or not in PATH. Please install SWI-Prolog to use this task. " + ) + return Doc( task_name=task_name, query=line["prompt"], choices=[str(line.get("validation program", ""))], gold_index=0 ) class VerifiableRewardMetric(SampleLevelComputation): + # Load the symbolic judge for evaluating Prolog programs + def compute(self, doc, model_response, **kwargs): + symbolic_judge = load("AIML-TUDA/VerifiableRewardsForScalableLogicalReasoning") try: prediction = model_response.final_text[0] validation_program = doc.choices[0] if doc.choices else "" diff --git a/src/lighteval/tasks/tasks/squad_v2.py b/src/lighteval/tasks/tasks/squad_v2.py new file mode 100644 index 000000000..a05df9332 --- /dev/null +++ b/src/lighteval/tasks/tasks/squad_v2.py @@ -0,0 +1,59 @@ +""" +name: +Squad V2 + +dataset: +rajpurkar/squad_v2 + +abstract: +Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, +consisting of questions posed by crowdworkers on a set of Wikipedia articles, +where the answer to every question is a segment of text, or span, from the +corresponding reading passage, or the question might be unanswerable. +SQuAD 2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 +unanswerable questions written adversarially by crowdworkers to look similar to +answerable ones. To do well on SQuAD2.0, systems must not only answer questions +when possible, but also determine when no answer is supported by the paragraph +and abstain from answering. 
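+Note: the lighteval configuration below keeps only questions that have at
+least one non-empty answer (via its hf_filter), so the unanswerable questions
+are not included in this evaluation.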
+ +languages: +english + +tags: +qa + +paper: +https://arxiv.org/abs/1806.03822 +""" + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.templates.qa import get_qa_prompt_function +from lighteval.utils.language import Language + + +squad_v2 = LightevalTaskConfig( + name="squad_v2", + prompt_function=get_qa_prompt_function( + Language.ENGLISH, + lambda line: { + "question": line["question"], + "context": line["context"], + "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], + }, + ), + suite=("lighteval",), + hf_repo="rajpurkar/squad_v2", + hf_subset="squad_v2", + hf_filter=lambda line: any(ans for ans in line["answers"]["text"] if len(ans) > 0), + evaluation_splits=("validation",), + few_shots_split="train", + stop_sequence=["\n", "Question:", "question:"], + generation_size=200, + metrics=[Metrics.exact_match], + version=1, +) + +TASKS_TABLE = [ + squad_v2, +] diff --git a/src/lighteval/tasks/tasks/storycloze.py b/src/lighteval/tasks/tasks/storycloze.py new file mode 100644 index 000000000..5fdd34c9c --- /dev/null +++ b/src/lighteval/tasks/tasks/storycloze.py @@ -0,0 +1,63 @@ +""" +name: +Storycloze + +dataset: +MoE-UNC/story_cloze + +abstract: +A Corpus and Cloze Evaluation for Deeper Understanding of +Commonsense Stories + +languages: +english + +tags: +narrative, reasoning + +paper: +https://arxiv.org/abs/1604.01696 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +storycloze_2016 = LightevalTaskConfig( + name="storycloze:2016", + suite=["lighteval"], + prompt_function=prompt.storycloze, + hf_repo="MoE-UNC/story_cloze", + hf_subset="2016", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + + +storycloze_2018 = LightevalTaskConfig( + name="storycloze:2018", + suite=["lighteval"], + prompt_function=prompt.storycloze, + hf_repo="MoE-UNC/story_cloze", + hf_subset="2018", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + storycloze_2016, + storycloze_2018, +] diff --git a/src/lighteval/tasks/tasks/summarization.py b/src/lighteval/tasks/tasks/summarization.py new file mode 100644 index 000000000..84deb9f01 --- /dev/null +++ b/src/lighteval/tasks/tasks/summarization.py @@ -0,0 +1,104 @@ +""" +name: +Summarization + +dataset: +lighteval/summarization + +abstract: +Don't Give Me the Details, Just the Summary! 
Topic-Aware Convolutional Neural +Networks for Extreme Summarization and: Abstractive Text Summarization using +Sequence-to-sequence RNNs and Beyond + +languages: +english + +tags: +summarization + +paper: +https://aclanthology.org/D18-1206/ +https://aclanthology.org/K16-1028/ +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +summarization_cnn_dm = LightevalTaskConfig( + name="summarization:cnn-dm", + suite=["lighteval"], + prompt_function=prompt.cnn_dm, + hf_repo="lighteval/summarization", + hf_subset="cnn-dm", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metrics=[ + Metrics.rouge1, + Metrics.rouge2, + Metrics.rougeL, + Metrics.faithfulness, + Metrics.extractiveness, + Metrics.bert_score, + ], + stop_sequence=["\n"], + version=0, +) + + +summarization_xsum = LightevalTaskConfig( + name="summarization:xsum", + suite=["lighteval"], + prompt_function=prompt.xsum, + hf_repo="lighteval/summarization", + hf_subset="xsum", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=64, + metrics=[ + Metrics.rouge1, + Metrics.rouge2, + Metrics.rougeL, + Metrics.faithfulness, + Metrics.extractiveness, + Metrics.bert_score, + ], + stop_sequence=["\n"], + version=0, +) + + +summarization_xsum_sampled = LightevalTaskConfig( + name="summarization:xsum-sampled", + suite=["lighteval"], + prompt_function=prompt.xsum, + hf_repo="lighteval/summarization", + hf_subset="xsum-sampled", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=64, + metrics=[ + Metrics.rouge1, + Metrics.rouge2, + Metrics.rougeL, + Metrics.faithfulness, + Metrics.extractiveness, + Metrics.bert_score, + ], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + summarization_cnn_dm, + summarization_xsum, + summarization_xsum_sampled, +] diff --git a/src/lighteval/tasks/tasks/swag.py b/src/lighteval/tasks/tasks/swag.py new file mode 100644 index 000000000..7743a1c47 --- /dev/null +++ b/src/lighteval/tasks/tasks/swag.py @@ -0,0 +1,51 @@ +""" +name: +Swag + +dataset: +allenai/swag + +abstract: +The dataset consists of 113k multiple choice questions about grounded situations +(73k training, 20k validation, 20k test). Each question is a video caption from +LSMDC or ActivityNet Captions, with four answer choices about what might happen +next in the scene. The correct answer is the (real) video caption for the next +event in the video; the three incorrect answers are adversarially generated and +human verified, so as to fool machines but not humans. SWAG aims to be a +benchmark for evaluating grounded commonsense NLI and for learning +representations. 
+ +languages: +english + +tags: +narrative, reasoning + +paper: +https://arxiv.org/abs/1808.05326 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +swag = LightevalTaskConfig( + name="swag", + suite=["lighteval"], + prompt_function=prompt.swag, + hf_repo="allenai/swag", + hf_subset="regular", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + swag, +] diff --git a/src/lighteval/tasks/tasks/synthetic_reasoning.py b/src/lighteval/tasks/tasks/synthetic_reasoning.py new file mode 100644 index 000000000..815e0e91a --- /dev/null +++ b/src/lighteval/tasks/tasks/synthetic_reasoning.py @@ -0,0 +1,122 @@ +""" +name: +Synthetic Reasoning + +dataset: +lighteval/synthetic_reasoning, lighteval/synthetic_reasoning_natural + +abstract: +LIME: Learning Inductive Bias for Primitives of Mathematical Reasoning + +languages: +english + +tags: +reasoning + +paper: +https://arxiv.org/abs/2206.03855 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +synthetic_reasoning_induction = LightevalTaskConfig( + name="synthetic_reasoning:induction", + suite=["lighteval"], + prompt_function=prompt.synthetic_reasoning, + hf_repo="lighteval/synthetic_reasoning", + hf_subset="induction", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + + +synthetic_reasoning_natural_easy = LightevalTaskConfig( + name="synthetic_reasoning:natural_easy", + suite=["lighteval"], + prompt_function=prompt.synthetic_reasoning_natural, + hf_repo="lighteval/synthetic_reasoning_natural", + hf_subset="easy", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + + +synthetic_reasoning_natural_hard = LightevalTaskConfig( + name="synthetic_reasoning:natural_hard", + suite=["lighteval"], + prompt_function=prompt.synthetic_reasoning_natural, + hf_repo="lighteval/synthetic_reasoning_natural", + hf_subset="hard", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + + +synthetic_reasoning_pattern_match = LightevalTaskConfig( + name="synthetic_reasoning:pattern_match", + suite=["lighteval"], + prompt_function=prompt.synthetic_reasoning, + hf_repo="lighteval/synthetic_reasoning", + hf_subset="pattern_match", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + + +synthetic_reasoning_variable_substitution = LightevalTaskConfig( + name="synthetic_reasoning:variable_substitution", + suite=["lighteval"], + prompt_function=prompt.synthetic_reasoning, + 
hf_repo="lighteval/synthetic_reasoning", + hf_subset="variable_substitution", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metrics=[ + Metrics.exact_match, + ], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + synthetic_reasoning_induction, + synthetic_reasoning_natural_easy, + synthetic_reasoning_natural_hard, + synthetic_reasoning_pattern_match, + synthetic_reasoning_variable_substitution, +] diff --git a/src/lighteval/tasks/tasks/the_pile.py b/src/lighteval/tasks/tasks/the_pile.py new file mode 100644 index 000000000..3ed26d94e --- /dev/null +++ b/src/lighteval/tasks/tasks/the_pile.py @@ -0,0 +1,351 @@ +""" +name: +The Pile + +dataset: +lighteval/pile_helm + +abstract: +The Pile corpus for measuring language model performance across various domains. + +languages: +english + +tags: +language-modeling + +paper: +https://arxiv.org/abs/2101.00027 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +the_pile_arxiv_helm = LightevalTaskConfig( + name="the_pile:arxiv", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="arxiv", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +the_pile_bibliotik_helm = LightevalTaskConfig( + name="the_pile:bibliotik", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="bibliotik", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +the_pile_commoncrawl_helm = LightevalTaskConfig( + name="the_pile:commoncrawl", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="commoncrawl", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +the_pile_dm_mathematics_helm = LightevalTaskConfig( + name="the_pile:dm-mathematics", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="dm-mathematics", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +the_pile_enron_helm = LightevalTaskConfig( + name="the_pile:enron", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="enron", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +the_pile_europarl_helm = LightevalTaskConfig( + name="the_pile:europarl", + suite=["lighteval"], + prompt_function=prompt.the_pile, +
hf_repo="lighteval/pile_helm", + hf_subset="europarl", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +the_pile_freelaw_helm = LightevalTaskConfig( + name="the_pile:freelaw", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="freelaw", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +the_pile_github_helm = LightevalTaskConfig( + name="the_pile:github", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="github", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +the_pile_gutenberg_helm = LightevalTaskConfig( + name="the_pile:gutenberg", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="gutenberg", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +the_pile_hackernews_helm = LightevalTaskConfig( + name="the_pile:hackernews", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="hackernews", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +the_pile_nih_exporter_helm = LightevalTaskConfig( + name="the_pile:nih-exporter", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="nih-exporter", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +the_pile_opensubtitles_helm = LightevalTaskConfig( + name="the_pile:opensubtitles", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="opensubtitles", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +the_pile_openwebtext2_helm = LightevalTaskConfig( + name="the_pile:openwebtext2", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="openwebtext2", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + + +the_pile_pubmed_abstracts_helm = LightevalTaskConfig( + 
name="the_pile:pubmed-abstracts", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="pubmed-abstracts", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +the_pile_pubmed_central_helm = LightevalTaskConfig( + name="the_pile:pubmed-central", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="pubmed-central", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +the_pile_stackexchange_helm = LightevalTaskConfig( + name="the_pile:stackexchange", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="stackexchange", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +the_pile_upsto_helm = LightevalTaskConfig( + name="the_pile:upsto", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="uspto", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +the_pile_wikipedia_helm = LightevalTaskConfig( + name="the_pile:wikipedia", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="wikipedia", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +the_pile_youtubesubtitles_helm = LightevalTaskConfig( + name="the_pile:youtubesubtitles", + suite=["lighteval"], + prompt_function=prompt.the_pile, + hf_repo="lighteval/pile_helm", + hf_subset="youtubesubtitles", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + the_pile_arxiv_helm, + the_pile_bibliotik_helm, + the_pile_commoncrawl_helm, + the_pile_dm_mathematics_helm, + the_pile_enron_helm, + the_pile_europarl_helm, + the_pile_freelaw_helm, + the_pile_github_helm, + the_pile_gutenberg_helm, + the_pile_hackernews_helm, + the_pile_nih_exporter_helm, + the_pile_opensubtitles_helm, + the_pile_openwebtext2_helm, + the_pile_pubmed_abstracts_helm, + the_pile_pubmed_central_helm, + the_pile_stackexchange_helm, + the_pile_upsto_helm, + the_pile_wikipedia_helm, + the_pile_youtubesubtitles_helm, +] diff --git a/src/lighteval/tasks/extended/tiny_benchmarks/main.py b/src/lighteval/tasks/tasks/tiny_benchmarks/main.py similarity index 86% rename from src/lighteval/tasks/extended/tiny_benchmarks/main.py rename to src/lighteval/tasks/tasks/tiny_benchmarks/main.py index 44e05d0cc..bb8d0c2d1 100644 --- 
a/src/lighteval/tasks/extended/tiny_benchmarks/main.py +++ b/src/lighteval/tasks/tasks/tiny_benchmarks/main.py @@ -1,29 +1,24 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team & Felipe Maia Polo +""" +name: +Tiny Benchmarks -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: +dataset: +tinyBenchmarks/tinyWinogrande, tinyBenchmarks/tinyAI2_arc, +tinyBenchmarks/tinyHellaswag, tinyBenchmarks/tinyMMLU, +tinyBenchmarks/tinyTruthfulQA, tinyBenchmarks/tinyGSM8k -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. +abstract: +TinyBenchmarks evaluates language models on curated ~100-example subsets of popular benchmarks +and provides estimators that predict full-benchmark scores from those few examples. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. +languages: +english -# ruff: noqa: F405, F403, F401 -"""See https://github.com/felipemaiapolo/tinyBenchmarks/ for the original code. +tags: +general-knowledge, reasoning, qa -Test with `python run_evals_accelerate.py --model_args "pretrained=EleutherAI/pythia-70m" --tasks "extended|tiny:winogrande|0,extended|tiny:gsm8k|0,extended|tiny:hellaswag|0,extended|tiny:arc|0,extended|tiny:truthfulqa|0" --extended_tasks extended_tasks --output_dir "./evals"` +paper: +https://arxiv.org/abs/2402.14992 """ import os @@ -249,7 +244,7 @@ def compute_corpus(self, y_input): task = LightevalTaskConfig( name=f"tiny:{name}", prompt_function=task["prompt"], - suite=["extended"], + suite=["lighteval"], hf_repo=task["dataset"], hf_subset=task["subset"], hf_avail_splits=task["splits"], diff --git a/src/lighteval/tasks/tasks/toxigen.py b/src/lighteval/tasks/tasks/toxigen.py new file mode 100644 index 000000000..c5e724a9d --- /dev/null +++ b/src/lighteval/tasks/tasks/toxigen.py @@ -0,0 +1,45 @@ +""" +name: +Toxigen + +dataset: +skg/toxigen-data + +abstract: +This dataset is for implicit hate speech detection. All instances were generated +using GPT-3 and the methods described in the ToxiGen paper.
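The ToxiGen config below treats this as a binary choice scored by log-likelihood. One plausible prompt formulation, as an illustrative sketch only: the field names (text, toxicity_ai, toxicity_human) and the toxicity threshold are assumptions borrowed from common harness recipes, not necessarily what prompt.toxigen does:

from lighteval.tasks.requests import Doc


def toxigen_prompt(line, task_name: str = None):
    # Assumed fields and threshold: flag the statement as toxic when the
    # combined human and AI toxicity ratings exceed 5.5.
    is_toxic = int((line["toxicity_ai"] + line["toxicity_human"]) > 5.5)
    return Doc(
        task_name=task_name,
        query=(
            "Is the following statement hateful? Respond with either Yes or No.\n"
            f"Statement: {line['text']}\nAnswer:"
        ),
        choices=[" No", " Yes"],
        gold_index=is_toxic,
    )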
+ +languages: +english + +tags: +generation, safety + +paper: +https://arxiv.org/abs/2203.09509 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +toxigen = LightevalTaskConfig( + name="toxigen", + suite=["lighteval"], + prompt_function=prompt.toxigen, + hf_repo="skg/toxigen-data", + hf_subset="annotated", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + toxigen, +] diff --git a/src/lighteval/tasks/tasks/triviaqa.py b/src/lighteval/tasks/tasks/triviaqa.py new file mode 100644 index 000000000..b3e13d553 --- /dev/null +++ b/src/lighteval/tasks/tasks/triviaqa.py @@ -0,0 +1,48 @@ +""" +name: +Triviaqa + +dataset: +mandarjoshi/trivia_qa + +abstract: +TriviaqQA is a reading comprehension dataset containing over 650K +question-answer-evidence triples. TriviaqQA includes 95K question-answer pairs +authored by trivia enthusiasts and independently gathered evidence documents, +six per question on average, that provide high quality distant supervision for +answering the questions. + +languages: +english + +tags: +qa + +paper: +https://arxiv.org/abs/1705.03551 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +triviaqa = LightevalTaskConfig( + name="triviaqa", + suite=["lighteval"], + prompt_function=prompt.triviaqa, + hf_repo="mandarjoshi/trivia_qa", + hf_subset="rc.nocontext", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metrics=[Metrics.exact_match], + stop_sequence=["\n", ".", ","], + version=0, +) + +TASKS_TABLE = [ + triviaqa, +] diff --git a/src/lighteval/tasks/tasks/truthfulqa.py b/src/lighteval/tasks/tasks/truthfulqa.py new file mode 100644 index 000000000..84db92ed6 --- /dev/null +++ b/src/lighteval/tasks/tasks/truthfulqa.py @@ -0,0 +1,61 @@ +""" +name: +Truthfulqa + +dataset: +EleutherAI/truthful_qa_mc + +abstract: +TruthfulQA: Measuring How Models Mimic Human Falsehoods + +languages: +english + +tags: +factuality, qa + +paper: +https://arxiv.org/abs/2109.07958 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +truthfulqa_gen = LightevalTaskConfig( + name="truthfulqa:gen", + suite=["lighteval"], + prompt_function=prompt.truthful_qa_generative, + hf_repo="truthfulqa/truthful_qa", + hf_subset="generation", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=200, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +truthfulqa_mc = LightevalTaskConfig( + name="truthfulqa:mc", + suite=["lighteval"], + prompt_function=prompt.truthful_qa_multiple_choice, + hf_repo="truthfulqa/truthful_qa", + hf_subset="multiple_choice", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.truthfulqa_mc_metrics], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + truthfulqa_gen, + truthfulqa_mc, +] diff --git 
a/src/lighteval/tasks/tasks/twitterAAE.py b/src/lighteval/tasks/tasks/twitterAAE.py new file mode 100644 index 000000000..dd9861f91 --- /dev/null +++ b/src/lighteval/tasks/tasks/twitterAAE.py @@ -0,0 +1,62 @@ +""" +name: +Twitteraae + +dataset: +lighteval/twitterAAE + +abstract: +Demographic Dialectal Variation in Social Media: A Case Study of African-American English + +languages: +english + +tags: +language-modeling + +paper: +https://aclanthology.org/D16-1120/ +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +twitterAAE_aa = LightevalTaskConfig( + name="twitterAAE:aa", + suite=["lighteval"], + prompt_function=prompt.twitter_aae, + hf_repo="lighteval/twitterAAE", + hf_subset="aa", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + + +twitterAAE_white = LightevalTaskConfig( + name="twitterAAE:white", + suite=["lighteval"], + prompt_function=prompt.twitter_aae, + hf_repo="lighteval/twitterAAE", + hf_subset="white", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + twitterAAE_aa, + twitterAAE_white, +] diff --git a/src/lighteval/tasks/tasks/unscramble.py b/src/lighteval/tasks/tasks/unscramble.py new file mode 100644 index 000000000..eb8335026 --- /dev/null +++ b/src/lighteval/tasks/tasks/unscramble.py @@ -0,0 +1,113 @@ +""" +name: +Unscramble + +dataset: +lighteval/GPT3_unscramble + +abstract: +Benchmark where we ask the model to unscramble a word, either anagram or +random insertion. 
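The twitterAAE entries above, like The Pile and WikiText elsewhere in this change, report word_perplexity, byte_perplexity and bits_per_byte. All three are different normalisations of the same summed log-likelihood; a sketch of that relationship, assuming the standard HELM-style definitions (illustrative only, not lighteval's internal code):

import math


def perplexity_metrics(sum_logprob: float, num_words: int, num_bytes: int) -> dict:
    # sum_logprob is the natural-log likelihood of the whole document.
    return {
        "word_perplexity": math.exp(-sum_logprob / num_words),
        "byte_perplexity": math.exp(-sum_logprob / num_bytes),
        # bits_per_byte is log2(byte_perplexity).
        "bits_per_byte": -sum_logprob / (num_bytes * math.log(2)),
    }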
+ +languages: +english + +tags: +language-modeling, reasoning + +paper: +https://huggingface.co/datasets/lighteval/GPT3_unscramble +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +unscramble_anagrams1 = LightevalTaskConfig( + name="unscramble:anagrams1", + suite=["lighteval"], + prompt_function=prompt.unscramble, + hf_repo="lighteval/GPT3_unscramble", + hf_subset="default", + hf_avail_splits=["mid_word_1_anagrams"], + evaluation_splits=["mid_word_1_anagrams"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +unscramble_anagrams2 = LightevalTaskConfig( + name="unscramble:anagrams2", + suite=["lighteval"], + prompt_function=prompt.unscramble, + hf_repo="lighteval/GPT3_unscramble", + hf_subset="default", + hf_avail_splits=["mid_word_2_anagrams"], + evaluation_splits=["mid_word_2_anagrams"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +unscramble_cycle_letters = LightevalTaskConfig( + name="unscramble:cycle_letters", + suite=["lighteval"], + prompt_function=prompt.unscramble, + hf_repo="lighteval/GPT3_unscramble", + hf_subset="default", + hf_avail_splits=["cycle_letters_in_word"], + evaluation_splits=["cycle_letters_in_word"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +unscramble_random_insertion = LightevalTaskConfig( + name="unscramble:random_insertion", + suite=["lighteval"], + prompt_function=prompt.unscramble, + hf_repo="lighteval/GPT3_unscramble", + hf_subset="default", + hf_avail_splits=["random_insertion_in_word"], + evaluation_splits=["random_insertion_in_word"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +unscramble_reversed_words = LightevalTaskConfig( + name="unscramble:reversed_words", + suite=["lighteval"], + prompt_function=prompt.unscramble, + hf_repo="lighteval/GPT3_unscramble", + hf_subset="default", + hf_avail_splits=["reversed_words"], + evaluation_splits=["reversed_words"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metrics=[Metrics.exact_match(sample_params={"strip_strings": False})], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + unscramble_anagrams1, + unscramble_anagrams2, + unscramble_cycle_letters, + unscramble_random_insertion, + unscramble_reversed_words, +] diff --git a/src/lighteval/tasks/tasks/webqs.py b/src/lighteval/tasks/tasks/webqs.py new file mode 100644 index 000000000..493b83f75 --- /dev/null +++ b/src/lighteval/tasks/tasks/webqs.py @@ -0,0 +1,47 @@ +""" +name: +Webqs + +dataset: +stanfordnlp/web_questions + +abstract: +This dataset consists of 6,642 question/answer pairs. The questions are supposed +to be answerable by Freebase, a large knowledge graph. The questions are mostly +centered around a single named entity. The questions are popular ones asked on +the web. 
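One detail in the unscramble entries above: lighteval metrics can be re-parametrised per task, and these configs pass sample_params={"strip_strings": False} to exact_match so that whitespace in the generated answer is compared verbatim. A small illustration of what that flag changes (a sketch of the behaviour, not lighteval's implementation):

def exact_match(prediction: str, gold: str, strip_strings: bool = True) -> int:
    # With strip_strings=True (the default), surrounding whitespace is ignored.
    if strip_strings:
        prediction, gold = prediction.strip(), gold.strip()
    return int(prediction == gold)


assert exact_match(" cat\n", "cat") == 1                       # default behaviour
assert exact_match(" cat\n", "cat", strip_strings=False) == 0  # unscramble setting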
+ +languages: +english + +tags: +qa + +paper: +https://aclanthology.org/D13-1160.pdf +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +webqs = LightevalTaskConfig( + name="webqs", + suite=["lighteval"], + prompt_function=prompt.webqs, + hf_repo="stanfordnlp/web_questions", + hf_subset="default", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + webqs, +] diff --git a/src/lighteval/tasks/tasks/wikifact.py b/src/lighteval/tasks/tasks/wikifact.py new file mode 100644 index 000000000..592491379 --- /dev/null +++ b/src/lighteval/tasks/tasks/wikifact.py @@ -0,0 +1,1453 @@ +""" +name: +Wikifact + +dataset: +lighteval/wikifact + +abstract: +Extensively test factual knowledge. + +languages: +english + +tags: +factuality, knowledge + +paper: +https://aclanthology.org/D19-1250/ +""" + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks import default_prompts as prompt +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +wikifact_applies_to_jurisdiction = LightevalTaskConfig( + name="wikifact:applies_to_jurisdiction", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="applies_to_jurisdiction", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_atomic_number = LightevalTaskConfig( + name="wikifact:atomic_number", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="atomic_number", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_author = LightevalTaskConfig( + name="wikifact:author", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="author", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_award_received = LightevalTaskConfig( + name="wikifact:award_received", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="award_received", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_basic_form_of_government = LightevalTaskConfig( + name="wikifact:basic_form_of_government", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="basic_form_of_government", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_capital = LightevalTaskConfig( + name="wikifact:capital", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="capital", + 
hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_capital_of = LightevalTaskConfig( + name="wikifact:capital_of", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="capital_of", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_central_bank = LightevalTaskConfig( + name="wikifact:central_bank", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="central_bank", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_composer = LightevalTaskConfig( + name="wikifact:composer", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="composer", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_continent = LightevalTaskConfig( + name="wikifact:continent", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="continent", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_country = LightevalTaskConfig( + name="wikifact:country", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="country", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_country_of_citizenship = LightevalTaskConfig( + name="wikifact:country_of_citizenship", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="country_of_citizenship", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_country_of_origin = LightevalTaskConfig( + name="wikifact:country_of_origin", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="country_of_origin", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_creator = LightevalTaskConfig( + name="wikifact:creator", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="creator", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_currency = LightevalTaskConfig( + name="wikifact:currency", + suite=["lighteval"], 
+ prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="currency", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_defendant = LightevalTaskConfig( + name="wikifact:defendant", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="defendant", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_developer = LightevalTaskConfig( + name="wikifact:developer", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="developer", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_diplomatic_relation = LightevalTaskConfig( + name="wikifact:diplomatic_relation", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="diplomatic_relation", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_director = LightevalTaskConfig( + name="wikifact:director", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="director", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_discoverer_or_inventor = LightevalTaskConfig( + name="wikifact:discoverer_or_inventor", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="discoverer_or_inventor", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_drug_or_therapy_used_for_treatment = LightevalTaskConfig( + name="wikifact:drug_or_therapy_used_for_treatment", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="drug_or_therapy_used_for_treatment", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_educated_at = LightevalTaskConfig( + name="wikifact:educated_at", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="educated_at", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_electron_configuration = LightevalTaskConfig( + name="wikifact:electron_configuration", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="electron_configuration", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_employer = LightevalTaskConfig( + name="wikifact:employer", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="employer", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_field_of_work = LightevalTaskConfig( + name="wikifact:field_of_work", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="field_of_work", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_file_extension = LightevalTaskConfig( + name="wikifact:file_extension", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="file_extension", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_genetic_association = LightevalTaskConfig( + name="wikifact:genetic_association", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="genetic_association", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_genre = LightevalTaskConfig( + name="wikifact:genre", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="genre", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_has_part = LightevalTaskConfig( + name="wikifact:has_part", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="has_part", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_head_of_government = LightevalTaskConfig( + name="wikifact:head_of_government", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="head_of_government", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_head_of_state = LightevalTaskConfig( + name="wikifact:head_of_state", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="head_of_state", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_headquarters_location = LightevalTaskConfig( + name="wikifact:headquarters_location", + suite=["lighteval"], + 
prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="headquarters_location", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_industry = LightevalTaskConfig( + name="wikifact:industry", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="industry", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_influenced_by = LightevalTaskConfig( + name="wikifact:influenced_by", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="influenced_by", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_instance_of = LightevalTaskConfig( + name="wikifact:instance_of", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="instance_of", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_instrument = LightevalTaskConfig( + name="wikifact:instrument", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="instrument", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_language_of_work_or_name = LightevalTaskConfig( + name="wikifact:language_of_work_or_name", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="language_of_work_or_name", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_languages_spoken_written_or_signed = LightevalTaskConfig( + name="wikifact:languages_spoken_written_or_signed", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="languages_spoken_written_or_signed", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_laws_applied = LightevalTaskConfig( + name="wikifact:laws_applied", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="laws_applied", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_located_in_the_administrative_territorial_entity = LightevalTaskConfig( + name="wikifact:located_in_the_administrative_territorial_entity", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + 
hf_subset="located_in_the_administrative_territorial_entity", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_location = LightevalTaskConfig( + name="wikifact:location", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="location", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_location_of_discovery = LightevalTaskConfig( + name="wikifact:location_of_discovery", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="location_of_discovery", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_location_of_formation = LightevalTaskConfig( + name="wikifact:location_of_formation", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="location_of_formation", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_majority_opinion_by = LightevalTaskConfig( + name="wikifact:majority_opinion_by", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="majority_opinion_by", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_manufacturer = LightevalTaskConfig( + name="wikifact:manufacturer", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="manufacturer", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_measured_physical_quantity = LightevalTaskConfig( + name="wikifact:measured_physical_quantity", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="measured_physical_quantity", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_medical_condition_treated = LightevalTaskConfig( + name="wikifact:medical_condition_treated", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="medical_condition_treated", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_member_of = LightevalTaskConfig( + name="wikifact:member_of", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="member_of", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_member_of_political_party = LightevalTaskConfig( + name="wikifact:member_of_political_party", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="member_of_political_party", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_member_of_sports_team = LightevalTaskConfig( + name="wikifact:member_of_sports_team", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="member_of_sports_team", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_movement = LightevalTaskConfig( + name="wikifact:movement", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="movement", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_named_after = LightevalTaskConfig( + name="wikifact:named_after", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="named_after", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_native_language = LightevalTaskConfig( + name="wikifact:native_language", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="native_language", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_number_of_processor_cores = LightevalTaskConfig( + name="wikifact:number_of_processor_cores", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="number_of_processor_cores", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_occupation = LightevalTaskConfig( + name="wikifact:occupation", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="occupation", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_office_held_by_head_of_government = LightevalTaskConfig( + name="wikifact:office_held_by_head_of_government", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="office_held_by_head_of_government", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + 
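# Every wikifact entry in this file follows the same template and differs only
# in name and hf_subset, so the whole table could equally be generated from a
# list of relation names. The helper below is an illustrative sketch (reusing
# the module's imports above), not code from the diff; the file keeps the
# explicit per-relation form, which is easier to grep and to diff.
def _wikifact_task(relation: str) -> LightevalTaskConfig:
    return LightevalTaskConfig(
        name=f"wikifact:{relation}",
        suite=["lighteval"],
        prompt_function=prompt.wikifact,
        hf_repo="lighteval/wikifact",
        hf_subset=relation,
        hf_avail_splits=["train", "test"],
        evaluation_splits=["test"],
        few_shots_split=None,
        few_shots_select=None,
        generation_size=8,
        metrics=[Metrics.exact_match],
        stop_sequence=["\n"],
        version=0,
    )
# e.g. [_wikifact_task(r) for r in ("author", "capital", "currency")]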
+wikifact_office_held_by_head_of_state = LightevalTaskConfig( + name="wikifact:office_held_by_head_of_state", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="office_held_by_head_of_state", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_official_language = LightevalTaskConfig( + name="wikifact:official_language", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="official_language", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_operating_system = LightevalTaskConfig( + name="wikifact:operating_system", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="operating_system", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_original_language_of_film_or_TV_show = LightevalTaskConfig( + name="wikifact:original_language_of_film_or_TV_show", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="original_language_of_film_or_TV_show", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_original_network = LightevalTaskConfig( + name="wikifact:original_network", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="original_network", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_overrules = LightevalTaskConfig( + name="wikifact:overrules", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="overrules", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_owned_by = LightevalTaskConfig( + name="wikifact:owned_by", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="owned_by", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_part_of = LightevalTaskConfig( + name="wikifact:part_of", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="part_of", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_participating_team = LightevalTaskConfig( + name="wikifact:participating_team", + suite=["lighteval"], + prompt_function=prompt.wikifact, + 
hf_repo="lighteval/wikifact", + hf_subset="participating_team", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_place_of_birth = LightevalTaskConfig( + name="wikifact:place_of_birth", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="place_of_birth", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_place_of_death = LightevalTaskConfig( + name="wikifact:place_of_death", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="place_of_death", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_plaintiff = LightevalTaskConfig( + name="wikifact:plaintiff", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="plaintiff", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_position_held = LightevalTaskConfig( + name="wikifact:position_held", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="position_held", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_position_played_on_team = LightevalTaskConfig( + name="wikifact:position_played_on_team", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="position_played_on_team", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_programming_language = LightevalTaskConfig( + name="wikifact:programming_language", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="programming_language", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_recommended_unit_of_measurement = LightevalTaskConfig( + name="wikifact:recommended_unit_of_measurement", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="recommended_unit_of_measurement", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_record_label = LightevalTaskConfig( + name="wikifact:record_label", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="record_label", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_religion = LightevalTaskConfig( + name="wikifact:religion", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="religion", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_repealed_by = LightevalTaskConfig( + name="wikifact:repealed_by", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="repealed_by", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_shares_border_with = LightevalTaskConfig( + name="wikifact:shares_border_with", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="shares_border_with", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_solved_by = LightevalTaskConfig( + name="wikifact:solved_by", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="solved_by", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_statement_describes = LightevalTaskConfig( + name="wikifact:statement_describes", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="statement_describes", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_stock_exchange = LightevalTaskConfig( + name="wikifact:stock_exchange", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="stock_exchange", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_subclass_of = LightevalTaskConfig( + name="wikifact:subclass_of", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="subclass_of", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_subsidiary = LightevalTaskConfig( + name="wikifact:subsidiary", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="subsidiary", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_symptoms_and_signs = LightevalTaskConfig( + name="wikifact:symptoms_and_signs", + suite=["lighteval"], + prompt_function=prompt.wikifact, + 
hf_repo="lighteval/wikifact", + hf_subset="symptoms_and_signs", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_therapeutic_area = LightevalTaskConfig( + name="wikifact:therapeutic_area", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="therapeutic_area", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_time_of_discovery_or_invention = LightevalTaskConfig( + name="wikifact:time_of_discovery_or_invention", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="time_of_discovery_or_invention", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_twinned_administrative_body = LightevalTaskConfig( + name="wikifact:twinned_administrative_body", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="twinned_administrative_body", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +wikifact_work_location = LightevalTaskConfig( + name="wikifact:work_location", + suite=["lighteval"], + prompt_function=prompt.wikifact, + hf_repo="lighteval/wikifact", + hf_subset="work_location", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metrics=[Metrics.exact_match], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + wikifact_applies_to_jurisdiction, + wikifact_atomic_number, + wikifact_author, + wikifact_employer, + wikifact_field_of_work, + wikifact_file_extension, + wikifact_genetic_association, + wikifact_instrument, + wikifact_language_of_work_or_name, + wikifact_languages_spoken_written_or_signed, + wikifact_laws_applied, + wikifact_located_in_the_administrative_territorial_entity, + wikifact_location, + wikifact_location_of_discovery, + wikifact_location_of_formation, + wikifact_member_of, + wikifact_member_of_political_party, + wikifact_member_of_sports_team, + wikifact_movement, + wikifact_headquarters_location, + wikifact_industry, + wikifact_named_after, + wikifact_native_language, + wikifact_number_of_processor_cores, + wikifact_occupation, + wikifact_original_language_of_film_or_TV_show, + wikifact_original_network, + wikifact_overrules, + wikifact_owned_by, + wikifact_part_of, + wikifact_participating_team, + wikifact_place_of_birth, + wikifact_place_of_death, + wikifact_position_played_on_team, + wikifact_programming_language, + wikifact_recommended_unit_of_measurement, + wikifact_record_label, + wikifact_religion, + wikifact_repealed_by, + wikifact_shares_border_with, + wikifact_solved_by, + wikifact_statement_describes, + wikifact_stock_exchange, + wikifact_subclass_of, + wikifact_subsidiary, + wikifact_symptoms_and_signs, + wikifact_therapeutic_area, + wikifact_time_of_discovery_or_invention, + wikifact_twinned_administrative_body, + wikifact_work_location, +] diff --git 
a/src/lighteval/tasks/tasks/wikitext.py b/src/lighteval/tasks/tasks/wikitext.py new file mode 100644 index 000000000..a6f62e90b --- /dev/null +++ b/src/lighteval/tasks/tasks/wikitext.py @@ -0,0 +1,47 @@ +""" +name: +Wikitext + +dataset: +EleutherAI/wikitext_document_level + +abstract: +The WikiText language modeling dataset is a collection of over 100 million +tokens extracted from the set of verified Good and Featured articles on +Wikipedia. The dataset is available under the Creative Commons +Attribution-ShareAlike License. + +languages: +english + +tags: +language-modeling + +paper: +https://arxiv.org/abs/1609.07843 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +wikitext_103_document_level = LightevalTaskConfig( + name="wikitext:103:document_level", + suite=["lighteval"], + prompt_function=prompt.wikitext_helm, + hf_repo="EleutherAI/wikitext_document_level", + hf_subset="wikitext-103-raw-v1", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + wikitext_103_document_level, +] diff --git a/src/lighteval/tasks/tasks/winogrande.py b/src/lighteval/tasks/tasks/winogrande.py new file mode 100644 index 000000000..bcc49899b --- /dev/null +++ b/src/lighteval/tasks/tasks/winogrande.py @@ -0,0 +1,48 @@ +""" +name: +Winogrande + +dataset: +allenai/winogrande + +abstract: +WinoGrande is a new collection of 44k problems, inspired by Winograd Schema +Challenge (Levesque, Davis, and Morgenstern 2011), but adjusted to improve the +scale and robustness against the dataset-specific bias. Formulated as a +fill-in-a-blank task with binary options, the goal is to choose the right option +for a given sentence which requires commonsense reasoning. + +languages: +english + +tags: +commonsense, multiple-choice + +paper: +https://arxiv.org/abs/1907.10641 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +winogrande = LightevalTaskConfig( + name="winogrande", + suite=["lighteval"], + prompt_function=prompt.winogrande, + hf_repo="allenai/winogrande", + hf_subset="winogrande_xl", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + winogrande, +] diff --git a/src/lighteval/tasks/tasks/xcopa.py b/src/lighteval/tasks/tasks/xcopa.py new file mode 100644 index 000000000..6b51be639 --- /dev/null +++ b/src/lighteval/tasks/tasks/xcopa.py @@ -0,0 +1,233 @@ +""" +name: +Xcopa + +dataset: +cambridgeltl/xcopa + +abstract: +XCOPA: A Multilingual Dataset for Causal Commonsense Reasoning The Cross-lingual +Choice of Plausible Alternatives dataset is a benchmark to evaluate the ability +of machine learning models to transfer commonsense reasoning across languages. 
+ +languages: +english, estonian, haitian creole, italian, indonesian, quechua, swahili, chinese, tamil, thai, turkish, vietnamese + +tags: +commonsense, multilingual, multiple-choice, reasoning + +paper: +https://arxiv.org/abs/2005.00333 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +xcopa_en = LightevalTaskConfig( +    name="xcopa:en", +    suite=["lighteval"], +    prompt_function=prompt.xcopa_en, +    hf_repo="cambridgeltl/xcopa", +    hf_subset="default", +    hf_avail_splits=["test", "train", "validation"], +    evaluation_splits=["test"], +    few_shots_split=None, +    few_shots_select=None, +    generation_size=-1, +    metrics=[Metrics.loglikelihood_acc], +    stop_sequence=["\n"], +    version=0, +) + +xcopa_et = LightevalTaskConfig( +    name="xcopa:et", +    suite=["lighteval"], +    prompt_function=prompt.xcopa_et, +    hf_repo="cambridgeltl/xcopa", +    hf_subset="et", +    hf_avail_splits=["test", "train", "validation"], +    evaluation_splits=["test"], +    few_shots_split=None, +    few_shots_select=None, +    generation_size=-1, +    metrics=[Metrics.loglikelihood_acc], +    stop_sequence=["\n"], +    version=0, +) + +xcopa_ht = LightevalTaskConfig( +    name="xcopa:ht", +    suite=["lighteval"], +    prompt_function=prompt.xcopa_ht, +    hf_repo="cambridgeltl/xcopa", +    hf_subset="ht", +    hf_avail_splits=["test", "train", "validation"], +    evaluation_splits=["test"], +    few_shots_split=None, +    few_shots_select=None, +    generation_size=-1, +    metrics=[Metrics.loglikelihood_acc], +    stop_sequence=["\n"], +    version=0, +) + +xcopa_it = LightevalTaskConfig( +    name="xcopa:it", +    suite=["lighteval"], +    prompt_function=prompt.xcopa_it, +    hf_repo="cambridgeltl/xcopa", +    hf_subset="it", +    hf_avail_splits=["test", "train", "validation"], +    evaluation_splits=["test"], +    few_shots_split=None, +    few_shots_select=None, +    generation_size=-1, +    metrics=[Metrics.loglikelihood_acc], +    stop_sequence=["\n"], +    version=0, +) + +xcopa_id = LightevalTaskConfig( +    name="xcopa:id", +    suite=["lighteval"], +    prompt_function=prompt.xcopa_id, +    hf_repo="cambridgeltl/xcopa", +    hf_subset="id", +    hf_avail_splits=["test", "train", "validation"], +    evaluation_splits=["test"], +    few_shots_split=None, +    few_shots_select=None, +    generation_size=-1, +    metrics=[Metrics.loglikelihood_acc], +    stop_sequence=["\n"], +    version=0, +) + +xcopa_qu = LightevalTaskConfig( +    name="xcopa:qu", +    suite=["lighteval"], +    prompt_function=prompt.xcopa_qu, +    hf_repo="cambridgeltl/xcopa", +    hf_subset="qu", +    hf_avail_splits=["test", "train", "validation"], +    evaluation_splits=["test"], +    few_shots_split=None, +    few_shots_select=None, +    generation_size=-1, +    metrics=[Metrics.loglikelihood_acc], +    stop_sequence=["\n"], +    version=0, +) + +xcopa_sw = LightevalTaskConfig( +    name="xcopa:sw", +    suite=["lighteval"], +    prompt_function=prompt.xcopa_sw, +    hf_repo="cambridgeltl/xcopa", +    hf_subset="sw", +    hf_avail_splits=["test", "train", "validation"], +    evaluation_splits=["test"], +    few_shots_split=None, +    few_shots_select=None, +    generation_size=-1, +    metrics=[Metrics.loglikelihood_acc], +    stop_sequence=["\n"], +    version=0, +) + +xcopa_zh = LightevalTaskConfig( +    name="xcopa:zh", +    suite=["lighteval"], +    prompt_function=prompt.xcopa_zh, +    hf_repo="cambridgeltl/xcopa", +    hf_subset="zh", +    hf_avail_splits=["test", "train", "validation"], +    evaluation_splits=["test"], +    few_shots_split=None, +    few_shots_select=None, +    generation_size=-1, +    metrics=[Metrics.loglikelihood_acc], +    stop_sequence=["\n"], +    version=0, +) + +xcopa_ta = LightevalTaskConfig( +    name="xcopa:ta", +    suite=["lighteval"], + 
prompt_function=prompt.xcopa_ta, + hf_repo="cambridgeltl/xcopa", + hf_subset="ta", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +xcopa_th = LightevalTaskConfig( + name="xcopa:th", + suite=["lighteval"], + prompt_function=prompt.xcopa_th, + hf_repo="cambridgeltl/xcopa", + hf_subset="th", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +xcopa_tr = LightevalTaskConfig( + name="xcopa:tr", + suite=["lighteval"], + prompt_function=prompt.xcopa_tr, + hf_repo="cambridgeltl/xcopa", + hf_subset="tr", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +xcopa_vi = LightevalTaskConfig( + name="xcopa:vi", + suite=["lighteval"], + prompt_function=prompt.xcopa_vi, + hf_repo="cambridgeltl/xcopa", + hf_subset="vi", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + xcopa_en, + xcopa_et, + xcopa_ht, + xcopa_it, + xcopa_id, + xcopa_qu, + xcopa_sw, + xcopa_zh, + xcopa_ta, + xcopa_th, + xcopa_tr, + xcopa_vi, +] diff --git a/src/lighteval/tasks/tasks/xstory_cloze.py b/src/lighteval/tasks/tasks/xstory_cloze.py new file mode 100644 index 000000000..96caef9b5 --- /dev/null +++ b/src/lighteval/tasks/tasks/xstory_cloze.py @@ -0,0 +1,215 @@ +""" +name: +Xstory Cloze + +dataset: +juletxara/xstory_cloze + +abstract: +XStoryCloze consists of the professionally translated version of the English +StoryCloze dataset (Spring 2016 version) to 10 non-English languages. This +dataset is released by Meta AI. 
+ +languages: +english, russian, chinese, spanish, arabic, hindi, indonesian, telugu, swahili, basque, burmese + +tags: +multilingual, narrative, reasoning + +paper: +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +xstory_cloze_en = LightevalTaskConfig( + name="xstory_cloze:en", + suite=["lighteval"], + prompt_function=prompt.storycloze, + hf_repo="juletxara/xstory_cloze", + hf_subset="en", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +xstory_cloze_ru = LightevalTaskConfig( + name="xstory_cloze:ru", + suite=["lighteval"], + prompt_function=prompt.storycloze, + hf_repo="juletxara/xstory_cloze", + hf_subset="ru", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +xstory_cloze_zh = LightevalTaskConfig( + name="xstory_cloze:zh", + suite=["lighteval"], + prompt_function=prompt.storycloze, + hf_repo="juletxara/xstory_cloze", + hf_subset="zh", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +xstory_cloze_es = LightevalTaskConfig( + name="xstory_cloze:es", + suite=["lighteval"], + prompt_function=prompt.storycloze, + hf_repo="juletxara/xstory_cloze", + hf_subset="es", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +xstory_cloze_ar = LightevalTaskConfig( + name="xstory_cloze:ar", + suite=["lighteval"], + prompt_function=prompt.storycloze, + hf_repo="juletxara/xstory_cloze", + hf_subset="ar", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +xstory_cloze_hi = LightevalTaskConfig( + name="xstory_cloze:hi", + suite=["lighteval"], + prompt_function=prompt.storycloze, + hf_repo="juletxara/xstory_cloze", + hf_subset="hi", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +xstory_cloze_id = LightevalTaskConfig( + name="xstory_cloze:id", + suite=["lighteval"], + prompt_function=prompt.storycloze, + hf_repo="juletxara/xstory_cloze", + hf_subset="id", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +xstory_cloze_te = LightevalTaskConfig( + name="xstory_cloze:te", + suite=["lighteval"], + prompt_function=prompt.storycloze, + hf_repo="juletxara/xstory_cloze", + hf_subset="te", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + 
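Editor's note (not part of this patch): the per-language XStoryCloze configs in this file differ only in name and hf_subset. Below is a minimal sketch of how the same family could be generated from a single template; the helper names _XSTORY_LANGS and _xstory_cloze_tasks are hypothetical, and every parameter value is taken directly from the configs shown in this diff.

# Hypothetical editorial sketch, not part of the patch: builds the same
# XStoryCloze task family with a list comprehension instead of repeated blocks.
import lighteval.tasks.default_prompts as prompt
from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig

_XSTORY_LANGS = ["en", "ru", "zh", "es", "ar", "hi", "id", "te", "sw", "eu", "my"]

_xstory_cloze_tasks = [
    LightevalTaskConfig(
        name=f"xstory_cloze:{lang}",
        suite=["lighteval"],
        prompt_function=prompt.storycloze,
        hf_repo="juletxara/xstory_cloze",
        hf_subset=lang,
        hf_avail_splits=["training", "eval"],
        evaluation_splits=["eval"],
        few_shots_split=None,
        few_shots_select=None,
        generation_size=-1,
        metrics=[Metrics.loglikelihood_acc],
        stop_sequence=["\n"],
        version=0,
    )
    for lang in _XSTORY_LANGS
]
# The module's TASKS_TABLE would then simply be: TASKS_TABLE = _xstory_cloze_tasks

Writing each config out explicitly, as the patch does, keeps every task greppable by name; the loop is only shown here to make the shared pattern easier to see.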
+xstory_cloze_sw = LightevalTaskConfig( + name="xstory_cloze:sw", + suite=["lighteval"], + prompt_function=prompt.storycloze, + hf_repo="juletxara/xstory_cloze", + hf_subset="sw", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +xstory_cloze_eu = LightevalTaskConfig( + name="xstory_cloze:eu", + suite=["lighteval"], + prompt_function=prompt.storycloze, + hf_repo="juletxara/xstory_cloze", + hf_subset="eu", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +xstory_cloze_my = LightevalTaskConfig( + name="xstory_cloze:my", + suite=["lighteval"], + prompt_function=prompt.storycloze, + hf_repo="juletxara/xstory_cloze", + hf_subset="my", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + xstory_cloze_en, + xstory_cloze_ru, + xstory_cloze_zh, + xstory_cloze_es, + xstory_cloze_ar, + xstory_cloze_hi, + xstory_cloze_id, + xstory_cloze_te, + xstory_cloze_sw, + xstory_cloze_eu, + xstory_cloze_my, +] diff --git a/src/lighteval/tasks/tasks/xwinograd.py b/src/lighteval/tasks/tasks/xwinograd.py new file mode 100644 index 000000000..c692c5803 --- /dev/null +++ b/src/lighteval/tasks/tasks/xwinograd.py @@ -0,0 +1,129 @@ +""" +name: +Xwinograd + +dataset: +Muennighoff/xwinograd + +abstract: +Multilingual winograd schema challenge as used in Crosslingual Generalization through Multitask Finetuning. 
+ +languages: +english, french, japanese, portuguese, russian, chinese + +tags: +commonsense, multilingual, reasoning + +paper: +https://arxiv.org/abs/2211.01786 +""" + +import lighteval.tasks.default_prompts as prompt +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +xwinograd_en = LightevalTaskConfig( + name="xwinograd:en", + suite=["lighteval"], + prompt_function=prompt.winogrande, + hf_repo="Muennighoff/xwinograd", + hf_subset="en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +xwinograd_fr = LightevalTaskConfig( + name="xwinograd:fr", + suite=["lighteval"], + prompt_function=prompt.winogrande, + hf_repo="Muennighoff/xwinograd", + hf_subset="fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +xwinograd_jp = LightevalTaskConfig( + name="xwinograd:jp", + suite=["lighteval"], + prompt_function=prompt.winogrande, + hf_repo="Muennighoff/xwinograd", + hf_subset="jp", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +xwinograd_pt = LightevalTaskConfig( + name="xwinograd:pt", + suite=["lighteval"], + prompt_function=prompt.winogrande, + hf_repo="Muennighoff/xwinograd", + hf_subset="pt", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +xwinograd_ru = LightevalTaskConfig( + name="xwinograd:ru", + suite=["lighteval"], + prompt_function=prompt.winogrande, + hf_repo="Muennighoff/xwinograd", + hf_subset="ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +xwinograd_zh = LightevalTaskConfig( + name="xwinograd:zh", + suite=["lighteval"], + prompt_function=prompt.winogrande, + hf_repo="Muennighoff/xwinograd", + hf_subset="zh", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metrics=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + version=0, +) + +TASKS_TABLE = [ + xwinograd_en, + xwinograd_fr, + xwinograd_jp, + xwinograd_pt, + xwinograd_ru, + xwinograd_zh, +] diff --git a/src/lighteval/utils/cache_management.py b/src/lighteval/utils/cache_management.py index 3e8c0a08a..e5764a04b 100644 --- a/src/lighteval/utils/cache_management.py +++ b/src/lighteval/utils/cache_management.py @@ -79,7 +79,6 @@ def __init__(self, model_config: ModelConfig): Args: model_config: Configuration for the model being cached - cache_dir: Directory to store cache files """ self.model_config = model_config self.model_hash = self.get_model_hash(model_config) @@ -213,7 +212,6 @@ def _load_sample(self, sample: pd.core.series.Series | dict) -> Union[dict, Mode Args: sample: Raw sample data from cache, arrives as a dataframe row - sample_type: Type of sample being loaded Returns: Union[dict, ModelResponse]: Loaded sample in appropriate format for processing @@ -360,7 +358,7 @@ def 
cached(sampling_method: SamplingMethod = None): # noqa C901 Decorator to cache method results based on Doc inputs. Args: - cache_type_name: Type of cache ("tokenization" or "predictions") + sampling_method: Sampling method to cache Usage: @cached(SamplingMethod.GENERATIVE) diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|arc:challenge|25_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|arc:challenge|25_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index df81532e4..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|arc:challenge|25_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d2dce4416d022cb704a77d63dcbacc99e148cb598186f88f33e7b1c5c019335e -size 87199 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|hellaswag|10_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|hellaswag|10_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 9f9639216..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|hellaswag|10_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ac904dbbbd26b93de90df7400242713a359207985d5f4c4f75d31ee9bb3325f -size 106015 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|mmlu:college_chemistry|5_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|mmlu:college_chemistry|5_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 86eb5a1ce..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|mmlu:college_chemistry|5_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e52b3dd01e79fa7028396bad84f6fba4d653fe6ede17a74cf1829115f809fdbe -size 36114 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|mmlu:us_foreign_policy|5_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|mmlu:us_foreign_policy|5_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index f51f7ad89..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|mmlu:us_foreign_policy|5_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:73de608e18e75e21cd832c09aecd13f6e7a0dbb91f113cb4cb6f8984be474d77 -size 36635 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|truthfulqa:mc|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|truthfulqa:mc|0_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 50cc5802f..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_leaderboard|truthfulqa:mc|0_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dc795a85bcb77084b1275bfadfe2c613a3b44543a6184e3ffd32bc4588d8d64f -size 25269 diff --git 
a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:aqua-rat|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:aqua-rat|0_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 2ca8fcfc0..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:aqua-rat|0_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2e75e6460dd0c3ba833b74c19b4943b1baa0f266e5207895454a54019dc9cbf6 -size 21944 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:aqua-rat|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:aqua-rat|0_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..29fcc86f2 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:aqua-rat|0_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bb8f6798f1556468a715ef990a090a74149242ca44be87c4908966e7c18f684 +size 21839 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:logiqa-en|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:logiqa-en|0_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 675c2125e..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:logiqa-en|0_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6c96e81a70ef68946e7e83e30a9ef5dd5c04a4e8de215a021de33d4e841ec502 -size 34133 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:logiqa-en|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:logiqa-en|0_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..222e73463 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:logiqa-en|0_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e820d31ec994386562144504b28116960c48ee649fefa887c11cc10a6dc12373 +size 34072 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-ar|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-ar|0_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index b5d4632ed..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-ar|0_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ebf20030a92a27e15144e4f2071c419edafd1ae9d0e8fe7b9bc38a3edf7a181e -size 30775 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-ar|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-ar|0_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..7cd541d5d --- /dev/null +++ 
b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-ar|0_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f42202e916ecc484879e824801e85d4965cf83b466199241734dfacd7f5f07d +size 30714 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-lr|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-lr|0_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 811989b76..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-lr|0_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:01db21e17415bb49be149cf25da813faadfb6bac3b127ba246ae3dbcf96685d7 -size 39431 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-lr|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-lr|0_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..41aa908a7 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-lr|0_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f79f38ee2bf762a43bf75326f02fbf373a8b54f004764c51de05805da48378b2 +size 39384 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-rc|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-rc|0_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 670c7475b..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-rc|0_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5ff511fe233f3fa5d057ca06671779dd8acd990c195ac3132636d1612cb17dcd -size 74222 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-rc|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-rc|0_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..45062f426 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:lsat-rc|0_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71090b25c032493e4ec26cada301343397043222143d55525d4049d0cfe2fea2 +size 74176 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en-without-passage|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en-without-passage|0_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index af81308bc..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en-without-passage|0_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c2770719dd0e256dc0634fb9a3b374b085080f76dbaf9b96326dcf2e070d3701 -size 25968 diff --git 
a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en-without-passage|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en-without-passage|0_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..33b5e59c5 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en-without-passage|0_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca54ee0507b761db283874619584d9eefde9412cd38f1e158aa2557c2c69e95f +size 25907 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en|0_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 2c88d4075..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en|0_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b1bf41a41845a4d41b8a5ba28c0117746689fa96143489fe798651bf2af98e5f -size 72560 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en|0_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..695396792 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|agieval:sat-en|0_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03a313bf91b1642fc24bb23ef034a851a17d33610bfb3f83de4cc1c33d5d23dd +size 72493 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|arc:challenge|25_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|arc:challenge|25_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..4ccd4261f --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|arc:challenge|25_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbb426e5d8f5b54a1d8527a9b6bc7b62e4d4fad5d6b75af1a3af47de816229dd +size 87676 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:causal_judgment|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:causal_judgment|3_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 712c604c9..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:causal_judgment|3_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:afb32f7ffe8f53a1b892123e8c8f0325830c1703154b1e8ba07786aa32fcf163 -size 46253 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:date_understanding|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:date_understanding|3_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index e9904becd..000000000 --- 
a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:date_understanding|3_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d741c8c198a8ad188da86f6ee5c8795abb1c89665580cec627216b4204e18a17 -size 28804 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:disambiguation_qa|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:disambiguation_qa|3_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index e6d0732ca..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:disambiguation_qa|3_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:209b8b1be20f217a687c9a2ea50e15176bd8df3a62d8e24f20afa371cdaac2da -size 29675 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:geometric_shapes|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:geometric_shapes|3_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 2b4666c55..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:geometric_shapes|3_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:64228e6c0460d5dbf75dbff6a210db107611314f84df9105f91a17340703386c -size 31219 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:logical_deduction_five_objects|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:logical_deduction_five_objects|3_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 3f5964fac..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:logical_deduction_five_objects|3_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:417d41730a5dd77c1729df05d1888e6d91f29d641c802bc45bd94c7cccf7581d -size 33393 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:logical_deduction_seven_objects|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:logical_deduction_seven_objects|3_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 38984c530..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:logical_deduction_seven_objects|3_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b486108ab93f2b274b80cb45ce87da4e09bcab49b02c82f94838246cb1243cb6 -size 36893 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:movie_recommendation|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:movie_recommendation|3_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 868565ed9..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:movie_recommendation|3_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 
@@ -version https://git-lfs.github.com/spec/v1 -oid sha256:511eda270bab7771b2697adaaa95aa5eb1a41da1926b51a73272a1104b3025bb -size 28017 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:navigate|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:navigate|3_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 2158582ff..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:navigate|3_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f7f72df2e5a180fdda15ee2d4a2f23e63d6b5695d4a086fbe7baf55fa5854a74 -size 27629 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:ruin_names|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:ruin_names|3_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 7813c3884..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:ruin_names|3_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:789f8818d20a28f3ae6854a1b472ef6020875b99e217b067f71133ede511599b -size 26814 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:salient_translation_error_detection|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:salient_translation_error_detection|3_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 6760674a8..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:salient_translation_error_detection|3_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eba32e4dc54bdc313dd6c5cc9b24250418d9186cebca96e845d2b801750ec84a -size 48058 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:snarks|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:snarks|3_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 596aa76e3..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:snarks|3_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f4ae6c4b877baa4a127d1e540c3522fe7d016d15e5827be9db5eb1ade50d2a4a -size 27979 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:temporal_sequences|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:temporal_sequences|3_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 71a4ca996..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:temporal_sequences|3_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4ed5bda45b8bdb868e42361827501fb108304512e5b7a853d8fa3e314162e620 -size 33161 diff --git 
a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:tracking_shuffled_objects_five_objects|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:tracking_shuffled_objects_five_objects|3_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index fe0896288..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:tracking_shuffled_objects_five_objects|3_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c65cf6bf80bd1d20420ca0925f120317ddaee59a5f283f1c544acb6b9bcf550f -size 33631 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:tracking_shuffled_objects_seven_objects|3_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:tracking_shuffled_objects_seven_objects|3_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 74a321d63..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench:tracking_shuffled_objects_seven_objects|3_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d34487632eb79e9c5a59aa354434b681218e6406b3eb885caf81a735936fae2 -size 36162 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:causal_judgment|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:causal_judgment|3_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..a27f12606 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:causal_judgment|3_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:408adb2cc6ebfd6227c29ae7b36ebaec628d133b7a55fcd62996da1a81b683be +size 47608 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:date_understanding|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:date_understanding|3_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..5e7551aa2 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:date_understanding|3_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26902afcf065eb91840fbfbe50bef53284141d0c1772c5dce0bb45acfac7dfbf +size 30056 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:disambiguation_qa|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:disambiguation_qa|3_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..606551571 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:disambiguation_qa|3_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76dfb895bd369d3092b3faf32e52e070a7ac2797e918e6d78f10fe6521fcec73 +size 30982 diff --git 
a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:geometric_shapes|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:geometric_shapes|3_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..7719095bb --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:geometric_shapes|3_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:601162ba27b672f1763513b2360846104e673bae46937e1990b0b146187c9e74 +size 32514 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:logical_deduction_five_objects|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:logical_deduction_five_objects|3_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..29cefcae6 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:logical_deduction_five_objects|3_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d97f9b7b0d06000abc67c45eed722c63237057358c603d67bfb9ce7855bffad9 +size 34703 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:logical_deduction_seven_objects|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:logical_deduction_seven_objects|3_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..730f0f472 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:logical_deduction_seven_objects|3_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e5acdff1e58361591fde26d1b3fd422b0be9adad4dfbee98dc211f75cfbb568 +size 38228 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:movie_recommendation|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:movie_recommendation|3_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..46404f494 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:movie_recommendation|3_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b79a11c9981b37b71e306fb7a0e049c1845adc6752f4394f6e7406db27a9c16 +size 29272 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:navigate|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:navigate|3_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..288d2c0e6 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:navigate|3_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a150f48c0928da6642309188a0ab5a89a9bed5eb66c9a9f7b3897f02af239809 +size 28884 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:ruin_names|3_2025-10-17T14-08-59.659871.parquet 
b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:ruin_names|3_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..83d132e37 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:ruin_names|3_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa768f89fd06423d3dad3bf7fd229442eb0d813e8f4c1be94b62a4ee91ce1c0e +size 28021 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:salient_translation_error_detection|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:salient_translation_error_detection|3_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..d01582b4e --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:salient_translation_error_detection|3_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e6ee64f0595ef3db00de7c43d9e4411d8fe32ae4c1c5b576b713a09448b5038 +size 49390 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:snarks|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:snarks|3_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..84f17cfae --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:snarks|3_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc32011b7f35b96edb89efc0dfa2f2aa56de5b19566ec424427193f72d80424b +size 29202 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:temporal_sequences|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:temporal_sequences|3_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..6376f53fd --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:temporal_sequences|3_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1f0fecbea584b4617f5c14e577acc2c516ce86a8e45e493be0e47f76c99a3d5 +size 34443 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..ce267d004 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ab2cd33ce068a2f6ec0a3613eb0b26790596e8be0da0491d31e0d0f293f35eb +size 34896 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-10-17T14-08-59.659871.parquet 
b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..085a59a9e --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e511aedc2c86800f5456a315c8ead57a216a0abab650f58e1282b3f9e96a60c7 +size 37440 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|gsm8k_test|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|gsm8k_test|0_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..5545aa11c --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|gsm8k_test|0_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:215753259adbd35ec5cf0fd30471064017e7f160a49f4b1542d22ccedbbb6f19 +size 35747 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|hellaswag|10_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|hellaswag|10_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..def3e823f --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|hellaswag|10_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:686cde8c82ccaea58035dcc0fd5729b67343af90c02cdac4768c260d13cd6ce0 +size 67303 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|mmlu:college_chemistry|5_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|mmlu:college_chemistry|5_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..3ac277a83 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|mmlu:college_chemistry|5_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:370499a7dbda06de110b28dd4803880a62b63d9f31480463848277a8784250aa +size 37734 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|mmlu:us_foreign_policy|5_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|mmlu:us_foreign_policy|5_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..11ae5fd8f --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|mmlu:us_foreign_policy|5_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d42481a014c8beeeeb5009809418815652437330b2828a6b3b1f3696c269949 +size 38503 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|truthfulqa:mc|0_2025-10-17T14-08-59.659871.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|truthfulqa:mc|0_2025-10-17T14-08-59.659871.parquet new file mode 100644 index 000000000..0cfc28382 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_lighteval|truthfulqa:mc|0_2025-10-17T14-08-59.659871.parquet @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:d4e18a142f8af00c5681c49d0a7b4e0580f1c7096c1b72855ddff29e141620e3 +size 26087 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_test|gsm8k|0_2025-09-19T14-21-59.670987.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_test|gsm8k|0_2025-09-19T14-21-59.670987.parquet deleted file mode 100644 index 160b3defc..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_test|gsm8k|0_2025-09-19T14-21-59.670987.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7e281554c86326b1f2e05f8c27ef7d58048a2b751a2ceed6c4c79d50ecbbdcab -size 34833 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|arc:challenge|25_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|arc:challenge|25_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index da0f11a41..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|arc:challenge|25_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c7fe08af0c72407c1997534ac38db74cf716d2a4f6e9fcc9a7e138b8b55b1480 -size 144374 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|hellaswag|10_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|hellaswag|10_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index e1a9adf2c..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|hellaswag|10_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:be5cb187977d6f8a6acdf7712477da51c7cd66e353671f86c5cf8f48ce1b9d61 -size 137038 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|mmlu:college_chemistry|5_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|mmlu:college_chemistry|5_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index eab885a8d..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|mmlu:college_chemistry|5_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6ca8136266ee39de5ed61bfcffdb048d0f71b9428a2c3b78de70e9a5f189a818 -size 53139 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|mmlu:us_foreign_policy|5_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|mmlu:us_foreign_policy|5_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 4be39bbc6..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|mmlu:us_foreign_policy|5_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8a11b96fcc1f22ac5349a9acccb6f45203e01071afc50811a1646388a8d06199 -size 54501 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|truthfulqa:mc|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|truthfulqa:mc|0_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 638aab548..000000000 --- 
a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_leaderboard|truthfulqa:mc|0_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b84277d5f3a97613f4e9f491281c64f2f224d017b99beeb7820ed948cf36d019 -size 31570 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:aqua-rat|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:aqua-rat|0_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 18d340905..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:aqua-rat|0_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:32e3aa399ece1fec63937b28f7058a0f92c2274ecbba0f404c6f6d2118faadfb -size 26577 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:aqua-rat|0_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:aqua-rat|0_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..d690a4f14 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:aqua-rat|0_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55af4b3a8f20480b118b8697b95b766da6d87db04395141a4ffe750b0adf0e20 +size 26534 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:logiqa-en|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:logiqa-en|0_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index fb6a53e32..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:logiqa-en|0_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d62633ded1b67ed70f538c27f8f8756386d4b707bf7f878a2458d087fe8f3360 -size 45781 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:logiqa-en|0_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:logiqa-en|0_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..67146b758 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:logiqa-en|0_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09856647c8e52b0162bead55c03ec464bd36b4c297a8167bd0a2384ca51cc55a +size 45739 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-ar|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-ar|0_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 1ebc2067e..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-ar|0_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:757b28842addb90c8278938fec7524f87a1b2b635f5a488b49a22197a9d9d885 -size 50807 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-ar|0_2025-10-17T14-03-07.927732.parquet 
b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-ar|0_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..7e438e70f --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-ar|0_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d0988269e97ebec6615ac36e7e72c6a46d513e49dc9d8683a74659acd2dd872 +size 50771 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-lr|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-lr|0_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index ad35380db..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-lr|0_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7f2edfe9a5f7501615b442e7026c6d5f16b0e7e03caf00f4a41846acf3e0ed3e -size 55855 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-lr|0_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-lr|0_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..43c45d6f3 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-lr|0_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4485ca9806ba31f83fe8e4a411ef9ac14dcf2af7c4b440361c4fed5d3b4c2eb5 +size 55826 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-rc|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-rc|0_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 1b9b46481..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-rc|0_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:561fcf29d4ad4ff8d0f333e888b0cef84c133db009be34b989576d0bb3c78a44 -size 148865 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-rc|0_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-rc|0_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..484870f64 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:lsat-rc|0_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acb6163bd8503121ed2962c1080445976ef5e0fe7820a7c66354cd5984834273 +size 148838 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en-without-passage|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en-without-passage|0_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 958038ad0..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en-without-passage|0_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2d811dc576579af492de475703ddaa40d6bb0db3506facd2679f10de50f608db -size 32795 diff --git 
a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en-without-passage|0_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en-without-passage|0_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..19f2d87a2 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en-without-passage|0_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82ec622a7c7699f78e92bcadf6d3121ad114dca0959131b879b5489936ea6da0 +size 32753 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en|0_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 0b680f7af..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en|0_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d4729a89ab8729d83549ec34ec316b68bcf05fab4111bf8530ab2f7f6f16bc56 -size 110056 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en|0_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en|0_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..f38c24d46 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|agieval:sat-en|0_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9c225a4b4295ddbcde3df6405c89751025ee910a6a5c55633a51cbb9485ed17 +size 109983 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|arc:challenge|25_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|arc:challenge|25_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..b978eba02 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|arc:challenge|25_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78d7529cb2b80be6022a5b41fa46d12f48a4556ae322c46afe1bb4a393eb7a98 +size 144845 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:causal_judgment|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:causal_judgment|3_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index c5cf55616..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:causal_judgment|3_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d920d6b1d9757af95d515a8435972a667375e13020a1709ab27a203484d04704 -size 70718 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:date_understanding|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:date_understanding|3_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index d4666b2fe..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:date_understanding|3_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 
@@ -version https://git-lfs.github.com/spec/v1 -oid sha256:38e56b21e15ca43fad2f286b8b75e7d2b3db729004c4cb825d8609118f194af3 -size 38152 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:disambiguation_qa|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:disambiguation_qa|3_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 2e8b80d83..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:disambiguation_qa|3_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:474a092eb73f0734f2a31b13fee8cd3edcc649c96ed13e054961be22e16efbe5 -size 36972 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:geometric_shapes|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:geometric_shapes|3_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 83ff6841a..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:geometric_shapes|3_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b3d8aea15719f8c31847fe5e415cfcad8f4bb24a9f5a7309b9eb5e74e95a513d -size 48287 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:logical_deduction_five_objects|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:logical_deduction_five_objects|3_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 17ad7da3b..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:logical_deduction_five_objects|3_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4f5f4943c293cb2472f74030dbfd220eabd0c12d612fa20a0f905ef0a0a6846c -size 46228 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:logical_deduction_seven_objects|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:logical_deduction_seven_objects|3_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 9eb4ad34f..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:logical_deduction_seven_objects|3_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c05a9d6d976d4529483fcac90163705fabca22ccdba0b3ee33ad1df44b8c234 -size 54843 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:movie_recommendation|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:movie_recommendation|3_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 9e8068912..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:movie_recommendation|3_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d49cf61fba119a019d8047f64206ce860cb41d70c7a4b85a20e92fdb76b9c65a -size 35234 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:navigate|3_2025-09-19T14-18-26.717757.parquet 
b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:navigate|3_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 2aca5e3bd..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:navigate|3_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:25cfadaf467f2850cee53b89ca1c05b8491f3f9d54612e96d113c9b9e0ca5fae -size 33264 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:ruin_names|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:ruin_names|3_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 761b290f1..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:ruin_names|3_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:218ef4b465e8f164df7cce40c9ea367596165dfa1f392f56ba2029a36430556d -size 33280 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:salient_translation_error_detection|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:salient_translation_error_detection|3_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 506566766..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:salient_translation_error_detection|3_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f2066ffecda60170f7d6e65384899fea4d3232011e5803e5f0d72b8159f8dd2e -size 67823 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:snarks|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:snarks|3_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 3bf51107e..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:snarks|3_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1e8e1e9cefafc6872cee5ab021f5b418d2738b555b1ac7d0caaaa7ddbe1c84df -size 36628 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:temporal_sequences|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:temporal_sequences|3_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 69e6f60bb..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:temporal_sequences|3_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4e7f092f6994c6e18349bdb3c489c059eee371c90f1a6d250495d9f7255db75e -size 49007 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:tracking_shuffled_objects_five_objects|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:tracking_shuffled_objects_five_objects|3_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 0e86bb133..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:tracking_shuffled_objects_five_objects|3_2025-09-19T14-18-26.717757.parquet +++ 
/dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:49b6cab428aa555786fb5d74d6d91699f9246d8a0c7ff2d7dee4bb9621f5b9b2 -size 51220 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:tracking_shuffled_objects_seven_objects|3_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:tracking_shuffled_objects_seven_objects|3_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index 915319abc..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench:tracking_shuffled_objects_seven_objects|3_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0442fff2fb12229444bfeb0fa4ccc8a9d73455b5494aed31b6c4b91950cdadf7 -size 58577 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:causal_judgment|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:causal_judgment|3_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..b5174abd1 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:causal_judgment|3_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8e4e4047e4b3bef68e96d106b404d5da844c254c4021c155159cfd00aebc036 +size 72102 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:date_understanding|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:date_understanding|3_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..968be4faa --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:date_understanding|3_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:732966b04a49242e642d06de47b15ca4a7fce1b52bf103baed843c29cc878d4e +size 39473 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:disambiguation_qa|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:disambiguation_qa|3_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..9d8554d2d --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:disambiguation_qa|3_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7ce68d0631ee4707f57bd0848e86a544c70bc2268c08fbe24275cb47921d11f +size 38313 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:geometric_shapes|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:geometric_shapes|3_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..21e80f4c7 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:geometric_shapes|3_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:549b13758170f710449b845b8c0bd3bc2a9eb8fab9c4a91751fb38830082ef8b +size 49621 diff --git 
a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:logical_deduction_five_objects|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:logical_deduction_five_objects|3_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..f051f91a5 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:logical_deduction_five_objects|3_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d33f3c199c57fd2ee607174043b2087ee26da4f27ef68cad8e81c133d85f5dad +size 47607 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:logical_deduction_seven_objects|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:logical_deduction_seven_objects|3_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..4c8814b87 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:logical_deduction_seven_objects|3_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8af05943bd9fa01e2fe4f1fb082d0919c266c8ec478c8259577d0def03f45103 +size 56216 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:movie_recommendation|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:movie_recommendation|3_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..3c0ea7eaf --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:movie_recommendation|3_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8fbcafe67f79eaf7433c7a87c2bba773340ab6aa7872400ea993da1dff9e531 +size 36552 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:navigate|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:navigate|3_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..abfe874a3 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:navigate|3_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5be905adee2d8cee7e8d66441456c225c901e67746679ac80c6bc7f3763ff167 +size 34588 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:ruin_names|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:ruin_names|3_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..aa6142ea9 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:ruin_names|3_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a332d10b713b0995a11b8da1c8a17644261fccc79a0a19de343580e276842713 +size 34561 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:salient_translation_error_detection|3_2025-10-17T14-03-07.927732.parquet 
b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:salient_translation_error_detection|3_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..0974ffb0c --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:salient_translation_error_detection|3_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a4a9eb97df91f315104a89a71c3e3221ffcd97cd839b15bf7bcc060eaf25e8d +size 69190 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:snarks|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:snarks|3_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..cdcc1db3d --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:snarks|3_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f41fc4e7250aa9fb05b4122ef062f27403c99d8e4960c3fde4072aede655563d +size 37908 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:temporal_sequences|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:temporal_sequences|3_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..ffbb3e29d --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:temporal_sequences|3_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2c3985c35dce91ec1aba39f67ab5252af0663cd3f9664326498cbbfd753864a +size 50327 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..1f1896d8e --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9660e3a3836b705e44922972cf7fe8cc1fd44bd16822892cc6706b3aa07590d +size 52546 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..ead4ecd0b --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72287e782eefe8ef33278ce582a6d163e47e6b839dcb2bd4b031c58ff8d0b154 +size 59891 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|gsm8k_test|0_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|gsm8k_test|0_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..10071540f --- /dev/null +++ 
b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|gsm8k_test|0_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b97852148b8779de9185c1dfe506d104d98d1a5f06369614c188a023d5ab6b5e +size 39107 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|hellaswag|10_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|hellaswag|10_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..127d5518e --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|hellaswag|10_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2aecc616ade5f82ca78d39b65743eb5890c671d83db6c274972d507a8fc997a4 +size 88652 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|mmlu:college_chemistry|5_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|mmlu:college_chemistry|5_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..1dbd0c716 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|mmlu:college_chemistry|5_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d663a38bb9208a98b2839093275ed9a1b0e8312d1308e0eace94a616191b79b1 +size 51027 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|mmlu:us_foreign_policy|5_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|mmlu:us_foreign_policy|5_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..94fa4337c --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|mmlu:us_foreign_policy|5_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:897d4d7063e681f928f709bff3ec8b2ace2566fd70faf812fe74e6cd65582785 +size 52560 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|truthfulqa:mc|0_2025-10-17T14-03-07.927732.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|truthfulqa:mc|0_2025-10-17T14-03-07.927732.parquet new file mode 100644 index 000000000..48e6d2807 --- /dev/null +++ b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_lighteval|truthfulqa:mc|0_2025-10-17T14-03-07.927732.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b879e37019beb40032695a9e0a63d9d60ce571d601eca8f356cec2165c1962a +size 32420 diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_test|gsm8k|0_2025-09-19T14-18-26.717757.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_test|gsm8k|0_2025-09-19T14-18-26.717757.parquet deleted file mode 100644 index a95529696..000000000 --- a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_test|gsm8k|0_2025-09-19T14-18-26.717757.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1122709febbfe4d9b3aefc6914eb43a4571611c67b37a2be79cc91d7b936150c -size 38168 diff --git a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json index e3cf75ccc..f35ac4d17 100644 --- a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json +++ 
b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6246068f1967408620b2f128c4b1e994d4afa3165f5ea2f59529073869dde29b
-size 51794
+oid sha256:063f2cbdc1f8f85147534dd590a5139b1f815e580771b353ee76c5b7672ff545
+size 46217
diff --git a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json
index fd40b5b92..26e304bcb 100644
--- a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json
+++ b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d31bb1623784ef37efd4f90f39d6e662bdb139f6ac53a00d731c98a8b546de1f
-size 51893
+oid sha256:16a8bec22d5ebaf5064c6c9a6ca03e6009d36df6598a0fe3470c84f3914340df
+size 46345
diff --git a/tests/slow_tests/sample_comparison.py b/tests/slow_tests/sample_comparison.py
index 02237a1c1..d525bc948 100644
--- a/tests/slow_tests/sample_comparison.py
+++ b/tests/slow_tests/sample_comparison.py
@@ -37,29 +37,6 @@ def _to_plain_list(value):
     return new_value
 
 
-def _logprobs_approximately_equal(current_logprobs, reference_logprobs):
-    """Check if logprobs are sorted in the same order.
-    for example:
-    current_logprobs = [1.1, 2.1, 3.1]
-    reference_logprobs = [1.0, 2.0, 3.0]
-    should return True
-    """
-    if current_logprobs is None and reference_logprobs is None:
-        return True
-    if current_logprobs is None or reference_logprobs is None:
-        return False
-
-    current_logprobs = _to_plain_list(current_logprobs)
-    reference_logprobs = _to_plain_list(reference_logprobs)
-
-    # Check if both lists have the same ordering
-    # Convert to relative ordering: 0 for smallest, 1 for second smallest, etc.
-    current_indices = sorted(range(len(current_logprobs)), key=lambda i: current_logprobs[i])
-    reference_indices = sorted(range(len(reference_logprobs)), key=lambda i: reference_logprobs[i])
-
-    return current_indices == reference_indices
-
-
 def load_sample_details(details_dir: str):
     """Load sample-level details from parquet files in the details directory."""
     details = {}
@@ -115,12 +92,15 @@ def _compare_metrics(current, reference):
     reference_metrics = reference["metric"]
 
     metric_diffs = {}
-    for metric_name in set(current_metrics.keys()) | set(reference_metrics.keys()):
-        current_val = current_metrics.get(metric_name)
-        reference_val = reference_metrics.get(metric_name)
+    for metric_name in set(current_metrics.keys()) & set(reference_metrics.keys()):
+        try:
+            current_val = current_metrics.get(metric_name)
+            reference_val = reference_metrics.get(metric_name)
 
-        if not math.isclose(current_val, reference_val, abs_tol=0.05):
-            metric_diffs[metric_name] = {"current": current_val, "reference": reference_val}
+            if not math.isclose(current_val, reference_val, abs_tol=0.05):
+                metric_diffs[metric_name] = {"current": current_val, "reference": reference_val}
+        except Exception:
+            breakpoint()
 
     if metric_diffs:
         sample_diff["metric_differences"] = metric_diffs
@@ -175,6 +155,7 @@
 
     for task_name in current_details:
         if task_name not in reference_details:
+            breakpoint()
             differences[task_name] = [{"error": "Task not found in reference results"}]
             continue
 
diff --git a/tests/unit/metrics/test_metric_requests.py b/tests/unit/metrics/test_metric_requests.py
index e7f9ee473..7c383c737 100644
--- a/tests/unit/metrics/test_metric_requests.py
+++ b/tests/unit/metrics/test_metric_requests.py
@@ -25,9 +25,9 @@
 from lighteval.metrics.normalizations import LogProbPMINorm
 from lighteval.metrics.utils.metric_utils import Metric
 from lighteval.models.model_output import ModelResponse
-from lighteval.tasks.default_tasks import xstory_cloze_en_lighteval
 from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig
 from lighteval.tasks.requests import Doc
+from lighteval.tasks.tasks.xstory_cloze import xstory_cloze_en
 
 from tests.utils import FakeModel, fake_evaluate_task
 
@@ -48,9 +48,9 @@ def get_pmi_task(metrics: list[Metric]):
         metrics=metrics,
         suite=["test"],
         prompt_function=dummy_prompt_fc,
-        hf_repo=xstory_cloze_en_lighteval.hf_repo,
-        hf_subset=xstory_cloze_en_lighteval.hf_subset,
-        evaluation_splits=xstory_cloze_en_lighteval.evaluation_splits,
+        hf_repo=xstory_cloze_en.hf_repo,
+        hf_subset=xstory_cloze_en.hf_subset,
+        evaluation_splits=xstory_cloze_en.evaluation_splits,
     )
     # This is manually edited when updating the config and in the post init function
     # - we need to get a more homogeneous system for naming...
diff --git a/tests/unit/pipeline/test_reasoning_tags.py b/tests/unit/pipeline/test_reasoning_tags.py
index f772970c4..00fa00d78 100644
--- a/tests/unit/pipeline/test_reasoning_tags.py
+++ b/tests/unit/pipeline/test_reasoning_tags.py
@@ -22,9 +22,7 @@
 
 import tempfile
 import unittest
-from pathlib import Path
-from types import ModuleType
-from typing import Optional, Union
+from typing import Optional
 from unittest.mock import patch
 
 from lighteval.logging.evaluation_tracker import EvaluationTracker
@@ -96,7 +94,7 @@ def download_dataset_worker(task) -> None:
 
 class FakeRegistry(Registry):
     def __init__(
-        self, tasks: Optional[str] = None, custom_tasks: Optional[Union[str, Path, ModuleType]] = None
+        self, tasks: Optional[str] = None, load_multilingual: bool = False, custom_tasks: Optional[str] = None
     ):
         self.tasks_list = [input_task_name]
         # suite_name, task_name, few_shot = input_task_name.split("|")
diff --git a/tests/unit/tasks/test_registry.py b/tests/unit/tasks/test_registry.py
index 377ea7d6c..bbbd32dc8 100644
--- a/tests/unit/tasks/test_registry.py
+++ b/tests/unit/tasks/test_registry.py
@@ -26,51 +26,6 @@
 from lighteval.tasks.registry import Registry
 
 
-TASKS_TABLE = [
-    LightevalTaskConfig(
-        name="test_task_revision",
-        # Won't be called, so it can be anything
-        prompt_function=lambda x: x,  # type: ignore
-        hf_repo="test",
-        hf_subset="default",
-        evaluation_splits=["train"],
-        metrics=[],
-    )
-]
-
-TASKS_GROUPS = {
-    "zero_and_one": "custom|test_task_revision|0,custom|test_task_revision|1",
-    "all_mmlu": "original|mmlu|3",
-}
-
-
-def test_custom_task_groups():
-    """
-    Tests that task info selector correctly handles custom task groups.
-    """
-    registry = Registry(tasks="zero_and_one", custom_tasks="tests.unit.tasks.test_registry")
-
-    assert set(registry.tasks_list) == {"custom|test_task_revision|0", "custom|test_task_revision|1"}
-
-    assert set(registry.task_to_configs.keys()) == {"custom|test_task_revision"}
-
-    task_info: list[LightevalTaskConfig] = registry.task_to_configs["custom|test_task_revision"]
-    assert {task_info[0].num_fewshots, task_info[1].num_fewshots} == {0, 1}
-
-
-def test_custom_tasks():
-    """
-    Tests that task info selector correctly handles custom tasks.
-    """
-    registry = Registry(tasks="custom|test_task_revision|0", custom_tasks="tests.unit.tasks.test_registry")
-
-    assert registry.tasks_list == ["custom|test_task_revision|0"]
-    assert set(registry.task_to_configs.keys()) == {"custom|test_task_revision"}
-
-    task_info: list[LightevalTaskConfig] = registry.task_to_configs["custom|test_task_revision"]
-    assert task_info[0].num_fewshots == 0
-
-
 def test_superset_expansion():
     """
     Tests that task info selector correctly handles supersets.
@@ -92,13 +47,13 @@ def test_superset_with_subset_task():
     """
    Tests that task info selector correctly handles if both superset and one of subset tasks are provided.
     """
-    registry = Registry(tasks="original|mmlu|3,original|mmlu:abstract_algebra|5")
+    registry = Registry(tasks="lighteval|mmlu|3,lighteval|mmlu:abstract_algebra|5")
 
     # We have all mmlu tasks
-    assert set(registry.tasks_list) == {"original|mmlu|3", "original|mmlu:abstract_algebra|5"}
+    assert set(registry.tasks_list) == {"lighteval|mmlu|3", "lighteval|mmlu:abstract_algebra|5"}
     assert len(registry.task_to_configs.keys()) == 57
 
-    task_info: list[LightevalTaskConfig] = registry.task_to_configs["original|mmlu:abstract_algebra"]
+    task_info: list[LightevalTaskConfig] = registry.task_to_configs["lighteval|mmlu:abstract_algebra"]
     assert {task_info[0].num_fewshots, task_info[1].num_fewshots} == {3, 5}
 
 
@@ -133,7 +88,7 @@ def test_task_group_expansion_with_subset_expansion():
     """
     Tests that task info selector correctly handles a group with task superset is provided.
     """
-    registry = Registry(tasks="all_mmlu", custom_tasks="tests.unit.tasks.test_registry")
+    registry = Registry(tasks="lighteval|mmlu|0")
 
     # We have all mmlu tasks
     assert len(registry.task_to_configs.keys()) == 57
@@ -151,11 +106,9 @@ def test_task_duplicates():
     """
     Tests that task info selector correctly handles if duplicate tasks are provided.
     """
-    registry = Registry(
-        tasks="custom|test_task_revision|0,custom|test_task_revision|0", custom_tasks="tests.unit.tasks.test_registry"
-    )
+    registry = Registry(tasks="lighteval|storycloze:2016|0,lighteval|storycloze:2016|0")
 
-    assert list(registry.tasks_list) == ["custom|test_task_revision|0"]
+    assert list(registry.tasks_list) == ["lighteval|storycloze:2016|0"]
 
 
 def test_task_creation():
diff --git a/tests/utils.py b/tests/utils.py
index 3b68dd631..b7ba2a042 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -20,9 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from pathlib import Path
-from types import ModuleType
-from typing import Optional, Union
+from typing import Optional
 from unittest.mock import patch
 
 from transformers import AutoTokenizer
@@ -108,7 +106,7 @@ def fake_evaluate_task(
 
     # Create a mock Registry class
     class FakeRegistry(Registry):
-        def __init__(self, tasks: Optional[str], custom_tasks: Optional[Union[str, Path, ModuleType]] = None):
+        def __init__(self, tasks: Optional[str], load_multilingual: bool = False, custom_tasks: Optional[str] = None):
             self.tasks_list = [task_name_fs]
             self.task_to_configs = {task_name_fs: [lighteval_task.config]}