techwolf-ai · Mattdl · Feb 26, 2026 · Jan 15, 2026 · Jan 15, 2026 · Jan 15, 2026
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -181,10 +181,10 @@ class MyCustomRankingTask(RankingTask):
         """Override default metrics if needed"""
         return ["map", "mrr", "recall@5", "recall@10"]
 
-    def load_monolingual_data(self, split: DatasetSplit, language: Language) -> RankingDataset:
+    def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset:
         """
-        Load dataset for a specific language and split.
-        
+        Load the dataset identified by `dataset_id` for the given split.
+
         Returns:
             RankingDataset with query_texts, target_indices, and target_space
         """
@@ -196,12 +196,12 @@ class MyCustomRankingTask(RankingTask):
             [0, 2],  # Software Engineer -> Python, SQL
             [0, 1],  # Data Scientist -> Python, Machine Learning
         ]
-        
+
         return RankingDataset(
             query_texts=query_texts,
             target_indices=target_indices,
             target_space=target_space,
-            language=language,
+            dataset_id=dataset_id,
         )
 ```
 

diff --git a/README.md b/README.md
@@ -112,7 +112,7 @@ Feel free to make a PR to add your models & tasks to the official package! See [
 
 ### Checkpointing & Resuming
 
-WorkRB automatically saves result checkpoints after each task completion in a specific language.
+WorkRB automatically saves result checkpoints after each dataset evaluation within a task.
 
 **Automatic Resuming** - Simply rerun with the same `output_folder`:
 

diff --git a/examples/custom_model_example.py b/examples/custom_model_example.py
@@ -9,6 +9,7 @@
 import torch
 from sentence_transformers import SentenceTransformer
 
+import workrb
 from workrb.models.base import ModelInterface
 from workrb.registry import register_model
 from workrb.types import ModelInputType
@@ -47,10 +48,12 @@ def __init__(
         self.encoder.to(device)
         self.encoder.eval()
 
+    @property
     def name(self) -> str:
         """Return the unique name of this model."""
         return f"MyCustomModel-{self.base_model_name.split('/')[-1]}"
 
+    @property
     def description(self) -> str:
         """Return the description of this model."""
         return "A custom model that demonstrates WorkRB extensibility"

diff --git a/examples/custom_task_example.py b/examples/custom_task_example.py
@@ -6,6 +6,7 @@
 and implement the required abstract methods.
 """
 
+import workrb
 from workrb.registry import register_task
 from workrb.tasks.abstract.base import DatasetSplit, LabelType, Language
 from workrb.tasks.abstract.ranking_base import RankingDataset, RankingTaskGroup
@@ -78,14 +79,14 @@ def supported_target_languages(self) -> list[Language]:
         """Supported target languages are English."""
         return [Language.EN]
 
-    def load_monolingual_data(self, language: Language, split: DatasetSplit) -> RankingDataset:
+    def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset:
         """
         Load data for evaluation.
 
         This method must return a RankingDataset.
 
         Args:
-            language: Language code (e.g., "en", "de", "fr")
+            dataset_id: Dataset identifier (e.g., "en", "de", "fr" for language-based tasks)
             split: Data split ("test", "validation", "train")
 
         Returns
@@ -121,7 +122,7 @@ def load_monolingual_data(self, language: Language, split: DatasetSplit) -> Rank
             query_texts=queries,
             target_indices=labels,
             target_space=targets,
-            language=language,
+            dataset_id=dataset_id,
         )
 
     # Note: The evaluate() method is inherited from RankingTask and doesn't need

diff --git a/examples/run_benchmark_flat_average.py b/examples/run_benchmark_flat_average.py
@@ -0,0 +1,81 @@
+"""
+Run the benchmark with flat averaging on a selected set of languages.
+
+Aggregation mode: SKIP_LANGUAGE_AGGREGATION
+    All datasets contribute equally to the per-task score as a flat
+    average, with no language-based grouping or filtering. This means
+    cross-lingual and multilingual datasets are included alongside
+    monolingual ones. The final results do not include per-language
+    averages, since no language grouping criterion is defined and
+    there is no unambiguous way to assign cross-lingual or
+    multilingual datasets to a single language bucket.
+
+Task-level language filtering:
+    The `langs` list restricts which datasets each task loads during
+    initialization. Only languages in this list are considered.
+
+Execution mode: ALL
+    Explicitly set here, but has no practical effect under
+    SKIP_LANGUAGE_AGGREGATION since no datasets are ever filtered
+    out by the aggregation mode.
+"""
+
+import workrb
+from workrb.types import ExecutionMode, Language, LanguageAggregationMode
+
+if __name__ == "__main__":
+    # Models
+    models = [
+        # Lexical baselines
+        workrb.models.RandomRankingModel(),
+        workrb.models.BM25Model(lowercase=True),
+        # DL model
+        workrb.models.JobBERTModel(),
+    ]
+
+    # Languages (as strings via .value)
+    langs = [
+        Language.DA.value,
+        Language.DE.value,
+        Language.EN.value,
+        Language.ES.value,
+        Language.FR.value,
+        Language.HU.value,
+        Language.IT.value,
+        Language.LT.value,
+        Language.NL.value,
+        Language.PL.value,
+        Language.PT.value,
+        Language.SL.value,
+        Language.SV.value,
+    ]
+    split = "test"
+
+    # Tasks
+    tasks = [
+        # Tasks with monolingual datasets
+        workrb.tasks.ESCOJob2SkillRanking(split=split, languages=langs),
+        workrb.tasks.ESCOSkill2JobRanking(split=split, languages=langs),
+        # Tasks with monolingual, cross-lingual, and multilingual datasets
+        workrb.tasks.ProjectCandidateRanking(split=split, languages=langs),
+        workrb.tasks.SearchQueryCandidateRanking(split=split, languages=langs),
+        # TODO: add MELO and MELS tasks when PR #37 is merged
+    ]
+
+    # Evaluate
+    # NOTE: execution_mode=ALL has no effect when using SKIP_LANGUAGE_AGGREGATION,
+    # because no datasets are ever filtered out regardless of execution mode.
+    all_results = workrb.evaluate_multiple_models(
+        models=models,
+        tasks=tasks,
+        output_folder_template="../results/flat_average/{model_name}",
+        description="Flat average benchmark",
+        force_restart=True,
+        language_aggregation_mode=LanguageAggregationMode.SKIP_LANGUAGE_AGGREGATION,
+        execution_mode=ExecutionMode.ALL,
+    )
+
+    # Display results
+    for model_name, results in all_results.items():
+        print(f"\nResults for {model_name}:")
+        print(results)
diff --git a/examples/run_benchmark_flat_average_all_langs.py b/examples/run_benchmark_flat_average_all_langs.py
@@ -0,0 +1,67 @@
+"""
+Run the benchmark with flat averaging on all available languages.
+
+Aggregation mode: SKIP_LANGUAGE_AGGREGATION
+    All datasets contribute equally to the per-task score as a flat
+    average, with no language-based grouping or filtering. This means
+    cross-lingual and multilingual datasets are included alongside
+    monolingual ones. The final results do not include per-language
+    averages, since no language grouping criterion is defined and
+    there is no unambiguous way to assign cross-lingual or
+    multilingual datasets to a single language bucket.
+
+Task-level language filtering: None
+    Setting langs=None means each task loads all languages it supports,
+    with no filtering at the task level.
+
+Execution mode: ALL
+    Explicitly set here, but has no practical effect under
+    SKIP_LANGUAGE_AGGREGATION since no datasets are ever filtered
+    out by the aggregation mode.
+"""
+
+import workrb
+from workrb.types import ExecutionMode, LanguageAggregationMode
+
+if __name__ == "__main__":
+    # Models
+    models = [
+        # Lexical baselines
+        workrb.models.RandomRankingModel(),
+        workrb.models.BM25Model(lowercase=True),
+        # DL model
+        workrb.models.JobBERTModel(),
+    ]
+
+    # No language filtering: each task loads all languages it supports
+    langs = None
+    split = "test"
+
+    # Tasks
+    tasks = [
+        # Tasks with monolingual datasets
+        workrb.tasks.ESCOJob2SkillRanking(split=split, languages=langs),
+        workrb.tasks.ESCOSkill2JobRanking(split=split, languages=langs),
+        # Tasks with monolingual, cross-lingual, and multilingual datasets
+        workrb.tasks.ProjectCandidateRanking(split=split, languages=langs),
+        workrb.tasks.SearchQueryCandidateRanking(split=split, languages=langs),
+        # TODO: add MELO and MELS tasks when PR #37 is merged
+    ]
+
+    # Evaluate
+    # NOTE: execution_mode=ALL has no effect when using SKIP_LANGUAGE_AGGREGATION,
+    # because no datasets are ever filtered out regardless of execution mode.
+    all_results = workrb.evaluate_multiple_models(
+        models=models,
+        tasks=tasks,
+        output_folder_template="../results/flat_average_all_langs/{model_name}",
+        description="Flat average benchmark (all languages)",
+        force_restart=True,
+        language_aggregation_mode=LanguageAggregationMode.SKIP_LANGUAGE_AGGREGATION,
+        execution_mode=ExecutionMode.ALL,
+    )
+
+    # Display results
+    for model_name, results in all_results.items():
+        print(f"\nResults for {model_name}:")
+        print(results)
diff --git a/examples/run_benchmark_language_weighted.py b/examples/run_benchmark_language_weighted.py
@@ -0,0 +1,76 @@
+"""
+Run the benchmark with language-weighted aggregation on a selected set of languages.
+
+Aggregation mode: MONOLINGUAL_ONLY
+    Within each task, datasets are grouped by language and averaged per
+    group, then the per-language means are averaged to produce the
+    per-task score. This gives equal weight to each language regardless
+    of how many datasets it has. Datasets where input and output
+    languages differ (cross-lingual) are filtered out of aggregation.
+
+Task-level language filtering:
+    The `langs` list restricts which datasets each task loads during
+    initialization. Only languages in this list are considered.
+
+Execution mode: LAZY (the default, passed explicitly below)
+    Datasets that would be filtered out by the aggregation mode are
+    not evaluated at all, saving compute.
+"""
+
+import workrb
+from workrb.types import ExecutionMode, Language, LanguageAggregationMode
+
+if __name__ == "__main__":
+    # Models
+    models = [
+        # Lexical baselines
+        workrb.models.RandomRankingModel(),
+        workrb.models.BM25Model(lowercase=True),
+        # DL model
+        workrb.models.JobBERTModel(),
+    ]
+
+    # Languages (as strings via .value)
+    langs = [
+        Language.DA.value,
+        Language.DE.value,
+        Language.EN.value,
+        Language.ES.value,
+        Language.FR.value,
+        Language.HU.value,
+        Language.IT.value,
+        Language.LT.value,
+        Language.NL.value,
+        Language.PL.value,
+        Language.PT.value,
+        Language.SL.value,
+        Language.SV.value,
+    ]
+    split = "test"
+
+    # Tasks
+    tasks = [
+        # Tasks with monolingual datasets
+        workrb.tasks.ESCOJob2SkillRanking(split=split, languages=langs),
+        workrb.tasks.ESCOSkill2JobRanking(split=split, languages=langs),
+        # Tasks with monolingual, cross-lingual, and multilingual datasets
+        workrb.tasks.ProjectCandidateRanking(split=split, languages=langs),
+        workrb.tasks.SearchQueryCandidateRanking(split=split, languages=langs),
+        # TODO: add MELO and MELS tasks when PR #37 is merged
+    ]
+
+    # Evaluate
+    all_results = workrb.evaluate_multiple_models(
+        models=models,
+        tasks=tasks,
+        output_folder_template="../results/language_weighted/{model_name}",
+        description="Language-weighted benchmark",
+        force_restart=True,
+        language_aggregation_mode=LanguageAggregationMode.MONOLINGUAL_ONLY,
+        execution_mode=ExecutionMode.LAZY,
+    )
+
+    # Display results
+    for model_name, results in all_results.items():
+        print(f"\nResults for {model_name}:")
+        print(results)
diff --git a/examples/run_benchmark_language_weighted_all_langs.py b/examples/run_benchmark_language_weighted_all_langs.py
@@ -0,0 +1,62 @@
+"""
+Run the benchmark with language-weighted aggregation on all available languages.
+
+Aggregation mode: MONOLINGUAL_ONLY
+    Within each task, datasets are grouped by language and averaged per
+    group, then the per-language means are averaged to produce the
+    per-task score. This gives equal weight to each language regardless
+    of how many datasets it has. Datasets where input and output
+    languages differ (cross-lingual) are filtered out of aggregation.
+
+Task-level language filtering: None
+    Setting langs=None means each task loads all languages it supports,
+    with no filtering at the task level.
+
+Execution mode: LAZY (the default, passed explicitly below)
+    Datasets that would be filtered out by the aggregation mode are
+    not evaluated at all, saving compute.
+"""
+
+import workrb
+from workrb.types import ExecutionMode, LanguageAggregationMode
+
+if __name__ == "__main__":
+    # Models
+    models = [
+        # Lexical baselines
+        workrb.models.RandomRankingModel(),
+        workrb.models.BM25Model(lowercase=True),
+        # DL model
+        workrb.models.JobBERTModel(),
+    ]
+
+    # No language filtering: each task loads all languages it supports
+    langs = None
+    split = "test"
+
+    # Tasks
+    tasks = [
+        # Tasks with monolingual datasets
+        workrb.tasks.ESCOJob2SkillRanking(split=split, languages=langs),
+        workrb.tasks.ESCOSkill2JobRanking(split=split, languages=langs),
+        # Tasks with monolingual, cross-lingual, and multilingual datasets
+        workrb.tasks.ProjectCandidateRanking(split=split, languages=langs),
+        workrb.tasks.SearchQueryCandidateRanking(split=split, languages=langs),
+        # TODO: add MELO and MELS tasks when PR #37 is merged
+    ]
+
+    # Evaluate
+    all_results = workrb.evaluate_multiple_models(
+        models=models,
+        tasks=tasks,
+        output_folder_template="../results/language_weighted_all_langs/{model_name}",
+        description="Language-weighted benchmark (all languages)",
+        force_restart=True,
+        language_aggregation_mode=LanguageAggregationMode.MONOLINGUAL_ONLY,
+        execution_mode=ExecutionMode.LAZY,
+    )
+
+    # Display results
+    for model_name, results in all_results.items():
+        print(f"\nResults for {model_name}:")
+        print(results)
diff --git a/examples/run_multiple_models.py b/examples/run_multiple_models.py
@@ -2,6 +2,8 @@
 Reproduce benchmark results.
 """
 
+import workrb
+
 if __name__ == "__main__":
     # 1. Setup model and tasks
     models = [