Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
b00e4c5
refactor: generalize dataset indexing from language-based to dataset_…
federetyk Jan 15, 2026
17b1897
fix: solve issues in example files
federetyk Jan 15, 2026
e16f8dd
fix: add language field to MetricsResult for proper per-language aggr…
federetyk Jan 15, 2026
e254bc2
style: update docstrings to comply with NumPy style
federetyk Jan 16, 2026
40810c2
chore: merge upstream changes (v0.3.0, task renames, test refactor)
federetyk Jan 16, 2026
71d6d97
refactor: rename language_results to datasetid_results for consistenc…
federetyk Feb 19, 2026
647b070
docs: clarify get_dataset_language docstring on purpose and when to o…
federetyk Feb 20, 2026
1b726ee
Merge branch 'techwolf-ai:main' into refactor/generalize-dataset-inde…
federetyk Feb 21, 2026
3a9514d
refactor: migrate freelancer project matching tasks to load_dataset API
federetyk Feb 21, 2026
879dece
feat: add cross-lingual aggregation modes for per-language metrics
federetyk Feb 22, 2026
e3ccb24
Merge branch 'techwolf-ai:main' into refactor/generalize-dataset-inde…
federetyk Feb 23, 2026
72b8e40
test: make it explicit that the dataset key "en" comes from the Langu…
federetyk Feb 23, 2026
033db0f
test: fix lexical baselines regression test to use dataset_id parameter
federetyk Feb 23, 2026
724b0e0
feat: add lazy execution filtering and ExecutionMode enum
federetyk Feb 23, 2026
f3c5e19
test: fix tolerance for regression test to work well on diverse envir…
federetyk Feb 23, 2026
7d0b8b5
refactor: make language_aggregation_mode a non-optional parameter in …
federetyk Feb 24, 2026
4825486
refactor: migrate freelancer task to dataset_id-based language mapping
federetyk Feb 24, 2026
bbe0ac3
refactor: use language-grouped averaging in per-task aggregation
federetyk Feb 25, 2026
e1dfd9d
docs: add benchmark example scripts for each aggregation mode
federetyk Feb 25, 2026
e4a6bce
fix: remove from example the dataset that uses ESCO 1.0.5 but defines…
federetyk Feb 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,10 @@ class MyCustomRankingTask(RankingTask):
"""Override default metrics if needed"""
return ["map", "mrr", "recall@5", "recall@10"]

def load_monolingual_data(self, split: DatasetSplit, language: Language) -> RankingDataset:
def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset:
"""
Load dataset for a specific language and split.
Load dataset for a specific dataset ID and split.

Returns:
RankingDataset with query_texts, target_indices, and target_space
"""
Expand All @@ -196,12 +196,12 @@ class MyCustomRankingTask(RankingTask):
[0, 2], # Software Engineer -> Python, SQL
[0, 1], # Data Scientist -> Python, Machine Learning
]

return RankingDataset(
query_texts=query_texts,
target_indices=target_indices,
target_space=target_space,
language=language,
dataset_id=dataset_id,
)
```

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ Feel free to make a PR to add your models & tasks to the official package! See [

### Checkpointing & Resuming

WorkRB automatically saves result checkpoints after each task completion in a specific language.
WorkRB automatically saves result checkpoints after each dataset evaluation within a task.

**Automatic Resuming** - Simply rerun with the same `output_folder`:

Expand Down
3 changes: 3 additions & 0 deletions examples/custom_model_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import torch
from sentence_transformers import SentenceTransformer

import workrb
from workrb.models.base import ModelInterface
from workrb.registry import register_model
from workrb.types import ModelInputType
Expand Down Expand Up @@ -47,10 +48,12 @@ def __init__(
self.encoder.to(device)
self.encoder.eval()

@property
def name(self) -> str:
    """Unique model name: ``MyCustomModel-`` plus the last ``/``-separated part of the base model name."""
    # Keep only what follows the final "/" (the whole string when there is no "/").
    _, _, short_name = self.base_model_name.rpartition("/")
    return f"MyCustomModel-{short_name}"

@property
def description(self) -> str:
    """Human-readable, fixed description of this example model."""
    summary = "A custom model that demonstrates WorkRB extensibility"
    return summary
Expand Down
7 changes: 4 additions & 3 deletions examples/custom_task_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
and implement the required abstract methods.
"""

import workrb
from workrb.registry import register_task
from workrb.tasks.abstract.base import DatasetSplit, LabelType, Language
from workrb.tasks.abstract.ranking_base import RankingDataset, RankingTaskGroup
Expand Down Expand Up @@ -78,14 +79,14 @@ def supported_target_languages(self) -> list[Language]:
"""Supported target languages are English."""
return [Language.EN]

def load_monolingual_data(self, language: Language, split: DatasetSplit) -> RankingDataset:
def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset:
"""
Load data for evaluation.

This method must return a RankingDataset.

Args:
language: Language code (e.g., "en", "de", "fr")
dataset_id: Dataset identifier (e.g., "en", "de", "fr" for language-based tasks)
split: Data split ("test", "validation", "train")

Returns
Expand Down Expand Up @@ -121,7 +122,7 @@ def load_monolingual_data(self, language: Language, split: DatasetSplit) -> Rank
query_texts=queries,
target_indices=labels,
target_space=targets,
language=language,
dataset_id=dataset_id,
)

# Note: The evaluate() method is inherited from RankingTask and doesn't need
Expand Down
81 changes: 81 additions & 0 deletions examples/run_benchmark_flat_average.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""
Run the benchmark with flat averaging on a selected set of languages.

Aggregation mode: SKIP_LANGUAGE_AGGREGATION
All datasets contribute equally to the per-task score as a flat
average, with no language-based grouping or filtering. This means
cross-lingual and multilingual datasets are included alongside
monolingual ones. The final results do not include per-language
averages, since no language grouping criterion is defined and
there is no unambiguous way to assign cross-lingual or
multilingual datasets to a single language bucket.

Task-level language filtering:
The `langs` list restricts which datasets each task loads during
initialization. Only languages in this list are considered.

Execution mode: ALL
Explicitly set here, but has no practical effect under
SKIP_LANGUAGE_AGGREGATION since no datasets are ever filtered
out by the aggregation mode.
"""

import workrb
from workrb.types import ExecutionMode, Language, LanguageAggregationMode

if __name__ == "__main__":
    # Models under evaluation: two lexical baselines, then one deep-learning model.
    random_baseline = workrb.models.RandomRankingModel()
    bm25_baseline = workrb.models.BM25Model(lowercase=True)
    jobbert = workrb.models.JobBERTModel()
    models = [random_baseline, bm25_baseline, jobbert]

    # Languages are passed to the tasks as plain strings (each enum member's ``.value``).
    selected_languages = (
        Language.DA,
        Language.DE,
        Language.EN,
        Language.ES,
        Language.FR,
        Language.HU,
        Language.IT,
        Language.LT,
        Language.NL,
        Language.PL,
        Language.PT,
        Language.SL,
        Language.SV,
    )
    langs = [language.value for language in selected_languages]
    split = "test"

    # Every task is built with the same split and the same language filter,
    # in the order listed here.
    task_classes = (
        # Tasks with monolingual datasets
        workrb.tasks.ESCOJob2SkillRanking,
        workrb.tasks.ESCOSkill2JobRanking,
        # Tasks with monolingual, cross-lingual, and multilingual datasets
        workrb.tasks.ProjectCandidateRanking,
        workrb.tasks.SearchQueryCandidateRanking,
        # TODO: add MELO and MELS tasks when PR #37 is merged
    )
    tasks = [task_cls(split=split, languages=langs) for task_cls in task_classes]

    # Run-level options for the evaluation call.
    # NOTE: execution_mode=ALL has no effect when using SKIP_LANGUAGE_AGGREGATION,
    # because no datasets are ever filtered out regardless of execution mode.
    evaluation_options = {
        "output_folder_template": "../results/flat_average/{model_name}",
        "description": "Flat average benchmark",
        "force_restart": True,
        "language_aggregation_mode": LanguageAggregationMode.SKIP_LANGUAGE_AGGREGATION,
        "execution_mode": ExecutionMode.ALL,
    }
    all_results = workrb.evaluate_multiple_models(
        models=models,
        tasks=tasks,
        **evaluation_options,
    )

    # Print one results section per evaluated model.
    for evaluated_model, model_results in all_results.items():
        print(f"\nResults for {evaluated_model}:")
        print(model_results)
67 changes: 67 additions & 0 deletions examples/run_benchmark_flat_average_all_langs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""
Run the benchmark with flat averaging on all available languages.

Aggregation mode: SKIP_LANGUAGE_AGGREGATION
All datasets contribute equally to the per-task score as a flat
average, with no language-based grouping or filtering. This means
cross-lingual and multilingual datasets are included alongside
monolingual ones. The final results do not include per-language
averages, since no language grouping criterion is defined and
there is no unambiguous way to assign cross-lingual or
multilingual datasets to a single language bucket.

Task-level language filtering: None
Setting langs=None means each task loads all languages it supports,
with no filtering at the task level.

Execution mode: ALL
Explicitly set here, but has no practical effect under
SKIP_LANGUAGE_AGGREGATION since no datasets are ever filtered
out by the aggregation mode.
"""

import workrb
from workrb.types import ExecutionMode, LanguageAggregationMode

if __name__ == "__main__":
    # Models under evaluation: two lexical baselines, then one deep-learning model.
    random_baseline = workrb.models.RandomRankingModel()
    bm25_baseline = workrb.models.BM25Model(lowercase=True)
    jobbert = workrb.models.JobBERTModel()
    models = [random_baseline, bm25_baseline, jobbert]

    # No language filtering: each task loads all languages it supports
    langs = None
    split = "test"

    # Every task is built with the same split and the same (absent) language
    # filter, in the order listed here.
    task_classes = (
        # Tasks with monolingual datasets
        workrb.tasks.ESCOJob2SkillRanking,
        workrb.tasks.ESCOSkill2JobRanking,
        # Tasks with monolingual, cross-lingual, and multilingual datasets
        workrb.tasks.ProjectCandidateRanking,
        workrb.tasks.SearchQueryCandidateRanking,
        # TODO: add MELO and MELS tasks when PR #37 is merged
    )
    tasks = [task_cls(split=split, languages=langs) for task_cls in task_classes]

    # Run-level options for the evaluation call.
    # NOTE: execution_mode=ALL has no effect when using SKIP_LANGUAGE_AGGREGATION,
    # because no datasets are ever filtered out regardless of execution mode.
    evaluation_options = {
        "output_folder_template": "../results/flat_average_all_langs/{model_name}",
        "description": "Flat average benchmark (all languages)",
        "force_restart": True,
        "language_aggregation_mode": LanguageAggregationMode.SKIP_LANGUAGE_AGGREGATION,
        "execution_mode": ExecutionMode.ALL,
    }
    all_results = workrb.evaluate_multiple_models(
        models=models,
        tasks=tasks,
        **evaluation_options,
    )

    # Print one results section per evaluated model.
    for evaluated_model, model_results in all_results.items():
        print(f"\nResults for {evaluated_model}:")
        print(model_results)
76 changes: 76 additions & 0 deletions examples/run_benchmark_language_weighted.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""
Run the benchmark with language-weighted aggregation on a selected set of languages.

Aggregation mode: MONOLINGUAL_ONLY
Within each task, datasets are grouped by language and averaged per
group, then the per-language means are averaged to produce the
per-task score. This gives equal weight to each language regardless
of how many datasets it has. Datasets where input and output
languages differ (cross-lingual) are filtered out of aggregation.

Task-level language filtering:
The `langs` list restricts which datasets each task loads during
initialization. Only languages in this list are considered.

Execution mode: LAZY (default)
Datasets that would be filtered out by the aggregation mode are
not evaluated at all, saving compute.
"""

import workrb
from workrb.types import ExecutionMode, Language, LanguageAggregationMode

if __name__ == "__main__":
    # Models under evaluation: two lexical baselines, then one deep-learning model.
    random_baseline = workrb.models.RandomRankingModel()
    bm25_baseline = workrb.models.BM25Model(lowercase=True)
    jobbert = workrb.models.JobBERTModel()
    models = [random_baseline, bm25_baseline, jobbert]

    # Languages are passed to the tasks as plain strings (each enum member's ``.value``).
    selected_languages = (
        Language.DA,
        Language.DE,
        Language.EN,
        Language.ES,
        Language.FR,
        Language.HU,
        Language.IT,
        Language.LT,
        Language.NL,
        Language.PL,
        Language.PT,
        Language.SL,
        Language.SV,
    )
    langs = [language.value for language in selected_languages]
    split = "test"

    # Every task is built with the same split and the same language filter,
    # in the order listed here.
    task_classes = (
        # Tasks with monolingual datasets
        workrb.tasks.ESCOJob2SkillRanking,
        workrb.tasks.ESCOSkill2JobRanking,
        # Tasks with monolingual, cross-lingual, and multilingual datasets
        workrb.tasks.ProjectCandidateRanking,
        workrb.tasks.SearchQueryCandidateRanking,
        # TODO: add MELO and MELS tasks when PR #37 is merged
    )
    tasks = [task_cls(split=split, languages=langs) for task_cls in task_classes]

    # Run-level options for the evaluation call: monolingual-only aggregation
    # with lazy execution.
    evaluation_options = {
        "output_folder_template": "../results/language_weighted/{model_name}",
        "description": "Language-weighted benchmark",
        "force_restart": True,
        "language_aggregation_mode": LanguageAggregationMode.MONOLINGUAL_ONLY,
        "execution_mode": ExecutionMode.LAZY,
    }
    all_results = workrb.evaluate_multiple_models(
        models=models,
        tasks=tasks,
        **evaluation_options,
    )

    # Print one results section per evaluated model.
    for evaluated_model, model_results in all_results.items():
        print(f"\nResults for {evaluated_model}:")
        print(model_results)
62 changes: 62 additions & 0 deletions examples/run_benchmark_language_weighted_all_langs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""
Run the benchmark with language-weighted aggregation on all available languages.

Aggregation mode: MONOLINGUAL_ONLY
Within each task, datasets are grouped by language and averaged per
group, then the per-language means are averaged to produce the
per-task score. This gives equal weight to each language regardless
of how many datasets it has. Datasets where input and output
languages differ (cross-lingual) are filtered out of aggregation.

Task-level language filtering: None
Setting langs=None means each task loads all languages it supports,
with no filtering at the task level.

Execution mode: LAZY (default)
Datasets that would be filtered out by the aggregation mode are
not evaluated at all, saving compute.
"""

import workrb
from workrb.types import ExecutionMode, LanguageAggregationMode

if __name__ == "__main__":
    # Models under evaluation: two lexical baselines, then one deep-learning model.
    random_baseline = workrb.models.RandomRankingModel()
    bm25_baseline = workrb.models.BM25Model(lowercase=True)
    jobbert = workrb.models.JobBERTModel()
    models = [random_baseline, bm25_baseline, jobbert]

    # No language filtering: each task loads all languages it supports
    langs = None
    split = "test"

    # Every task is built with the same split and the same (absent) language
    # filter, in the order listed here.
    task_classes = (
        # Tasks with monolingual datasets
        workrb.tasks.ESCOJob2SkillRanking,
        workrb.tasks.ESCOSkill2JobRanking,
        # Tasks with monolingual, cross-lingual, and multilingual datasets
        workrb.tasks.ProjectCandidateRanking,
        workrb.tasks.SearchQueryCandidateRanking,
        # TODO: add MELO and MELS tasks when PR #37 is merged
    )
    tasks = [task_cls(split=split, languages=langs) for task_cls in task_classes]

    # Run-level options for the evaluation call: monolingual-only aggregation
    # with lazy execution.
    evaluation_options = {
        "output_folder_template": "../results/language_weighted_all_langs/{model_name}",
        "description": "Language-weighted benchmark (all languages)",
        "force_restart": True,
        "language_aggregation_mode": LanguageAggregationMode.MONOLINGUAL_ONLY,
        "execution_mode": ExecutionMode.LAZY,
    }
    all_results = workrb.evaluate_multiple_models(
        models=models,
        tasks=tasks,
        **evaluation_options,
    )

    # Print one results section per evaluated model.
    for evaluated_model, model_results in all_results.items():
        print(f"\nResults for {evaluated_model}:")
        print(model_results)
2 changes: 2 additions & 0 deletions examples/run_multiple_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
Reproduce benchmark results.
"""

import workrb

if __name__ == "__main__":
# 1. Setup model and tasks
models = [
Expand Down
Loading