From 96020089dee48b7e327c4104acf36b98620aa9e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine?=
Date: Tue, 23 Sep 2025 14:54:41 +0200
Subject: [PATCH 01/10] homogenize k and n in parametrizable metrics

---
 src/lighteval/metrics/metrics.py             | 18 ++---
 src/lighteval/metrics/metrics_sample.py      | 84 +++++++++------------
 src/lighteval/metrics/utils/metric_utils.py  | 10 ++-
 3 files changed, 51 insertions(+), 61 deletions(-)

diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index 0674d2df1..d1869813b 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -41,7 +41,7 @@
     MRR,
     ROUGE,
     AccGoldLikelihood,
-    AvgAtK,
+    AvgAtN,
     BertScore,
     ExactMatches,
     Extractiveness,
@@ -50,7 +50,7 @@
     GPassAtK,
     JudgeLLMSimpleQA,
     LoglikelihoodAcc,
-    MajAtK,
+    MajAtN,
     PassAtK,
     Recall,
     StringDistance,
@@ -85,16 +85,16 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
-    avg_at_k = SampleLevelMetric(
-        metric_name="avg@k",
-        sample_level_fn=AvgAtK(strip_strings=True),
+    avg_at_n = SampleLevelMetric(
+        metric_name="avg@n",
+        sample_level_fn=AvgAtN(strip_strings=True),
         category=SamplingMethod.GENERATIVE,
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
-    avg_at_k_math = SampleLevelMetric(
-        metric_name="avg@k",
-        sample_level_fn=AvgAtK(
+    avg_at_n_math = SampleLevelMetric(
+        metric_name="avg@n",
+        sample_level_fn=AvgAtN(
             sample_scoring_function=MultilingualExtractiveMatchMetric(
                 language=Language.ENGLISH,
                 gold_extraction_target=[ExprExtractionConfig(), LatexExtractionConfig()],
@@ -367,7 +367,7 @@ class Metrics(Enum):
     )
     maj_at_k = SampleLevelMetric(
         metric_name="maj@k",
-        sample_level_fn=MajAtK(),
+        sample_level_fn=MajAtN(),
         category=SamplingMethod.GENERATIVE,
         corpus_level_fn=np.mean,
         higher_is_better=True,
diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index 25b4f68ff..bf2a6cd05 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -823,9 +823,6 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
         Returns:
             float: Score over the current sample's items.
         """
-        import nltk
-
-        nltk.download("punkt_tab")
         golds = doc.get_golds()
         predictions = model_response.final_text
         return np.mean([self._bleu_score(golds, p) for p in predictions])
@@ -1125,14 +1122,13 @@ def __init__(
                 raise ValueError(f"Unknown normalization function: {normalize}")
             else:
                 self.normalize = normalize
-
         self.strip_strings = strip_strings
 
         if callable(sample_scoring_function):
             self.compute_score = sample_scoring_function
             self.type_exact_match = None
         elif isinstance(sample_scoring_function, SampleLevelComputation):
-            self.score_sample = sample_scoring_function.compute
+            self.compute_score = sample_scoring_function.compute
             self.type_exact_match = None
         else:
             if isinstance(sample_scoring_function, str):
@@ -1141,11 +1137,9 @@ def __init__(
                         f"type_exact_match (used in parametrized_exact_match) must be one of prefix, suffix, or full. Was {sample_scoring_function} instead."
                     )
                 self.type_exact_match = sample_scoring_function
-                self.score_sample = self.default_sample_scoring
             else:
                 self.type_exact_match = "full"
                 self.compute_score = self.default_sample_scoring
-                self.score_sample = self.default_sample_scoring
 
     def preprocess(self, text: str) -> str:
         if not text:
@@ -1172,17 +1166,17 @@ def name_metrics(self) -> str | list[str]:
         raise NotImplementedError
 
 
-class AvgAtK(SamplingMetric, SampleLevelComputation):
-    def __init__(self, k: int | None = None, **kwargs):
-        """Sample score averages all the individual k predictions scores.
+class AvgAtN(SamplingMetric, SampleLevelComputation):
+    def __init__(self, n: int | None = None, **kwargs):
+        """Sample score averages all the individual n predictions scores.
 
         Args:
-            k (int | None): The number of top choices to consider.
+            n (int | None): Number of correct samples threshold
             **kwargs: Additional keyword arguments.
         """
         super().__init__(**kwargs)
-        self.k = k
-        self.attribute_must_be_set = ["k"]
+        self.n = n
+        self.attribute_must_be_set = ["n"]
 
     def compute(self, model_response: ModelResponse, doc: Doc, **kwargs):
         """Computes the metric over a list of golds and predictions for one single sample.
@@ -1199,40 +1193,36 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs):
         """
         all_scores = []
-        for i in range(self.k):
-            all_scores.append(self.score_sample(doc, model_response[i]))
+        for i in range(self.n):
+            all_scores.append(self.compute_score(doc, model_response[i]))
         avg_score = np.mean(all_scores)
         return avg_score
 
     def num_samples(self):
-        """Get the number of samples for this metric.
-
-        Returns:
-            int: The number of samples
-        """
-        return self.k
+        return self.n
 
 
-class MajAtK(SamplingMetric, SampleLevelComputation):
-    def __init__(self, k: int | None = None, **kwargs):
+class MajAtN(SamplingMetric, SampleLevelComputation):
+    def __init__(self, n: int | None = None, **kwargs):
         """An exact match class.
 
         Args:
-            k (int): The number of top choices to consider.
+            n (int): Total number of samples to generate
             **kwargs: Additional keyword arguments.
         """
         super().__init__(**kwargs)
-        self.k = k
-        self.attribute_must_be_set = ["k"]
+        self.n = n
+        self.attribute_must_be_set = ["n"]
 
-    def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
+    def compute(self, model_response: ModelResponse, docs: Doc, **kwargs):
         """Computes the metric over a list of golds and predictions for one single sample.
-        It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones, then compares it to the gold.
+        It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones,
+        then compares it to the gold.
 
         Args:
-            doc (Doc): The document containing gold references.
             model_response (ModelResponse): The model's response containing predictions.
+            docs (Doc): The document containing gold references.
             **kwargs: Additional keyword arguments.
Returns: @@ -1240,38 +1230,36 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs): """ if self.k is None: raise Exception("You did not set the value of k") - - golds = doc.get_golds() - + golds = docs.get_golds() if len(golds) > 1: raise Exception("Cannot compute maj@k with several golds") - processed_choices = [self.preprocess(text=g) for g in doc.get_golds()] + processed_choices = [self.preprocess(text=g) for g in docs.get_golds()] new_doc = Doc( choices=processed_choices, - query=doc.query, - gold_index=list(range(len(processed_choices))), + query=docs.query, + gold_index=docs.gold_index, ) all_answers = [] - for pred in model_response.final_text[: self.k]: + for pred in model_response.final_text[: self.n]: all_answers.append(self.preprocess(text=pred)) majority_prediction = max(all_answers, key=all_answers.count) new_model_response = ModelResponse( text=[majority_prediction], ) - return self.compute_score(new_doc, new_model_response) + return self.compute_score(new_model_response, new_doc) def num_samples(self): - return self.k + return self.n class PassAtK(SamplingMetric, SampleLevelComputation): def __init__(self, k: int | None = None, n: int | None = None, **kwargs): - """Computing pass at k + """Computing pass at k with an estimator Args: - k (int | None): Threshold for the number of successful attempts. - n (int | None): Number of samples to generate. + k (int | None): Number of correct samples threshold + n (int | None): Total number of samples to generate. **kwargs: Additional keyword arguments. """ super().__init__(**kwargs) @@ -1316,7 +1304,7 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float: new_model_response = ModelResponse( text=[cur_pred], ) - all_scores.append(self.score_sample(doc=new_doc, model_response=new_model_response)) + all_scores.append(self.compute_score(doc=new_doc, model_response=new_model_response)) return self.pass_at_k(all_scores) @@ -1344,8 +1332,8 @@ def __init__( """Computing G-Pass@k from http://arxiv.org/abs/2412.13147 Args: - k (Union[int, list[int]] | None): The number of successful attempts to be considered. - n (int | None): Number of samples to generate. + k (Union[int, list[int]] | None): Number of correct samples threshold + n (int | None): Total number of samples to generate. thresholds (list[float]): Thresholds to control successful attempts in k generate. name_prefix (str | None): Prefix for the metric name. **kwargs: Additional keyword arguments. 
@@ -1406,7 +1394,7 @@ def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> float:
 
             new_model_response = ModelResponse(
                 text=[cur_pred],
             )
-            all_scores.append(self.score_sample(new_doc, new_model_response))
+            all_scores.append(self.compute_score(new_doc, new_model_response))
 
         return self.g_pass_at_k(all_scores)
 
@@ -1439,8 +1427,8 @@ def compute_mg_pass_at_k(n, c, k):
         metrics = {}
         for k in ks:
             for t in thresholds:
-                metrics[f"{self.name}{k}_{t}"] = compute_g_pass_at_k(n, c, k, t)
-            metrics[f"m{self.name}{k}"] = compute_mg_pass_at_k(n, c, k)
+                metrics[f"{self.name}@{k}_{t}"] = compute_g_pass_at_k(n, c, k, t)
+            metrics[f"m{self.name}@{k}"] = compute_mg_pass_at_k(n, c, k)
 
         return metrics
 
@@ -1452,8 +1440,8 @@ def metric_names(self):
         metrics = []
         for k in ks:
             for t in thresholds:
-                metrics.append(f"{self.name}{k}_{t}")
-            metrics.append(f"m{self.name}{k}")
+                metrics.append(f"{self.name}@{k}_{t}")
+            metrics.append(f"m{self.name}@{k}")
 
         return metrics
 
diff --git a/src/lighteval/metrics/utils/metric_utils.py b/src/lighteval/metrics/utils/metric_utils.py
index c806c5b6b..8a78380fb 100644
--- a/src/lighteval/metrics/utils/metric_utils.py
+++ b/src/lighteval/metrics/utils/metric_utils.py
@@ -83,15 +83,17 @@ def __call__(self, sample_params: dict | None):
 
         # Once the parameters are updated, we need to adjust the
         # metric name to what will be returned
-        sample_params_name = "&".join(sample_params.keys())
+        sample_params_values = [f"{k}={v}" for k, v in sample_params.items()]
+        sample_params_values = "&".join(sample_params_values)
         if isinstance(self, MetricGrouping):
             if hasattr(self.sample_level_fn, "metric_names"):
-                # this is mostly for the gpass@k metrics
+                # this is mostly for the gpass@k metrics which redefine submetric names
                 self.metric_name = self.sample_level_fn.metric_names
             else:
-                self.metric_name = [f"{metric}_with_{sample_params_name}" for metric in self.metric_name]
+                self.metric_name = [f"{metric}_with_{sample_params_values}" for metric in self.metric_name]
         else:
-            self.metric_name = f"{self.metric_name}_with_{sample_params_name}"
+            self.metric_name = f"{self.metric_name}_with_{sample_params_values}"
+
         return self
 
     @staticmethod

From 4db4639270f745e532ce7e2f4a643f5d0394b9e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine?=
Date: Tue, 23 Sep 2025 14:59:55 +0200
Subject: [PATCH 02/10] updated aime, last metric fixes

---
 src/lighteval/metrics/metrics.py        |  2 +-
 src/lighteval/metrics/metrics_sample.py |  4 +-
 src/lighteval/tasks/default_tasks.py    | 60 ++++++++++++-------------
 3 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index d1869813b..fed235067 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -365,7 +365,7 @@ class Metrics(Enum):
         corpus_level_fn=CorpusLevelF1Score(None),
         higher_is_better=True,
     )
-    maj_at_k = SampleLevelMetric(
+    maj_at_n = SampleLevelMetric(
         metric_name="maj@k",
         sample_level_fn=MajAtN(),
         category=SamplingMethod.GENERATIVE,
diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index bf2a6cd05..e2bde4adc 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -1229,10 +1229,10 @@ def compute(self, model_response: ModelResponse, docs: Doc, **kwargs):
             float: Aggregated score over the current sample's items.
""" if self.k is None: - raise Exception("You did not set the value of k") + raise Exception("You did not set the value of n") golds = docs.get_golds() if len(golds) > 1: - raise Exception("Cannot compute maj@k with several golds") + raise Exception("Cannot compute maj@n with several golds") processed_choices = [self.preprocess(text=g) for g in docs.get_golds()] new_doc = Doc( diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index 7092264ad..1c72d5008 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -410,7 +410,7 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.pass_at_k_math(sample_params={"k": 1})], + metrics=[Metrics.pass_at_k_math(sample_params={"k": 1}), Metrics.avg_at_n_math(sample_params={"n": 1})], version=2, ) aime24_avg = LightevalTaskConfig( @@ -424,7 +424,7 @@ few_shots_split=None, few_shots_select=None, generation_size=None, - metrics=[Metrics.avg_at_k_math(sample_params={"k": 64})], + metrics=[Metrics.avg_at_n_math(sample_params={"n": 64})], version=2, ) aime24_gpassk = LightevalTaskConfig( @@ -10464,9 +10464,9 @@ generation_size=2048, metrics=[ Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_k( + Metrics.maj_at_n( sample_params={ - "k": 4, + "n": 4, "strip_strings": True, "normalize_pred": math_normalizer, "normalize_gold": math_normalizer, @@ -10489,9 +10489,9 @@ generation_size=2048, metrics=[ Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_k( + Metrics.maj_at_n( sample_params={ - "k": 4, + "n": 4, "strip_strings": True, "normalize_pred": math_normalizer, "normalize_gold": math_normalizer, @@ -10514,9 +10514,9 @@ generation_size=2048, metrics=[ Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_k( + Metrics.maj_at_n( sample_params={ - "k": 4, + "n": 4, "strip_strings": True, "normalize_pred": math_normalizer, "normalize_gold": math_normalizer, @@ -10539,9 +10539,9 @@ generation_size=2048, metrics=[ Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_k( + Metrics.maj_at_n( sample_params={ - "k": 4, + "n": 4, "strip_strings": True, "normalize_pred": math_normalizer, "normalize_gold": math_normalizer, @@ -10564,9 +10564,9 @@ generation_size=2048, metrics=[ Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_k( + Metrics.maj_at_n( sample_params={ - "k": 4, + "n": 4, "strip_strings": True, "normalize_pred": math_normalizer, "normalize_gold": math_normalizer, @@ -10589,9 +10589,9 @@ generation_size=2048, metrics=[ Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_k( + Metrics.maj_at_n( sample_params={ - "k": 4, + "n": 4, "strip_strings": True, "normalize_pred": math_normalizer, "normalize_gold": math_normalizer, @@ -10614,9 +10614,9 @@ generation_size=2048, metrics=[ Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_k( + Metrics.maj_at_n( sample_params={ - "k": 4, + "n": 4, "strip_strings": True, "normalize_pred": math_normalizer, "normalize_gold": math_normalizer, @@ -10639,9 +10639,9 @@ generation_size=2048, metrics=[ 
Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_k( + Metrics.maj_at_n( sample_params={ - "k": 4, + "n": 4, "strip_strings": True, "normalize_pred": math_normalizer, "normalize_gold": math_normalizer, @@ -10664,9 +10664,9 @@ generation_size=2048, metrics=[ Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_k( + Metrics.maj_at_n( sample_params={ - "k": 4, + "n": 4, "strip_strings": True, "normalize_pred": math_normalizer, "normalize_gold": math_normalizer, @@ -10689,9 +10689,9 @@ generation_size=2048, metrics=[ Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_k( + Metrics.maj_at_n( sample_params={ - "k": 4, + "n": 4, "strip_strings": True, "normalize_pred": math_normalizer, "normalize_gold": math_normalizer, @@ -10714,9 +10714,9 @@ generation_size=2048, metrics=[ Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_k( + Metrics.maj_at_n( sample_params={ - "k": 4, + "n": 4, "strip_strings": True, "normalize_pred": math_normalizer, "normalize_gold": math_normalizer, @@ -10739,9 +10739,9 @@ generation_size=2048, metrics=[ Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_k( + Metrics.maj_at_n( sample_params={ - "k": 4, + "n": 4, "strip_strings": True, "normalize_pred": math_normalizer, "normalize_gold": math_normalizer, @@ -10764,9 +10764,9 @@ generation_size=2048, metrics=[ Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_k( + Metrics.maj_at_n( sample_params={ - "k": 4, + "n": 4, "strip_strings": True, "normalize_pred": math_normalizer, "normalize_gold": math_normalizer, @@ -10789,9 +10789,9 @@ generation_size=2048, metrics=[ Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}), - Metrics.maj_at_k( + Metrics.maj_at_n( sample_params={ - "k": 4, + "n": 4, "strip_strings": True, "normalize_pred": math_normalizer, "normalize_gold": math_normalizer, From 393c4305884619819bbf8e35ca31d8fa0e45bba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Tue, 23 Sep 2025 15:18:22 +0200 Subject: [PATCH 03/10] fix --- src/lighteval/metrics/metrics_sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index e2bde4adc..43f6f732d 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -1171,7 +1171,7 @@ def __init__(self, n: int | None = None, **kwargs): """Sample score averages all the individual n predictions scores. Args: - n (int | None): Number of correct samples threshold + n (int | None): Number of samples to generate **kwargs: Additional keyword arguments. 
""" super().__init__(**kwargs) From ce3e94356e396415343632768218495b459e4dc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Tue, 23 Sep 2025 15:21:35 +0200 Subject: [PATCH 04/10] restore rm import --- src/lighteval/metrics/metrics.py | 2 +- src/lighteval/metrics/metrics_sample.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index fed235067..bc9488fef 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -366,7 +366,7 @@ class Metrics(Enum): higher_is_better=True, ) maj_at_n = SampleLevelMetric( - metric_name="maj@k", + metric_name="maj@n", sample_level_fn=MajAtN(), category=SamplingMethod.GENERATIVE, corpus_level_fn=np.mean, diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 43f6f732d..d7929a991 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -823,6 +823,9 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs): Returns: float: Score over the current sample's items. """ + import nltk + + nltk.download("punkt_tab") golds = doc.get_golds() predictions = model_response.final_text return np.mean([self._bleu_score(golds, p) for p in predictions]) From 3d2607345bdaa4913de35862192c78cc84470219 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Tue, 23 Sep 2025 15:25:09 +0200 Subject: [PATCH 05/10] restore --- src/lighteval/metrics/metrics_sample.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index d7929a991..90d407c9b 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -63,7 +63,7 @@ class SampleLevelComputation(ABC): @abstractmethod - def compute(self, model_response: ModelResponse, doc: Doc, **kwargs): + def compute(self, doc: Doc, model_response: ModelResponse, **kwargs): raise NotImplementedError def __str__(self): @@ -444,7 +444,7 @@ def __init__(self, length_normalization: bool = False): """ self.length_normalization = length_normalization - def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> float: + def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float: """Mean reciprocal rank. Measures the quality of a ranking of choices (ordered by correctness). Args: @@ -1181,7 +1181,7 @@ def __init__(self, n: int | None = None, **kwargs): self.n = n self.attribute_must_be_set = ["n"] - def compute(self, model_response: ModelResponse, doc: Doc, **kwargs): + def compute(self, doc: Doc, model_response: ModelResponse, **kwargs): """Computes the metric over a list of golds and predictions for one single sample. It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones, then compares it to the gold. @@ -1218,14 +1218,14 @@ def __init__(self, n: int | None = None, **kwargs): self.n = n self.attribute_must_be_set = ["n"] - def compute(self, model_response: ModelResponse, docs: Doc, **kwargs): + def compute(self, doc: Doc, model_response: ModelResponse, **kwargs): """Computes the metric over a list of golds and predictions for one single sample. It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones, then compares it to the gold. 
 
         Args:
+            doc (Doc): The document containing gold references.
             model_response (ModelResponse): The model's response containing predictions.
-            docs (Doc): The document containing gold references.
             **kwargs: Additional keyword arguments.
 
         Returns:
@@ -1233,15 +1233,16 @@ def compute(self, model_response: ModelResponse, docs: Doc, **kwargs):
         """
-        if self.k is None:
+        if self.n is None:
             raise Exception("You did not set the value of n")
-        golds = docs.get_golds()
+
+        golds = doc.get_golds()
         if len(golds) > 1:
             raise Exception("Cannot compute maj@n with several golds")
 
-        processed_choices = [self.preprocess(text=g) for g in docs.get_golds()]
+        processed_choices = [self.preprocess(text=g) for g in doc.get_golds()]
         new_doc = Doc(
             choices=processed_choices,
-            query=docs.query,
-            gold_index=docs.gold_index,
+            query=doc.query,
+            gold_index=doc.gold_index,
         )
         all_answers = []
         for pred in model_response.final_text[: self.n]:
@@ -1357,7 +1358,7 @@ def k(self):
     def k(self, new_val):
         self._k = as_list(new_val)
 
-    def compute(self, model_response: ModelResponse, doc: Doc, **kwargs) -> float:
+    def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
         """Computes the metric over a list of golds and predictions for one single item
         with possibly many samples. It applies normalisation (if needed) to model prediction and gold, computes their per
         prediction score, then aggregates the scores over the samples using a pass@k.

From 3082a0304d58793683aba80bca65b02142f6f5e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine?=
Date: Tue, 23 Sep 2025 15:27:48 +0200
Subject: [PATCH 06/10] update doc

---
 docs/source/package_reference/metrics.mdx | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/docs/source/package_reference/metrics.mdx b/docs/source/package_reference/metrics.mdx
index 1b946a82e..bb22975bf 100644
--- a/docs/source/package_reference/metrics.mdx
+++ b/docs/source/package_reference/metrics.mdx
@@ -56,15 +56,21 @@
 [[autodoc]] metrics.metrics_sample.BLEU
 ### StringDistance
 [[autodoc]] metrics.metrics_sample.StringDistance
+
+### Metrics allowing sampling
+#### PassAtK
+[[autodoc]] metrics.metrics_sample.PassAtK
+#### MajAtN
+[[autodoc]] metrics.metrics_sample.MajAtN
+#### AvgAtN
+[[autodoc]] metrics.metrics_sample.AvgAtN
+
+## LLM-as-a-Judge
+### JudgeLM
+[[autodoc]] metrics.utils.llm_as_judge.JudgeLM
 ### JudgeLLM
 [[autodoc]] metrics.metrics_sample.JudgeLLM
 ### JudgeLLMMTBench
 [[autodoc]] metrics.metrics_sample.JudgeLLMMTBench
 ### JudgeLLMMixEval
 [[autodoc]] metrics.metrics_sample.JudgeLLMMixEval
-### MajAtK
-[[autodoc]] metrics.metrics_sample.MajAtK
-
-## LLM-as-a-Judge
-### JudgeLM
-[[autodoc]] metrics.utils.llm_as_judge.JudgeLM

From 038e64c060470a9c0ca68abf55e0c05d34f9199a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine?=
Date: Tue, 23 Sep 2025 16:21:27 +0200
Subject: [PATCH 07/10] gpqa fix

---
 src/lighteval/metrics/utils/metric_utils.py         | 4 ++--
 src/lighteval/tasks/extended/lcb/codegen_metrics.py | 5 ++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/lighteval/metrics/utils/metric_utils.py b/src/lighteval/metrics/utils/metric_utils.py
index 8a78380fb..b50264855 100644
--- a/src/lighteval/metrics/utils/metric_utils.py
+++ b/src/lighteval/metrics/utils/metric_utils.py
@@ -90,9 +90,9 @@ def __call__(self, sample_params: dict | None):
                 # this is mostly for the gpass@k metrics which redefine submetric names
                 self.metric_name = self.sample_level_fn.metric_names
             else:
-                self.metric_name = [f"{metric}_with_{sample_params_values}" for metric in self.metric_name]
+
self.metric_name = [f"{metric}_{sample_params_values}" for metric in self.metric_name] else: - self.metric_name = f"{self.metric_name}_with_{sample_params_values}" + self.metric_name = f"{self.metric_name}_{sample_params_values}" return self diff --git a/src/lighteval/tasks/extended/lcb/codegen_metrics.py b/src/lighteval/tasks/extended/lcb/codegen_metrics.py index 98fad8858..08246806a 100644 --- a/src/lighteval/tasks/extended/lcb/codegen_metrics.py +++ b/src/lighteval/tasks/extended/lcb/codegen_metrics.py @@ -53,7 +53,10 @@ from tqdm import tqdm -sys.set_int_max_str_digits(50000) +try: + sys.set_int_max_str_digits(50000) +except AttributeError: + print("You likely won't be able to run codegen metrics on your system.") os.environ["TOKENIZERS_PARALLELISM"] = "false" From 238729dde1e7cc955069c230b58eb5d375741643 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Tue, 23 Sep 2025 16:26:39 +0200 Subject: [PATCH 08/10] pass at --- src/lighteval/metrics/metrics_sample.py | 8 +++---- .../test_cases/gpqa_instruct_pass_at_k.json | 22 +++++++++---------- tests/unit/metrics/test_cases/pass_at_k.json | 6 ++--- .../metrics/test_cases/pass_at_k_letters.json | 6 ++--- .../metrics/test_cases/pass_at_k_math.json | 6 ++--- 5 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 90d407c9b..242d13be4 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -1431,8 +1431,8 @@ def compute_mg_pass_at_k(n, c, k): metrics = {} for k in ks: for t in thresholds: - metrics[f"{self.name}@{k}_{t}"] = compute_g_pass_at_k(n, c, k, t) - metrics[f"m{self.name}@{k}"] = compute_mg_pass_at_k(n, c, k) + metrics[f"{self.name}{k}_{t}"] = compute_g_pass_at_k(n, c, k, t) + metrics[f"m{self.name}{k}"] = compute_mg_pass_at_k(n, c, k) return metrics @@ -1444,8 +1444,8 @@ def metric_names(self): metrics = [] for k in ks: for t in thresholds: - metrics.append(f"{self.name}@{k}_{t}") - metrics.append(f"m{self.name}@{k}") + metrics.append(f"{self.name}{k}_{t}") + metrics.append(f"m{self.name}{k}") return metrics diff --git a/tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json b/tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json index c3a0c6f25..fead1bd4c 100644 --- a/tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json +++ b/tests/unit/metrics/test_cases/gpqa_instruct_pass_at_k.json @@ -22,7 +22,7 @@ "output_tokens": [] }, "expected_output": { - "gpqa_pass@k_with_k&n&strip_strings": 1.0 + "gpqa_pass@k_k=1&n=1&strip_strings=True": 1.0 }, "tolerance": 0.01, "description": "Basic test case with single correct sample" @@ -47,7 +47,7 @@ "output_tokens": [] }, "expected_output": { - "gpqa_pass@k_with_k&n&strip_strings": 1.0 + "gpqa_pass@k_k=2&n=3&strip_strings=True": 1.0 }, "tolerance": 0.01, "description": "Test case with multiple samples all correct" @@ -72,7 +72,7 @@ "output_tokens": [] }, "expected_output": { - "gpqa_pass@k_with_k&n&strip_strings": 0.8333333333333333 + "gpqa_pass@k_k=2&n=4&strip_strings=True": 0.8333333333333333 }, "tolerance": 0.01, "description": "Test case with mixed correct and incorrect samples" @@ -97,7 +97,7 @@ "output_tokens": [] }, "expected_output": { - "gpqa_pass@k_with_k&n&strip_strings": 0.5 + "gpqa_pass@k_k=1&n=2&strip_strings=True": 0.5 }, "tolerance": 0.01, "description": "Test case with case sensitivity (strip_strings should handle this)" @@ -122,7 +122,7 @@ "output_tokens": [] }, "expected_output": { - 
"gpqa_pass@k_with_k&n&strip_strings": 0.0 + "gpqa_pass@k_k=1&n=3&strip_strings=True": 0.0 }, "tolerance": 0.01, "description": "Test case with all incorrect samples" @@ -147,7 +147,7 @@ "output_tokens": [] }, "expected_output": { - "gpqa_pass@k_with_k&n&strip_strings": 1.0 + "gpqa_pass@k_k=5&n=8&strip_strings=True": 1.0 }, "tolerance": 0.01, "description": "Test case with high k value and multiple correct samples" @@ -172,7 +172,7 @@ "output_tokens": [] }, "expected_output": { - "gpqa_pass@k_with_k&n&strip_strings": 1.0 + "gpqa_pass@k_k=1&n=2&strip_strings=True": 1.0 }, "tolerance": 0.01, "description": "Test case with parentheses format" @@ -197,7 +197,7 @@ "output_tokens": [] }, "expected_output": { - "gpqa_pass@k_with_k&n&strip_strings": 1.0 + "gpqa_pass@k_k=1&n=2&strip_strings=True": 1.0 }, "tolerance": 0.01, "description": "Test case with reasoning and answer extraction" @@ -222,7 +222,7 @@ "output_tokens": [] }, "expected_output": { - "gpqa_pass@k_with_k&n&strip_strings": 1.0 + "gpqa_pass@k_k=1&n=2&strip_strings=True": 1.0 }, "tolerance": 0.01, "description": "Test case with 'final answer' format" @@ -247,7 +247,7 @@ "output_tokens": [] }, "expected_output": { - "gpqa_pass@k_with_k&n&strip_strings": 1.0 + "gpqa_pass@k_k=1&n=1&strip_strings=True": 1.0 }, "tolerance": 0.01, "description": "Edge case with single choice" @@ -272,7 +272,7 @@ "output_tokens": [] }, "expected_output": { - "gpqa_pass@k_with_k&n&strip_strings": 0.8333333333333333 + "gpqa_pass@k_k=2&n=4&strip_strings=True": 0.8333333333333333 }, "tolerance": 0.01, "description": "Test case with multiple correct answers (first correct answer)" diff --git a/tests/unit/metrics/test_cases/pass_at_k.json b/tests/unit/metrics/test_cases/pass_at_k.json index 1e552cb96..5afbe089f 100644 --- a/tests/unit/metrics/test_cases/pass_at_k.json +++ b/tests/unit/metrics/test_cases/pass_at_k.json @@ -18,7 +18,7 @@ "output_tokens": [] }, "expected_output": { - "pass@k_with_k&n": 0.5 + "pass@k_k=1&n=2": 0.5 }, "tolerance": 0.01, "description": "Test pass at k with correct answer in k" @@ -39,7 +39,7 @@ "output_tokens": [] }, "expected_output": { - "pass@k_with_k&n": 0.0 + "pass@k_k=1&n=2": 0.0 }, "tolerance": 0.01, "description": "Test pass at k with correct answer not in k" @@ -60,7 +60,7 @@ "output_tokens": [] }, "expected_output": { - "pass@k_with_k&n": 0.66 + "pass@k_k=2&n=3": 0.66 }, "tolerance": 0.01, "description": "Test pass at k with multiple attempts" diff --git a/tests/unit/metrics/test_cases/pass_at_k_letters.json b/tests/unit/metrics/test_cases/pass_at_k_letters.json index 5156b8e36..72f1857b5 100644 --- a/tests/unit/metrics/test_cases/pass_at_k_letters.json +++ b/tests/unit/metrics/test_cases/pass_at_k_letters.json @@ -18,7 +18,7 @@ "output_tokens": [] }, "expected_output": { - "pass@k_with_k&n": 0.0 + "pass@k_k=1&n=2": 0.0 }, "tolerance": 0.01, "description": "Test pass at k letters with correct letter answer" @@ -39,7 +39,7 @@ "output_tokens": [] }, "expected_output": { - "pass@k_with_k&n": 0.0 + "pass@k_k=1&n=2": 0.0 }, "tolerance": 0.01, "description": "Test pass at k letters with wrong letter answer" @@ -60,7 +60,7 @@ "output_tokens": [] }, "expected_output": { - "pass@k_with_k&n": 0.0 + "pass@k_k=2&n=3": 0.0 }, "tolerance": 0.01, "description": "Test pass at k letters with multiple attempts" diff --git a/tests/unit/metrics/test_cases/pass_at_k_math.json b/tests/unit/metrics/test_cases/pass_at_k_math.json index 0ebd6436a..922f138fd 100644 --- a/tests/unit/metrics/test_cases/pass_at_k_math.json +++ 
b/tests/unit/metrics/test_cases/pass_at_k_math.json @@ -16,7 +16,7 @@ "text": ["4", "5"] }, "expected_output": { - "pass@k_with_k&n": 0.5 + "pass@k_k=1&n=2": 0.5 }, "tolerance": 0.01, "description": "Test pass at k math with correct math answer" @@ -35,7 +35,7 @@ "text": ["5", "6"] }, "expected_output": { - "pass@k_with_k&n": 0.0 + "pass@k_k=1&n=2": 0.0 }, "tolerance": 0.01, "description": "Test pass at k math with wrong math answer" @@ -54,7 +54,7 @@ "text": ["10", "12", "15"] }, "expected_output": { - "pass@k_with_k&n": 0.66 + "pass@k_k=2&n=3": 0.66 }, "tolerance": 0.01, "description": "Test pass at k math with multiple attempts" From 00d6309cb89439b8492dbffe853221fe623efefc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Tue, 23 Sep 2025 16:34:59 +0200 Subject: [PATCH 09/10] recall --- tests/unit/metrics/test_cases/recall_at_k.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/metrics/test_cases/recall_at_k.json b/tests/unit/metrics/test_cases/recall_at_k.json index 8259a0ced..25f026ceb 100644 --- a/tests/unit/metrics/test_cases/recall_at_k.json +++ b/tests/unit/metrics/test_cases/recall_at_k.json @@ -18,7 +18,7 @@ "output_tokens": [] }, "expected_output": { - "recall_with_k": 1 + "recall_k=2": 1 }, "tolerance": 0.01, "description": "Test recall at k with correct choice in top k" @@ -39,7 +39,7 @@ "output_tokens": [] }, "expected_output": { - "recall_with_k": 0 + "recall_k=1": 0 }, "tolerance": 0.01, "description": "Test recall at k with correct choice not in top k" @@ -60,7 +60,7 @@ "output_tokens": [] }, "expected_output": { - "recall_with_k": 1 + "recall_k=2": 1 }, "tolerance": 0.01, "description": "Test recall at k with multiple gold indices" From 2e4355578b7f7d909ea8a9811711898e114397bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Tue, 23 Sep 2025 16:40:09 +0200 Subject: [PATCH 10/10] test --- .../SmolLM2-1.7B-Instruct-results-accelerate.json | 4 ++-- .../reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json index 7c8c77d79..5a9e84475 100644 --- a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json +++ b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2fbcbcf4031d545999b8e02afffa2537f642a1239664af16160e5fcd250a4ecc -size 50626 +oid sha256:c432481623defc987cf21e429ad7dabe5f306e530fa1df296caee8b36f3b6842 +size 52145 diff --git a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json index 66ab85090..d5e82a8ac 100644 --- a/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json +++ b/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1302090702deaf018f21f1dc5ffd2a2a2b93e19b50aa459508146f130aa9ecf -size 50565 +oid sha256:1e5a915540836c27238dbeaa935925bc9275d8972d6c0d00a1aa6e3279a7138d +size 52084
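
Editor's note, not part of the patch series: a minimal stand-alone sketch of the metric-name scheme that PATCH 07 settles on, where sample parameters are appended as key=value pairs joined by "&" — the format the updated test fixtures expect (e.g. "pass@k_k=1&n=2"). The helper name `parametrized_name` is hypothetical; the real logic lives in `Metric.__call__` in metric_utils.py.

    def parametrized_name(metric_name: str, sample_params: dict) -> str:
        # Build a "k=1&n=2"-style suffix from the sample parameters,
        # relying on dict insertion order (Python 3.7+).
        values = "&".join(f"{key}={value}" for key, value in sample_params.items())
        return f"{metric_name}_{values}"

    # Mirrors the updated fixtures, e.g. tests/unit/metrics/test_cases/pass_at_k.json
    assert parametrized_name("pass@k", {"k": 1, "n": 2}) == "pass@k_k=1&n=2"
    assert parametrized_name("recall", {"k": 2}) == "recall_k=2"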
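
Likewise, a sketch of the sampling semantics behind the renamed classes, reduced to plain functions under the assumption of exact-match scoring (the real classes route scoring through a configurable compute_score, taking predictions from ModelResponse.final_text and the gold from Doc.get_golds()).

    def avg_at_n(preds: list[str], gold: str, n: int) -> float:
        # AvgAtN: mean per-prediction score over the first n samples.
        return sum(float(p == gold) for p in preds[:n]) / n

    def maj_at_n(preds: list[str], gold: str, n: int) -> float:
        # MajAtN: score only the most frequent answer among the first n samples,
        # mirroring max(all_answers, key=all_answers.count) in MajAtN.compute.
        sampled = preds[:n]
        majority = max(sampled, key=sampled.count)
        return float(majority == gold)

    print(avg_at_n(["4", "5", "4"], "4", n=3))  # ~0.667
    print(maj_at_n(["4", "5", "4"], "4", n=3))  # 1.0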