diff --git a/langtest/transform/__init__.py b/langtest/transform/__init__.py index 309d92617..3a6400b79 100644 --- a/langtest/transform/__init__.py +++ b/langtest/transform/__init__.py @@ -42,6 +42,7 @@ Result, ) from ..utils.custom_types.helpers import default_user_prompt +from ..utils.util_metrics import calculate_f1_score nest_asyncio.apply() @@ -893,8 +894,12 @@ def run( y_pred = y_pred[valid_indices] y_true = y_true.explode() y_pred = y_pred.explode() - y_pred = y_pred.apply(lambda x: x.split("-")[-1]) - y_true = y_true.apply(lambda x: x.split("-")[-1]) + y_pred = y_pred.apply(lambda x: x.split("-")[-1]).reset_index( + drop=True + ) + y_true = y_true.apply(lambda x: x.split("-")[-1]).reset_index( + drop=True + ) elif isinstance(data[0], SequenceClassificationSample): y_true = pd.Series(data).apply( diff --git a/langtest/transform/fairness.py b/langtest/transform/fairness.py index f34de1e98..13df85acc 100644 --- a/langtest/transform/fairness.py +++ b/langtest/transform/fairness.py @@ -142,12 +142,11 @@ async def run( """ progress = kwargs.get("progress_bar", False) - for sample in sample_list: data = gendered_data[sample.test_case] if len(data[0]) > 0: macro_f1_score = calculate_f1_score( - [x[0] for x in data[0]], data[1], average="macro", zero_division=0 + data[0].to_list(), data[1].to_list(), average="macro", zero_division=0 ) else: macro_f1_score = 1 @@ -232,7 +231,7 @@ async def run( data = gendered_data[sample.test_case] if len(data[0]) > 0: macro_f1_score = calculate_f1_score( - [x[0] for x in data[0]], data[1], average="macro", zero_division=0 + data[0].to_list(), data[1].to_list(), average="macro", zero_division=0 ) else: macro_f1_score = 1