From e49ec6a3c9f3d6207558520c38571e2ee78ed0a3 Mon Sep 17 00:00:00 2001 From: Ali Tarik Date: Thu, 23 Mar 2023 10:42:10 +0300 Subject: [PATCH 1/8] add gender representation --- nlptest/transform/representation.py | 82 +++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 4 deletions(-) diff --git a/nlptest/transform/representation.py b/nlptest/transform/representation.py index 9ca873c47..38b824acc 100644 --- a/nlptest/transform/representation.py +++ b/nlptest/transform/representation.py @@ -1,9 +1,8 @@ - - from abc import ABC, abstractmethod from typing import List import pandas as pd from nlptest.utils.custom_types import Sample, MinScoreOutput +from nlptest.utils.gender_classifier import GenderClassifier from .utils import default_label_representation ,default_ehtnicity_representation,default_economic_country_representation, default_religion_representation, get_label_representation_dict, get_country_economic_representation_dict, get_religion_name_representation_dict, get_ethnicity_representation_dict, get_entity_representation_proportions class BaseRepresentation(ABC): @@ -38,15 +37,90 @@ def transform(self): class GenderRepresentation(BaseRepresentation): + """ + Subclass of BaseRepresentation that implements the gender representation test. + + Attributes: + alias_name (List[str]): The list of test names that identify the representation measure. + """ alias_name = [ "min_gender_representation_count", "min_gender_representation_proportion" ] - def transform(data: List[Sample]): - return super().transform() + def transform(test, data, params): + """ + Args: + test (str): name of the test + data (List[Sample]): The input data to be evaluated for representation test. + params : parameters specified in config. + + Raises: + ValueError: If sum of specified proportions in config is greater than 1 + + Returns: + List[Sample]: Ethnicity Representation test results. + """ + classifier = GenderClassifier() + genders = [classifier.predict(sample.original) for sample in data] + + gender_counts = { + "male": len([x for x in genders if x == "male"]), + "female": len([x for x in genders if x == "female"]), + "unknown": len([x for x in genders if x == "unknown"]) + } + + samples = [] + if test == "min_gender_representation_count": + if isinstance(params["min_count"], dict): + min_counts = params["min_count"] + else: + min_counts = { + "male": params["min_count"], + "female": params["min_count"], + "unknown": params["min_count"] + } + for k, v in min_counts.items(): + sample = Sample( + original = "-", + category = "representation", + test_type = "min_gender_representation_count", + test_case = k, + expected_results = MinScoreOutput(score=v) , + actual_results = MinScoreOutput(score=gender_counts[k]), + state = "done" + ) + samples.append(sample) + elif test == "min_gender_representation_proportion": + min_proportions = { + "male": 0.26, + "female": 0.26, + "unknown": 0.26 + } + + if isinstance(params["min_proportion"], dict): + min_proportions = params["min_proportion"] + if sum(min_proportions.values()) > 1: + raise ValueError("Sum of proportions cannot be greater than 1. So min_gender_representation_proportion test cannot run.") + + total_samples = len(data) + for k, v in min_proportions.items(): + sample = Sample( + original = "-", + category = "representation", + test_type = "min_gender_representation_proportion", + test_case = k, + expected_results = MinScoreOutput(score=v) , + actual_results = MinScoreOutput(score=gender_counts[k]/total_samples), + state = "done" + ) + samples.append(sample) + return samples + + + class EthnicityRepresentation(BaseRepresentation): """ From 58e6b2d4084360415ad49c7d2beb3ee89c92826d Mon Sep 17 00:00:00 2001 From: Ali Tarik Date: Thu, 23 Mar 2023 13:38:22 +0300 Subject: [PATCH 2/8] fix jsl_text_classification bug --- nlptest/modelhandler/jsl_modelhandler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nlptest/modelhandler/jsl_modelhandler.py b/nlptest/modelhandler/jsl_modelhandler.py index 9672492e7..81319af6b 100644 --- a/nlptest/modelhandler/jsl_modelhandler.py +++ b/nlptest/modelhandler/jsl_modelhandler.py @@ -232,7 +232,8 @@ def __init__( # in order to overwrite configs, light pipeline should be reinitialized. self.model = LightPipeline(model) - def load_model(self, path) -> 'NLUPipeline': + @classmethod + def load_model(cls, path) -> 'NLUPipeline': """Load the NER model into the `model` attribute. Args: path (str): Path to pretrained local or NLP Models Hub SparkNLP model From 144b6451b913ace71b262bc113c5e408190f8bc0 Mon Sep 17 00:00:00 2001 From: Ali Tarik Date: Thu, 23 Mar 2023 13:38:31 +0300 Subject: [PATCH 3/8] fix for text classification --- nlptest/transform/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nlptest/transform/__init__.py b/nlptest/transform/__init__.py index a9a3f4ce4..645a70ea7 100644 --- a/nlptest/transform/__init__.py +++ b/nlptest/transform/__init__.py @@ -643,7 +643,11 @@ def transform(self): for test_name, params in self.tests.items(): data_handler_copy = [x.copy() for x in self._data_handler] - y_true = pd.Series(data_handler_copy).apply(lambda x: [y.entity for y in x.expected_results.predictions]) + try: + y_true = pd.Series(data_handler_copy).apply(lambda x: [y.entity for y in x.expected_results.predictions]) + except: + y_true = pd.Series(data_handler_copy).apply(lambda x: [y.label for y in x.expected_results.predictions]) + X_test = pd.Series(data_handler_copy).apply(lambda x: x.original) y_pred = X_test.apply(self._model_handler.predict_raw) From 4f1c4f4fd31855b415a8c695f01605497a41d795 Mon Sep 17 00:00:00 2001 From: Ali Tarik Date: Thu, 23 Mar 2023 13:39:08 +0300 Subject: [PATCH 4/8] remove warnings --- nlptest/transform/accuracy.py | 12 ++++++------ nlptest/transform/fairness.py | 15 +++++++++++---- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/nlptest/transform/accuracy.py b/nlptest/transform/accuracy.py index 0009c3b22..50ae63e11 100644 --- a/nlptest/transform/accuracy.py +++ b/nlptest/transform/accuracy.py @@ -76,7 +76,7 @@ def transform(y_true, y_pred, params): label:params["min_score"] for label in labels } - df_metrics = classification_report(y_true, y_pred, output_dict=True) + df_metrics = classification_report(y_true, y_pred, output_dict=True, zero_division=0) df_metrics.pop("accuracy") df_metrics.pop("macro avg") df_metrics.pop("weighted avg") @@ -131,7 +131,7 @@ def transform(y_true, y_pred, params): label:params["min_score"] for label in labels } - df_metrics = classification_report(y_true, y_pred, output_dict=True) + df_metrics = classification_report(y_true, y_pred, output_dict=True, zero_division=0) df_metrics.pop("accuracy") df_metrics.pop("macro avg") df_metrics.pop("weighted avg") @@ -187,7 +187,7 @@ def transform(y_true, y_pred, params): label:params["min_score"] for label in labels } - df_metrics = classification_report(y_true, y_pred, output_dict=True) + df_metrics = classification_report(y_true, y_pred, output_dict=True, zero_division=0) df_metrics.pop("accuracy") df_metrics.pop("macro avg") df_metrics.pop("weighted avg") @@ -235,7 +235,7 @@ def transform(y_true, y_pred, params): min_score = params["min_score"] - f1 = f1_score(y_true, y_pred, average="micro") + f1 = f1_score(y_true, y_pred, average="micro", zero_division=0) sample = Sample( original = "-", @@ -278,7 +278,7 @@ def transform(y_true, y_pred, params): """ min_score = params["min_score"] - f1 = f1_score(y_true, y_pred, average="macro") + f1 = f1_score(y_true, y_pred, average="macro", zero_division=0) sample = Sample( original = "-", @@ -320,7 +320,7 @@ def transform(y_true, y_pred, params): """ min_score = params["min_score"] - f1 = f1_score(y_true, y_pred, average="weighted") + f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0) sample = Sample( original = "-", diff --git a/nlptest/transform/fairness.py b/nlptest/transform/fairness.py index 1e18e139a..f76b84ca1 100644 --- a/nlptest/transform/fairness.py +++ b/nlptest/transform/fairness.py @@ -78,7 +78,11 @@ def transform(data: List[Sample], model, params): samples = [] for key, val in gendered_data.items(): - y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions]) + try: + y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions]) + except: + y_true = pd.Series(val).apply(lambda x: [y.label for y in x.expected_results.predictions]) + X_test = pd.Series(val).apply(lambda x: x.original) y_pred = X_test.apply(model.predict_raw) @@ -92,7 +96,7 @@ def transform(data: List[Sample], model, params): y_true = y_true.explode().apply(lambda x: x.split("-")[-1]) y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1]) - macro_f1_score = f1_score(y_true, y_pred, average="macro") + macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0) if np.isnan(macro_f1_score): macro_f1_score = 1 @@ -149,7 +153,10 @@ def transform(data: List[Sample], model, params): samples = [] for key, val in gendered_data.items(): - y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions]) + try: + y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions]) + except: + y_true = pd.Series(val).apply(lambda x: [y.label for y in x.expected_results.predictions]) X_test = pd.Series(val).apply(lambda x: x.original) y_pred = X_test.apply(model.predict_raw) @@ -163,7 +170,7 @@ def transform(data: List[Sample], model, params): y_true = y_true.explode().apply(lambda x: x.split("-")[-1]) y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1]) - macro_f1_score = f1_score(y_true, y_pred, average="macro") + macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0) if np.isnan(macro_f1_score): macro_f1_score = 0 From 79fdee68dfe5263722b21a3a56be405620b89532 Mon Sep 17 00:00:00 2001 From: Ali Tarik Date: Thu, 23 Mar 2023 15:53:26 +0300 Subject: [PATCH 5/8] fix max score bug --- nlptest/transform/accuracy.py | 24 ++++++++--------- nlptest/transform/fairness.py | 10 ++++---- nlptest/transform/representation.py | 40 ++++++++++++++--------------- nlptest/utils/custom_types.py | 25 ++++++++---------- 4 files changed, 48 insertions(+), 51 deletions(-) diff --git a/nlptest/transform/accuracy.py b/nlptest/transform/accuracy.py index 50ae63e11..23374624d 100644 --- a/nlptest/transform/accuracy.py +++ b/nlptest/transform/accuracy.py @@ -88,8 +88,8 @@ def transform(y_true, y_pred, params): category = "Accuracy", test_type = "min_precision_score", test_case = k, - expected_results = MinScoreOutput(score=min_scores[k]), - actual_results = MinScoreOutput(score=v["precision"]), + expected_results = MinScoreOutput(min_score=min_scores[k]), + actual_results = MinScoreOutput(min_score=v["precision"]), state = "done" ) precision_samples.append(sample) @@ -143,8 +143,8 @@ def transform(y_true, y_pred, params): category = "Accuracy", test_type = "min_recall_score", test_case = k, - expected_results = MinScoreOutput(score=min_scores[k]), - actual_results = MinScoreOutput(score=v["recall"]), + expected_results = MinScoreOutput(min_score=min_scores[k]), + actual_results = MinScoreOutput(min_score=v["recall"]), state = "done" ) rec_samples.append(sample) @@ -199,8 +199,8 @@ def transform(y_true, y_pred, params): category = "Accuracy", test_type = "min_f1_score", test_case = k, - expected_results = MinScoreOutput(score=min_scores[k]), - actual_results = MinScoreOutput(score=v["f1-score"]), + expected_results = MinScoreOutput(min_score=min_scores[k]), + actual_results = MinScoreOutput(min_score=v["f1-score"]), state = "done" ) f1_samples.append(sample) @@ -242,8 +242,8 @@ def transform(y_true, y_pred, params): category = "Accuracy", test_type = "min_micro_f1_score", test_case = "micro", - expected_results = MinScoreOutput(score=min_score), - actual_results = MinScoreOutput(score=f1), + expected_results = MinScoreOutput(min_score=min_score), + actual_results = MinScoreOutput(min_score=f1), state = "done" ) @@ -285,8 +285,8 @@ def transform(y_true, y_pred, params): category = "Accuracy", test_type = "min__macro_f1_score", test_case = "macro", - expected_results = MinScoreOutput(score=min_score), - actual_results = MinScoreOutput(score=f1), + expected_results = MinScoreOutput(min_score=min_score), + actual_results = MinScoreOutput(min_score=f1), state = "done" ) @@ -327,8 +327,8 @@ def transform(y_true, y_pred, params): category = "Accuracy", test_type = "min_weighted_f1_score", test_case = "weighted", - expected_results = MinScoreOutput(score=min_score), - actual_results = MinScoreOutput(score=f1), + expected_results = MinScoreOutput(min_score=min_score), + actual_results = MinScoreOutput(min_score=f1), state = "done" ) diff --git a/nlptest/transform/fairness.py b/nlptest/transform/fairness.py index f76b84ca1..e7483c86e 100644 --- a/nlptest/transform/fairness.py +++ b/nlptest/transform/fairness.py @@ -105,8 +105,8 @@ def transform(data: List[Sample], model, params): category = "fairness", test_type = "min_gender_f1_score", test_case = key, - expected_results = MinScoreOutput(score=min_scores[key]), - actual_results = MinScoreOutput(score=macro_f1_score), + expected_results = MinScoreOutput(min_score=min_scores[key]), + actual_results = MinScoreOutput(min_score=macro_f1_score), state = "done" ) @@ -178,10 +178,10 @@ def transform(data: List[Sample], model, params): sample = Sample( original = "-", category = "fairness", - test_type = "min_gender_f1_score", + test_type = "max_gender_f1_score", test_case = key, - expected_results = MaxScoreOutput(score=max_scores[key]), - actual_results = MaxScoreOutput(score=macro_f1_score), + expected_results = MaxScoreOutput(max_score=max_scores[key]), + actual_results = MaxScoreOutput(max_score=macro_f1_score), state = "done" ) diff --git a/nlptest/transform/representation.py b/nlptest/transform/representation.py index 38b824acc..b1bf24a1a 100644 --- a/nlptest/transform/representation.py +++ b/nlptest/transform/representation.py @@ -88,8 +88,8 @@ def transform(test, data, params): category = "representation", test_type = "min_gender_representation_count", test_case = k, - expected_results = MinScoreOutput(score=v) , - actual_results = MinScoreOutput(score=gender_counts[k]), + expected_results = MinScoreOutput(min_score=v) , + actual_results = MinScoreOutput(min_score=gender_counts[k]), state = "done" ) samples.append(sample) @@ -112,8 +112,8 @@ def transform(test, data, params): category = "representation", test_type = "min_gender_representation_proportion", test_case = k, - expected_results = MinScoreOutput(score=v) , - actual_results = MinScoreOutput(score=gender_counts[k]/total_samples), + expected_results = MinScoreOutput(min_score=v) , + actual_results = MinScoreOutput(min_score=gender_counts[k]/total_samples), state = "done" ) samples.append(sample) @@ -175,8 +175,8 @@ def transform(test,data,params): category = "representation", test_type = "min_ethnicity_name_representation_count", test_case = key, - expected_results = MinScoreOutput(score=value) , - actual_results = MinScoreOutput(score=actual_representation[key]), + expected_results = MinScoreOutput(min_score=value) , + actual_results = MinScoreOutput(min_score=actual_representation[key]), state = "done" ) sample_list.append(sample) @@ -211,8 +211,8 @@ def transform(test,data,params): category = "representation", test_type = "min_ethnicity_name_representation_proportion", test_case = key, - expected_results = MinScoreOutput(score=value), - actual_results = MinScoreOutput(score=actual_representation[key]), + expected_results = MinScoreOutput(min_score=value), + actual_results = MinScoreOutput(min_score=actual_representation[key]), state = "done" ) sample_list.append(sample) @@ -272,8 +272,8 @@ def transform(test,data,params): category = "representation", test_type = "min_label_representation_count", test_case = key, - expected_results = MinScoreOutput(score=value) , - actual_results = MinScoreOutput(score=actual_representation[key]), + expected_results = MinScoreOutput(min_score=value) , + actual_results = MinScoreOutput(min_score=actual_representation[key]), state = "done" ) sample_list.append(sample) @@ -309,8 +309,8 @@ def transform(test,data,params): category = "representation", test_type = "min_label_representation_proportion", test_case = key, - expected_results = MinScoreOutput(score=value), - actual_results = MinScoreOutput(score=actual_representation[key]), + expected_results = MinScoreOutput(min_score=value), + actual_results = MinScoreOutput(min_score=actual_representation[key]), state = "done" ) sample_list.append(sample) @@ -371,8 +371,8 @@ def transform(test,data,params): category = "representation", test_type = "min_religion_name_representation_count", test_case = key, - expected_results = MinScoreOutput(score=value) , - actual_results = MinScoreOutput(score=actual_representation[key]), + expected_results = MinScoreOutput(min_score=value) , + actual_results = MinScoreOutput(min_score=actual_representation[key]), state = "done" ) sample_list.append(sample) @@ -406,8 +406,8 @@ def transform(test,data,params): category = "representation", test_type = "min_religion_name_representation_proportion", test_case = key, - expected_results = MinScoreOutput(score=value), - actual_results = MinScoreOutput(score=actual_representation[key]), + expected_results = MinScoreOutput(min_score=value), + actual_results = MinScoreOutput(min_score=actual_representation[key]), state = "done" ) sample_list.append(sample) @@ -465,8 +465,8 @@ def transform(test,data,params): category = "representation", test_type = "min_country_economic_representation_count", test_case = key, - expected_results = MinScoreOutput(score=value) , - actual_results = MinScoreOutput(score=actual_representation[key]), + expected_results = MinScoreOutput(min_score=value) , + actual_results = MinScoreOutput(min_score=actual_representation[key]), state = "done" ) sample_list.append(sample) @@ -500,8 +500,8 @@ def transform(test,data,params): category = "representation", test_type = "min_country_economic_representation_proportion", test_case = key, - expected_results = MinScoreOutput(score=value), - actual_results = MinScoreOutput(score=actual_representation[key]), + expected_results = MinScoreOutput(min_score=value), + actual_results = MinScoreOutput(min_score=actual_representation[key]), state = "done" ) sample_list.append(sample) diff --git a/nlptest/utils/custom_types.py b/nlptest/utils/custom_types.py index f1672787f..927467312 100644 --- a/nlptest/utils/custom_types.py +++ b/nlptest/utils/custom_types.py @@ -175,33 +175,30 @@ def __eq__(self, other): class MinScoreOutput(BaseModel): """Output for accuracy/representation tests.""" - score: float + min_score: float def to_str_list(self) -> float: - return self.score + return self.min_score def __repr__(self) -> str: - return f"{self.score}" + return f"{self.min_score}" def __str__(self) -> str: - return f"{self.score}" + return f"{self.min_score}" class MaxScoreOutput(BaseModel): """Output for accuracy/representation tests.""" - score: float + max_score: float def to_str_list(self) -> float: - return self.score + return self.max_score def __repr__(self) -> str: - return f"{self.score}" + return f"{self.max_score}" def __str__(self) -> str: - return f"{self.score}" + return f"{self.max_score}" -class AccuracyOutput(BaseModel): - """Output for accuracy tests.""" - score: float -Result = TypeVar("Result", NEROutput, SequenceClassificationOutput, MinScoreOutput) +Result = TypeVar("Result", NEROutput, SequenceClassificationOutput, MinScoreOutput, MaxScoreOutput) class Transformation(BaseModel): original_span: Span @@ -384,9 +381,9 @@ def is_pass(self) -> bool: expected_preds = [j.entity for j in self.expected_results.predictions] return actual_preds == expected_preds elif isinstance(self.actual_results, MinScoreOutput): - return self.actual_results.score >= self.expected_results.score + return self.actual_results.min_score >= self.expected_results.min_score elif isinstance(self.actual_results, MaxScoreOutput): - return self.actual_results.score <= self.expected_results.score + return self.actual_results.max_score <= self.expected_results.max_score else: filtered_actual_results = self.actual_results From 0b32c414e88084ee4158a6d5993737f49f380049 Mon Sep 17 00:00:00 2001 From: Ali Tarik Date: Thu, 23 Mar 2023 19:11:36 +0300 Subject: [PATCH 6/8] fix checkpoints path --- checkpoints.ckpt => nlptest/utils/checkpoints.ckpt | Bin nlptest/utils/gender_classifier.py | 6 +++++- 2 files changed, 5 insertions(+), 1 deletion(-) rename checkpoints.ckpt => nlptest/utils/checkpoints.ckpt (100%) diff --git a/checkpoints.ckpt b/nlptest/utils/checkpoints.ckpt similarity index 100% rename from checkpoints.ckpt rename to nlptest/utils/checkpoints.ckpt diff --git a/nlptest/utils/gender_classifier.py b/nlptest/utils/gender_classifier.py index a389a2816..9097af957 100644 --- a/nlptest/utils/gender_classifier.py +++ b/nlptest/utils/gender_classifier.py @@ -1,5 +1,6 @@ import torch import logging +import os from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer class GenderClassifier(): @@ -7,7 +8,10 @@ def __init__(self) -> None: logging.getLogger("transformers").setLevel(logging.ERROR) tokenizer = AutoTokenizer.from_pretrained("microsoft/xtremedistil-l6-h256-uncased") model = AutoModelForSequenceClassification.from_pretrained("microsoft/xtremedistil-l6-h256-uncased", num_labels=3) - ckpts = torch.load("checkpoints.ckpt") + + curr_dir = os.path.dirname(__file__) + ckpt_path = os.path.join(curr_dir, 'checkpoints.ckpt') + ckpts = torch.load(ckpt_path) model.load_state_dict(ckpts) self.pipe = pipeline("text-classification", model=model, tokenizer=tokenizer) From dbd3c0590a10238d16d38e4388b387464d249b82 Mon Sep 17 00:00:00 2001 From: Ali Tarik Date: Thu, 23 Mar 2023 19:39:04 +0300 Subject: [PATCH 7/8] include checkpoints in setup --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 4b101ac82..1e3c01018 100644 --- a/setup.py +++ b/setup.py @@ -157,9 +157,9 @@ # }, # If there are data files included in your packages that need to be # installed, specify them here. - # package_data={ # Optional - # "sample": ["package_data.dat"], - # }, + package_data={ # Optional + "nlptest": ["utils/checkpoints.ckpt"], + }, # Although 'package_data' is the preferred approach, in some case you may # need to place data files outside of your packages. See: # http://docs.python.org/distutils/setupscript.html#installing-additional-files From df3b4ad419cdcebcf9a0e2c8f6a0c9c1e7ee5be0 Mon Sep 17 00:00:00 2001 From: Ali Tarik Date: Thu, 23 Mar 2023 19:55:02 +0300 Subject: [PATCH 8/8] cleanup warnings --- nlptest/transform/fairness.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/nlptest/transform/fairness.py b/nlptest/transform/fairness.py index e7483c86e..02c73a2a5 100644 --- a/nlptest/transform/fairness.py +++ b/nlptest/transform/fairness.py @@ -78,12 +78,13 @@ def transform(data: List[Sample], model, params): samples = [] for key, val in gendered_data.items(): + val = pd.Series(val, dtype="object") try: - y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions]) + y_true = val.apply(lambda x: [y.entity for y in x.expected_results.predictions]) except: - y_true = pd.Series(val).apply(lambda x: [y.label for y in x.expected_results.predictions]) + y_true = val.apply(lambda x: [y.label for y in x.expected_results.predictions]) - X_test = pd.Series(val).apply(lambda x: x.original) + X_test = val.apply(lambda x: x.original) y_pred = X_test.apply(model.predict_raw) valid_indices = y_true.apply(len) == y_pred.apply(len) @@ -95,9 +96,10 @@ def transform(data: List[Sample], model, params): y_true = y_true.explode().apply(lambda x: x.split("-")[-1]) y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1]) - - macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0) - if np.isnan(macro_f1_score): + + if len(y_true)>0: + macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0) + else: macro_f1_score = 1 sample = Sample( @@ -153,11 +155,13 @@ def transform(data: List[Sample], model, params): samples = [] for key, val in gendered_data.items(): + val = pd.Series(val, dtype="object") + try: - y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions]) + y_true = val.apply(lambda x: [y.entity for y in x.expected_results.predictions]) except: - y_true = pd.Series(val).apply(lambda x: [y.label for y in x.expected_results.predictions]) - X_test = pd.Series(val).apply(lambda x: x.original) + y_true = val.apply(lambda x: [y.label for y in x.expected_results.predictions]) + X_test = val.apply(lambda x: x.original) y_pred = X_test.apply(model.predict_raw) valid_indices = y_true.apply(len) == y_pred.apply(len) @@ -170,9 +174,9 @@ def transform(data: List[Sample], model, params): y_true = y_true.explode().apply(lambda x: x.split("-")[-1]) y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1]) - macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0) - - if np.isnan(macro_f1_score): + if len(y_true)>0: + macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0) + else: macro_f1_score = 0 sample = Sample(