diff --git a/nlptest/modelhandler/jsl_modelhandler.py b/nlptest/modelhandler/jsl_modelhandler.py
index 9672492e7..81319af6b 100644
--- a/nlptest/modelhandler/jsl_modelhandler.py
+++ b/nlptest/modelhandler/jsl_modelhandler.py
@@ -232,7 +232,8 @@ def __init__(
         # in order to overwrite configs, light pipeline should be reinitialized.
         self.model = LightPipeline(model)
 
-    def load_model(self, path) -> 'NLUPipeline':
+    @classmethod
+    def load_model(cls, path) -> 'NLUPipeline':
         """Load the NER model into the `model` attribute.
 
         Args:
             path (str): Path to pretrained local or NLP Models Hub SparkNLP model
diff --git a/nlptest/transform/__init__.py b/nlptest/transform/__init__.py
index a9a3f4ce4..645a70ea7 100644
--- a/nlptest/transform/__init__.py
+++ b/nlptest/transform/__init__.py
@@ -643,7 +643,11 @@ def transform(self):
         for test_name, params in self.tests.items():
             data_handler_copy = [x.copy() for x in self._data_handler]
 
-            y_true = pd.Series(data_handler_copy).apply(lambda x: [y.entity for y in x.expected_results.predictions])
+            try:
+                y_true = pd.Series(data_handler_copy).apply(lambda x: [y.entity for y in x.expected_results.predictions])
+            except:
+                y_true = pd.Series(data_handler_copy).apply(lambda x: [y.label for y in x.expected_results.predictions])
+
             X_test = pd.Series(data_handler_copy).apply(lambda x: x.original)
             y_pred = X_test.apply(self._model_handler.predict_raw)
diff --git a/nlptest/transform/accuracy.py b/nlptest/transform/accuracy.py
index 46edd599e..cc1c4b1db 100644
--- a/nlptest/transform/accuracy.py
+++ b/nlptest/transform/accuracy.py
@@ -76,7 +76,7 @@ def transform(y_true, y_pred, params):
                 label:params["min_score"] for label in labels
             }
 
-        df_metrics = classification_report(y_true, y_pred, output_dict=True)
+        df_metrics = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
         df_metrics.pop("accuracy")
         df_metrics.pop("macro avg")
         df_metrics.pop("weighted avg")
@@ -90,8 +90,8 @@ def transform(y_true, y_pred, params):
                 category = "Accuracy",
                 test_type = "min_precision_score",
                 test_case = k,
-                expected_results = MinScoreOutput(score=min_scores[k]),
-                actual_results = MinScoreOutput(score=v["precision"]),
+                expected_results = MinScoreOutput(min_score=min_scores[k]),
+                actual_results = MinScoreOutput(min_score=v["precision"]),
                 state = "done"
             )
             precision_samples.append(sample)
@@ -133,7 +133,7 @@ def transform(y_true, y_pred, params):
                 label:params["min_score"] for label in labels
             }
 
-        df_metrics = classification_report(y_true, y_pred, output_dict=True)
+        df_metrics = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
         df_metrics.pop("accuracy")
         df_metrics.pop("macro avg")
         df_metrics.pop("weighted avg")
@@ -147,8 +147,8 @@ def transform(y_true, y_pred, params):
                 category = "Accuracy",
                 test_type = "min_recall_score",
                 test_case = k,
-                expected_results = MinScoreOutput(score=min_scores[k]),
-                actual_results = MinScoreOutput(score=v["recall"]),
+                expected_results = MinScoreOutput(min_score=min_scores[k]),
+                actual_results = MinScoreOutput(min_score=v["recall"]),
                 state = "done"
             )
             rec_samples.append(sample)
@@ -191,7 +191,7 @@ def transform(y_true, y_pred, params):
                 label:params["min_score"] for label in labels
             }
 
-        df_metrics = classification_report(y_true, y_pred, output_dict=True)
+        df_metrics = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
         df_metrics.pop("accuracy")
         df_metrics.pop("macro avg")
         df_metrics.pop("weighted avg")
@@ -205,8 +205,8 @@ def transform(y_true, y_pred, params):
                 category = "Accuracy",
                 test_type = "min_f1_score",
                 test_case = k,
-                expected_results = MinScoreOutput(score=min_scores[k]),
-                actual_results = MinScoreOutput(score=v["f1-score"]),
+                expected_results = MinScoreOutput(min_score=min_scores[k]),
+                actual_results = MinScoreOutput(min_score=v["f1-score"]),
                 state = "done"
             )
             f1_samples.append(sample)
@@ -241,15 +241,15 @@ def transform(y_true, y_pred, params):
         min_score = params["min_score"]
 
-        f1 = f1_score(y_true, y_pred, average="micro")
+        f1 = f1_score(y_true, y_pred, average="micro", zero_division=0)
 
         sample = Sample(
             original = "-",
             category = "Accuracy",
             test_type = "min_micro_f1_score",
             test_case = "micro",
-            expected_results = MinScoreOutput(score=min_score),
-            actual_results = MinScoreOutput(score=f1),
+            expected_results = MinScoreOutput(min_score=min_score),
+            actual_results = MinScoreOutput(min_score=f1),
             state = "done"
         )
@@ -284,15 +284,15 @@ def transform(y_true, y_pred, params):
         """
         min_score = params["min_score"]
 
-        f1 = f1_score(y_true, y_pred, average="macro")
+        f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
 
         sample = Sample(
             original = "-",
             category = "Accuracy",
             test_type = "min__macro_f1_score",
             test_case = "macro",
-            expected_results = MinScoreOutput(score=min_score),
-            actual_results = MinScoreOutput(score=f1),
+            expected_results = MinScoreOutput(min_score=min_score),
+            actual_results = MinScoreOutput(min_score=f1),
             state = "done"
         )
@@ -326,15 +326,15 @@ def transform(y_true, y_pred, params):
         """
         min_score = params["min_score"]
 
-        f1 = f1_score(y_true, y_pred, average="weighted")
+        f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)
 
         sample = Sample(
             original = "-",
             category = "Accuracy",
             test_type = "min_weighted_f1_score",
             test_case = "weighted",
-            expected_results = MinScoreOutput(score=min_score),
-            actual_results = MinScoreOutput(score=f1),
+            expected_results = MinScoreOutput(min_score=min_score),
+            actual_results = MinScoreOutput(min_score=f1),
             state = "done"
         )
diff --git a/nlptest/transform/fairness.py b/nlptest/transform/fairness.py
index 3f28fb8c4..e229592c2 100644
--- a/nlptest/transform/fairness.py
+++ b/nlptest/transform/fairness.py
@@ -80,8 +80,13 @@ def transform(data: List[Sample], model, params):
         for key, val in gendered_data.items():
             if key not in min_scores.keys():
                 continue
-            y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions])
-            X_test = pd.Series(val).apply(lambda x: x.original)
+            val = pd.Series(val, dtype="object")
+            try:
+                y_true = val.apply(lambda x: [y.entity for y in x.expected_results.predictions])
+            except:
+                y_true = val.apply(lambda x: [y.label for y in x.expected_results.predictions])
+            X_test = val.apply(lambda x: x.original)
+
             y_pred = X_test.apply(model.predict_raw)
 
             valid_indices = y_true.apply(len) == y_pred.apply(len)
@@ -93,9 +98,10 @@ def transform(data: List[Sample], model, params):
 
             y_true = y_true.explode().apply(lambda x: x.split("-")[-1])
             y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1])
-
-            macro_f1_score = f1_score(y_true, y_pred, average="macro")
-            if np.isnan(macro_f1_score):
+
+            if len(y_true)>0:
+                macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0)
+            else:
                 macro_f1_score = 1
 
             sample = Sample(
@@ -103,8 +109,8 @@ def transform(data: List[Sample], model, params):
                 category = "fairness",
                 test_type = "min_gender_f1_score",
                 test_case = key,
-                expected_results = MinScoreOutput(score=min_scores[key]),
-                actual_results = MinScoreOutput(score=macro_f1_score),
+                expected_results = MinScoreOutput(min_score=min_scores[key]),
+                actual_results = MinScoreOutput(min_score=macro_f1_score),
                 state = "done"
             )
@@ -152,6 +158,13 @@ def transform(data: List[Sample], model, params):
         for key, val in gendered_data.items():
             if key not in max_scores.keys():
                 continue
+            val = pd.Series(val, dtype="object")
+
+            try:
+                y_true = val.apply(lambda x: [y.entity for y in x.expected_results.predictions])
+            except:
+                y_true = val.apply(lambda x: [y.label for y in x.expected_results.predictions])
+            X_test = val.apply(lambda x: x.original)
             y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions])
             X_test = pd.Series(val).apply(lambda x: x.original)
             y_pred = X_test.apply(model.predict_raw)
@@ -166,9 +179,9 @@ def transform(data: List[Sample], model, params):
             y_true = y_true.explode().apply(lambda x: x.split("-")[-1])
             y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1])
 
-            macro_f1_score = f1_score(y_true, y_pred, average="macro")
-
-            if np.isnan(macro_f1_score):
+            if len(y_true)>0:
+                macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0)
+            else:
                 macro_f1_score = 0
 
             sample = Sample(
@@ -176,8 +189,8 @@ def transform(data: List[Sample], model, params):
                 category = "fairness",
                 test_type = "max_gender_f1_score",
                 test_case = key,
-                expected_results = MaxScoreOutput(score=max_scores[key]),
-                actual_results = MaxScoreOutput(score=macro_f1_score),
+                expected_results = MaxScoreOutput(max_score=max_scores[key]),
+                actual_results = MaxScoreOutput(max_score=macro_f1_score),
                 state = "done"
             )
diff --git a/nlptest/transform/representation.py b/nlptest/transform/representation.py
index 9ca873c47..b1bf24a1a 100644
--- a/nlptest/transform/representation.py
+++ b/nlptest/transform/representation.py
@@ -1,9 +1,8 @@
-
-
 from abc import ABC, abstractmethod
 from typing import List
 import pandas as pd
 from nlptest.utils.custom_types import Sample, MinScoreOutput
+from nlptest.utils.gender_classifier import GenderClassifier
 from .utils import default_label_representation ,default_ehtnicity_representation,default_economic_country_representation, default_religion_representation, get_label_representation_dict, get_country_economic_representation_dict, get_religion_name_representation_dict, get_ethnicity_representation_dict, get_entity_representation_proportions
 
 class BaseRepresentation(ABC):
@@ -38,15 +37,90 @@ def transform(self):
 
 
 class GenderRepresentation(BaseRepresentation):
+    """
+    Subclass of BaseRepresentation that implements the gender representation test.
+
+    Attributes:
+        alias_name (List[str]): The list of test names that identify the representation measure.
+    """
 
     alias_name = [
         "min_gender_representation_count",
         "min_gender_representation_proportion"
     ]
 
-    def transform(data: List[Sample]):
-        return super().transform()
+    def transform(test, data, params):
+        """
+        Args:
+            test (str): name of the test
+            data (List[Sample]): The input data to be evaluated for representation test.
+            params : parameters specified in config.
+
+        Raises:
+            ValueError: If sum of specified proportions in config is greater than 1
+
+        Returns:
+            List[Sample]: Ethnicity Representation test results.
+        """
+        classifier = GenderClassifier()
+        genders = [classifier.predict(sample.original) for sample in data]
+
+        gender_counts = {
+            "male": len([x for x in genders if x == "male"]),
+            "female": len([x for x in genders if x == "female"]),
+            "unknown": len([x for x in genders if x == "unknown"])
+        }
+
+        samples = []
+        if test == "min_gender_representation_count":
+            if isinstance(params["min_count"], dict):
+                min_counts = params["min_count"]
+            else:
+                min_counts = {
+                    "male": params["min_count"],
+                    "female": params["min_count"],
+                    "unknown": params["min_count"]
+                }
+            for k, v in min_counts.items():
+                sample = Sample(
+                    original = "-",
+                    category = "representation",
+                    test_type = "min_gender_representation_count",
+                    test_case = k,
+                    expected_results = MinScoreOutput(min_score=v) ,
+                    actual_results = MinScoreOutput(min_score=gender_counts[k]),
+                    state = "done"
+                )
+                samples.append(sample)
+        elif test == "min_gender_representation_proportion":
+            min_proportions = {
+                "male": 0.26,
+                "female": 0.26,
+                "unknown": 0.26
+            }
+
+            if isinstance(params["min_proportion"], dict):
+                min_proportions = params["min_proportion"]
+                if sum(min_proportions.values()) > 1:
+                    raise ValueError("Sum of proportions cannot be greater than 1. So min_gender_representation_proportion test cannot run.")
+
+            total_samples = len(data)
+            for k, v in min_proportions.items():
+                sample = Sample(
+                    original = "-",
+                    category = "representation",
+                    test_type = "min_gender_representation_proportion",
+                    test_case = k,
+                    expected_results = MinScoreOutput(min_score=v) ,
+                    actual_results = MinScoreOutput(min_score=gender_counts[k]/total_samples),
+                    state = "done"
+                )
+                samples.append(sample)
+        return samples
+
+
+
 
 class EthnicityRepresentation(BaseRepresentation):
     """
@@ -101,8 +175,8 @@ def transform(test,data,params):
                 category = "representation",
                 test_type = "min_ethnicity_name_representation_count",
                 test_case = key,
-                expected_results = MinScoreOutput(score=value) ,
-                actual_results = MinScoreOutput(score=actual_representation[key]),
+                expected_results = MinScoreOutput(min_score=value) ,
+                actual_results = MinScoreOutput(min_score=actual_representation[key]),
                 state = "done"
             )
             sample_list.append(sample)
@@ -137,8 +211,8 @@ def transform(test,data,params):
                 category = "representation",
                 test_type = "min_ethnicity_name_representation_proportion",
                 test_case = key,
-                expected_results = MinScoreOutput(score=value),
-                actual_results = MinScoreOutput(score=actual_representation[key]),
+                expected_results = MinScoreOutput(min_score=value),
+                actual_results = MinScoreOutput(min_score=actual_representation[key]),
                 state = "done"
             )
             sample_list.append(sample)
@@ -198,8 +272,8 @@ def transform(test,data,params):
                 category = "representation",
                 test_type = "min_label_representation_count",
                 test_case = key,
-                expected_results = MinScoreOutput(score=value) ,
-                actual_results = MinScoreOutput(score=actual_representation[key]),
+                expected_results = MinScoreOutput(min_score=value) ,
+                actual_results = MinScoreOutput(min_score=actual_representation[key]),
                 state = "done"
             )
             sample_list.append(sample)
@@ -235,8 +309,8 @@ def transform(test,data,params):
                 category = "representation",
                 test_type = "min_label_representation_proportion",
                 test_case = key,
-                expected_results = MinScoreOutput(score=value),
-                actual_results = MinScoreOutput(score=actual_representation[key]),
+                expected_results = MinScoreOutput(min_score=value),
+                actual_results = MinScoreOutput(min_score=actual_representation[key]),
                 state = "done"
            )
             sample_list.append(sample)
@@ -297,8 +371,8 @@ def transform(test,data,params):
                 category = "representation",
                 test_type = "min_religion_name_representation_count",
                 test_case = key,
-                expected_results = MinScoreOutput(score=value) ,
-                actual_results = MinScoreOutput(score=actual_representation[key]),
+                expected_results = MinScoreOutput(min_score=value) ,
+                actual_results = MinScoreOutput(min_score=actual_representation[key]),
                 state = "done"
             )
             sample_list.append(sample)
@@ -332,8 +406,8 @@ def transform(test,data,params):
                 category = "representation",
                 test_type = "min_religion_name_representation_proportion",
                 test_case = key,
-                expected_results = MinScoreOutput(score=value),
-                actual_results = MinScoreOutput(score=actual_representation[key]),
+                expected_results = MinScoreOutput(min_score=value),
+                actual_results = MinScoreOutput(min_score=actual_representation[key]),
                 state = "done"
             )
             sample_list.append(sample)
@@ -391,8 +465,8 @@ def transform(test,data,params):
                 category = "representation",
                 test_type = "min_country_economic_representation_count",
                 test_case = key,
-                expected_results = MinScoreOutput(score=value) ,
-                actual_results = MinScoreOutput(score=actual_representation[key]),
+                expected_results = MinScoreOutput(min_score=value) ,
+                actual_results = MinScoreOutput(min_score=actual_representation[key]),
                 state = "done"
            )
             sample_list.append(sample)
@@ -426,8 +500,8 @@ def transform(test,data,params):
                 category = "representation",
                 test_type = "min_country_economic_representation_proportion",
                 test_case = key,
-                expected_results = MinScoreOutput(score=value),
-                actual_results = MinScoreOutput(score=actual_representation[key]),
+                expected_results = MinScoreOutput(min_score=value),
+                actual_results = MinScoreOutput(min_score=actual_representation[key]),
                 state = "done"
            )
             sample_list.append(sample)
diff --git a/checkpoints.ckpt b/nlptest/utils/checkpoints.ckpt
similarity index 100%
rename from checkpoints.ckpt
rename to nlptest/utils/checkpoints.ckpt
diff --git a/nlptest/utils/custom_types.py b/nlptest/utils/custom_types.py
index f1672787f..927467312 100644
--- a/nlptest/utils/custom_types.py
+++ b/nlptest/utils/custom_types.py
@@ -175,33 +175,30 @@ def __eq__(self, other):
 
 class MinScoreOutput(BaseModel):
     """Output for accuracy/representation tests."""
-    score: float
+    min_score: float
 
     def to_str_list(self) -> float:
-        return self.score
+        return self.min_score
 
     def __repr__(self) -> str:
-        return f"{self.score}"
+        return f"{self.min_score}"
 
     def __str__(self) -> str:
-        return f"{self.score}"
+        return f"{self.min_score}"
 
 
 class MaxScoreOutput(BaseModel):
     """Output for accuracy/representation tests."""
-    score: float
+    max_score: float
 
     def to_str_list(self) -> float:
-        return self.score
+        return self.max_score
 
     def __repr__(self) -> str:
-        return f"{self.score}"
+        return f"{self.max_score}"
 
     def __str__(self) -> str:
-        return f"{self.score}"
+        return f"{self.max_score}"
 
 
-class AccuracyOutput(BaseModel):
-    """Output for accuracy tests."""
-    score: float
 
-Result = TypeVar("Result", NEROutput, SequenceClassificationOutput, MinScoreOutput)
+Result = TypeVar("Result", NEROutput, SequenceClassificationOutput, MinScoreOutput, MaxScoreOutput)
 
 class Transformation(BaseModel):
     original_span: Span
@@ -384,9 +381,9 @@ def is_pass(self) -> bool:
             expected_preds = [j.entity for j in self.expected_results.predictions]
             return actual_preds == expected_preds
         elif isinstance(self.actual_results, MinScoreOutput):
-            return self.actual_results.score >= self.expected_results.score
+            return self.actual_results.min_score >= self.expected_results.min_score
         elif isinstance(self.actual_results, MaxScoreOutput):
-            return self.actual_results.score <= self.expected_results.score
+            return self.actual_results.max_score <= self.expected_results.max_score
         else:
             filtered_actual_results = self.actual_results
diff --git a/nlptest/utils/gender_classifier.py b/nlptest/utils/gender_classifier.py
index a389a2816..9097af957 100644
--- a/nlptest/utils/gender_classifier.py
+++ b/nlptest/utils/gender_classifier.py
@@ -1,5 +1,6 @@
 import torch
 import logging
+import os
 from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
 
 class GenderClassifier():
@@ -7,7 +8,10 @@ def __init__(self) -> None:
         logging.getLogger("transformers").setLevel(logging.ERROR)
         tokenizer = AutoTokenizer.from_pretrained("microsoft/xtremedistil-l6-h256-uncased")
         model = AutoModelForSequenceClassification.from_pretrained("microsoft/xtremedistil-l6-h256-uncased", num_labels=3)
-        ckpts = torch.load("checkpoints.ckpt")
+
+        curr_dir = os.path.dirname(__file__)
+        ckpt_path = os.path.join(curr_dir, 'checkpoints.ckpt')
+        ckpts = torch.load(ckpt_path)
         model.load_state_dict(ckpts)
 
         self.pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
diff --git a/setup.py b/setup.py
index 4b101ac82..1e3c01018 100644
--- a/setup.py
+++ b/setup.py
@@ -157,9 +157,9 @@
     # },
     # If there are data files included in your packages that need to be
    # installed, specify them here.
-    # package_data={ # Optional
-    #     "sample": ["package_data.dat"],
-    # },
+    package_data={ # Optional
+        "nlptest": ["utils/checkpoints.ckpt"],
+    },
     # Although 'package_data' is the preferred approach, in some case you may
     # need to place data files outside of your packages. See:
     # http://docs.python.org/distutils/setupscript.html#installing-additional-files
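
Several hunks above pass zero_division=0 to scikit-learn's classification_report and f1_score. The following standalone sketch is not part of the patch (the labels and values are invented for illustration, and it assumes scikit-learn >= 0.22, where the zero_division parameter was introduced). It shows what the flag changes: when a label from y_true is never predicted, its precision is 0/0, and the default zero_division="warn" fills in 0.0 while emitting an UndefinedMetricWarning; zero_division=0 keeps the 0.0 but stays silent, which suits metrics computed inside an automated test loop.

    # Standalone illustration; label names and values are made up for this example.
    from sklearn.metrics import classification_report, f1_score

    y_true = ["PER", "ORG", "PER", "LOC"]
    y_pred = ["PER", "PER", "PER", "PER"]  # "ORG" and "LOC" are never predicted

    # Undefined per-label precision is pinned to 0 instead of triggering a warning.
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)

    print(report["ORG"]["precision"], macro_f1)  # 0.0 and the macro-averaged F1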