From e49ec6a3c9f3d6207558520c38571e2ee78ed0a3 Mon Sep 17 00:00:00 2001
From: Ali Tarik <alimirik55@gmail.com>
Date: Thu, 23 Mar 2023 10:42:10 +0300
Subject: [PATCH 1/8] add gender representation

---
 nlptest/transform/representation.py | 82 +++++++++++++++++++++++++++--
 1 file changed, 78 insertions(+), 4 deletions(-)

diff --git a/nlptest/transform/representation.py b/nlptest/transform/representation.py
index 9ca873c47..38b824acc 100644
--- a/nlptest/transform/representation.py
+++ b/nlptest/transform/representation.py
@@ -1,9 +1,8 @@
-
-
 from abc import ABC, abstractmethod
 from typing import List
 import pandas as pd
 from nlptest.utils.custom_types import Sample, MinScoreOutput
+from nlptest.utils.gender_classifier import GenderClassifier
 from .utils import default_label_representation ,default_ehtnicity_representation,default_economic_country_representation,  default_religion_representation, get_label_representation_dict, get_country_economic_representation_dict, get_religion_name_representation_dict, get_ethnicity_representation_dict, get_entity_representation_proportions
 
 class BaseRepresentation(ABC):
@@ -38,15 +37,90 @@ def transform(self):
 
 
 class GenderRepresentation(BaseRepresentation):
+    """
+    Subclass of BaseRepresentation that implements the gender representation test.
+
+    Attributes:
+        alias_name (List[str]): The list of test names that identify the representation measure.
 
+    """
     alias_name = [
         "min_gender_representation_count",
         "min_gender_representation_proportion"
     ]
     
-    def transform(data: List[Sample]):
-        return super().transform()
+    def transform(test, data, params):
+        """
+        Args:
+            test (str): name of the test
+            data (List[Sample]): The input data to be evaluated for representation test.
+            params (dict): Test parameters from the config; "min_count" or "min_proportion" is read depending on the test.
+
+        Raises:
+            ValueError: If sum of specified proportions in config is greater than 1
+    
+        Returns:
+            List[Sample]: Gender Representation test results.
+        """    
+        classifier = GenderClassifier()
+        genders = [classifier.predict(sample.original) for sample in data]
+
+        gender_counts = {
+            "male": len([x for x in genders if x == "male"]),
+            "female": len([x for x in genders if x == "female"]),
+            "unknown": len([x for x in genders if x == "unknown"])
+        }
+
+        samples = []
+        if test == "min_gender_representation_count":
+            if isinstance(params["min_count"], dict):
+                min_counts = params["min_count"]
+            else:
+                min_counts = {
+                    "male": params["min_count"],
+                    "female": params["min_count"],
+                    "unknown": params["min_count"]
+                }
 
+            for k, v in min_counts.items():
+                sample = Sample(
+                    original = "-",
+                    category = "representation",
+                    test_type = "min_gender_representation_count",
+                    test_case = k,
+                    expected_results = MinScoreOutput(score=v) ,
+                    actual_results = MinScoreOutput(score=gender_counts[k]),
+                    state = "done"
+                )
+                samples.append(sample)
+        elif test == "min_gender_representation_proportion":
+            min_proportions = {
+                "male": 0.26,
+                "female": 0.26,
+                "unknown": 0.26
+            }
+
+            if isinstance(params["min_proportion"], dict):
+                min_proportions = params["min_proportion"]
+                if sum(min_proportions.values()) > 1:
+                    raise ValueError("Sum of proportions cannot be greater than 1. So min_gender_representation_proportion test cannot run.")
+
+            total_samples = len(data)
+            for k, v in min_proportions.items():
+                sample = Sample(
+                    original = "-",
+                    category = "representation",
+                    test_type = "min_gender_representation_proportion",
+                    test_case = k,
+                    expected_results = MinScoreOutput(score=v) ,
+                    actual_results = MinScoreOutput(score=gender_counts[k]/total_samples),
+                    state = "done"
+                )
+                samples.append(sample)
+        return samples
+            
+        
+        
 class EthnicityRepresentation(BaseRepresentation):
     
     """

From 58e6b2d4084360415ad49c7d2beb3ee89c92826d Mon Sep 17 00:00:00 2001
From: Ali Tarik <alimirik55@gmail.com>
Date: Thu, 23 Mar 2023 13:38:22 +0300
Subject: [PATCH 2/8] fix jsl_text_classification bug

---
 nlptest/modelhandler/jsl_modelhandler.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nlptest/modelhandler/jsl_modelhandler.py b/nlptest/modelhandler/jsl_modelhandler.py
index 9672492e7..81319af6b 100644
--- a/nlptest/modelhandler/jsl_modelhandler.py
+++ b/nlptest/modelhandler/jsl_modelhandler.py
@@ -232,7 +232,8 @@ def __init__(
         #   in order to overwrite configs, light pipeline should be reinitialized.
         self.model = LightPipeline(model)
 
-    def load_model(self, path) -> 'NLUPipeline':
+    @classmethod
+    def load_model(cls, path) -> 'NLUPipeline':
         """Load the NER model into the `model` attribute.
         Args:
             path (str): Path to pretrained local or NLP Models Hub SparkNLP model

From 144b6451b913ace71b262bc113c5e408190f8bc0 Mon Sep 17 00:00:00 2001
From: Ali Tarik <alimirik55@gmail.com>
Date: Thu, 23 Mar 2023 13:38:31 +0300
Subject: [PATCH 3/8] fix for text classification

---
 nlptest/transform/__init__.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/nlptest/transform/__init__.py b/nlptest/transform/__init__.py
index a9a3f4ce4..645a70ea7 100644
--- a/nlptest/transform/__init__.py
+++ b/nlptest/transform/__init__.py
@@ -643,7 +643,11 @@ def transform(self):
         for test_name, params in self.tests.items():            
             data_handler_copy = [x.copy() for x in self._data_handler]
 
-            y_true = pd.Series(data_handler_copy).apply(lambda x: [y.entity for y in x.expected_results.predictions])
+            try:
+                y_true = pd.Series(data_handler_copy).apply(lambda x: [y.entity for y in x.expected_results.predictions])
+            except:
+                y_true = pd.Series(data_handler_copy).apply(lambda x: [y.label for y in x.expected_results.predictions])
+
             X_test = pd.Series(data_handler_copy).apply(lambda x: x.original)
             y_pred = X_test.apply(self._model_handler.predict_raw)
 

From 4f1c4f4fd31855b415a8c695f01605497a41d795 Mon Sep 17 00:00:00 2001
From: Ali Tarik <alimirik55@gmail.com>
Date: Thu, 23 Mar 2023 13:39:08 +0300
Subject: [PATCH 4/8] remove warnings

---
 nlptest/transform/accuracy.py | 12 ++++++------
 nlptest/transform/fairness.py | 15 +++++++++++----
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/nlptest/transform/accuracy.py b/nlptest/transform/accuracy.py
index 0009c3b22..50ae63e11 100644
--- a/nlptest/transform/accuracy.py
+++ b/nlptest/transform/accuracy.py
@@ -76,7 +76,7 @@ def transform(y_true, y_pred, params):
                 label:params["min_score"] for label in labels
             }
 
-        df_metrics = classification_report(y_true, y_pred, output_dict=True)
+        df_metrics = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
         df_metrics.pop("accuracy")
         df_metrics.pop("macro avg")
         df_metrics.pop("weighted avg")
@@ -131,7 +131,7 @@ def transform(y_true, y_pred, params):
                 label:params["min_score"] for label in labels
             }
         
-        df_metrics = classification_report(y_true, y_pred, output_dict=True)
+        df_metrics = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
         df_metrics.pop("accuracy")
         df_metrics.pop("macro avg")
         df_metrics.pop("weighted avg")
@@ -187,7 +187,7 @@ def transform(y_true, y_pred, params):
                 label:params["min_score"] for label in labels
             }
 
-        df_metrics = classification_report(y_true, y_pred, output_dict=True)
+        df_metrics = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
         df_metrics.pop("accuracy")
         df_metrics.pop("macro avg")
         df_metrics.pop("weighted avg")
@@ -235,7 +235,7 @@ def transform(y_true, y_pred, params):
 
         min_score = params["min_score"]
 
-        f1 = f1_score(y_true, y_pred, average="micro")
+        f1 = f1_score(y_true, y_pred, average="micro", zero_division=0)
 
         sample = Sample(
             original = "-",
@@ -278,7 +278,7 @@ def transform(y_true, y_pred, params):
         """
 
         min_score = params["min_score"]
-        f1 = f1_score(y_true, y_pred, average="macro")
+        f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
 
         sample = Sample(
             original = "-",
@@ -320,7 +320,7 @@ def transform(y_true, y_pred, params):
         """
 
         min_score = params["min_score"]
-        f1 = f1_score(y_true, y_pred, average="weighted")
+        f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)
 
         sample = Sample(
             original = "-",
diff --git a/nlptest/transform/fairness.py b/nlptest/transform/fairness.py
index 1e18e139a..f76b84ca1 100644
--- a/nlptest/transform/fairness.py
+++ b/nlptest/transform/fairness.py
@@ -78,7 +78,11 @@ def transform(data: List[Sample], model, params):
         samples = []
 
         for key, val in gendered_data.items():
-            y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions])
+            try:
+                y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions])
+            except:
+                y_true = pd.Series(val).apply(lambda x: [y.label for y in x.expected_results.predictions])
+
             X_test = pd.Series(val).apply(lambda x: x.original)
             y_pred = X_test.apply(model.predict_raw)
             
@@ -92,7 +96,7 @@ def transform(data: List[Sample], model, params):
             y_true = y_true.explode().apply(lambda x: x.split("-")[-1])
             y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1])
 
-            macro_f1_score = f1_score(y_true, y_pred, average="macro")
+            macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0)
             if np.isnan(macro_f1_score):
                 macro_f1_score = 1
 
@@ -149,7 +153,10 @@ def transform(data: List[Sample], model, params):
         samples = []
 
         for key, val in gendered_data.items():
-            y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions])
+            try:
+                y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions])
+            except:
+                y_true = pd.Series(val).apply(lambda x: [y.label for y in x.expected_results.predictions])
             X_test = pd.Series(val).apply(lambda x: x.original)
             y_pred = X_test.apply(model.predict_raw)
             
@@ -163,7 +170,7 @@ def transform(data: List[Sample], model, params):
             y_true = y_true.explode().apply(lambda x: x.split("-")[-1])
             y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1])
 
-            macro_f1_score = f1_score(y_true, y_pred, average="macro")
+            macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0)
 
             if np.isnan(macro_f1_score):
                 macro_f1_score = 0

From 79fdee68dfe5263722b21a3a56be405620b89532 Mon Sep 17 00:00:00 2001
From: Ali Tarik <alimirik55@gmail.com>
Date: Thu, 23 Mar 2023 15:53:26 +0300
Subject: [PATCH 5/8] fix max score bug

---
 nlptest/transform/accuracy.py       | 24 ++++++++---------
 nlptest/transform/fairness.py       | 10 ++++----
 nlptest/transform/representation.py | 40 ++++++++++++++---------------
 nlptest/utils/custom_types.py       | 25 ++++++++----------
 4 files changed, 48 insertions(+), 51 deletions(-)

diff --git a/nlptest/transform/accuracy.py b/nlptest/transform/accuracy.py
index 50ae63e11..23374624d 100644
--- a/nlptest/transform/accuracy.py
+++ b/nlptest/transform/accuracy.py
@@ -88,8 +88,8 @@ def transform(y_true, y_pred, params):
                 category = "Accuracy",
                 test_type = "min_precision_score",
                 test_case = k,
-                expected_results = MinScoreOutput(score=min_scores[k]),
-                actual_results = MinScoreOutput(score=v["precision"]),
+                expected_results = MinScoreOutput(min_score=min_scores[k]),
+                actual_results = MinScoreOutput(min_score=v["precision"]),
                 state = "done"
             )
             precision_samples.append(sample)
@@ -143,8 +143,8 @@ def transform(y_true, y_pred, params):
                 category = "Accuracy",
                 test_type = "min_recall_score",
                 test_case = k,
-                expected_results = MinScoreOutput(score=min_scores[k]),
-                actual_results = MinScoreOutput(score=v["recall"]),
+                expected_results = MinScoreOutput(min_score=min_scores[k]),
+                actual_results = MinScoreOutput(min_score=v["recall"]),
                 state = "done"
             )
             rec_samples.append(sample)
@@ -199,8 +199,8 @@ def transform(y_true, y_pred, params):
                 category = "Accuracy",
                 test_type = "min_f1_score",
                 test_case = k,
-                expected_results = MinScoreOutput(score=min_scores[k]),
-                actual_results = MinScoreOutput(score=v["f1-score"]),
+                expected_results = MinScoreOutput(min_score=min_scores[k]),
+                actual_results = MinScoreOutput(min_score=v["f1-score"]),
                 state = "done"
             )
             f1_samples.append(sample)
@@ -242,8 +242,8 @@ def transform(y_true, y_pred, params):
             category = "Accuracy",
             test_type = "min_micro_f1_score",
             test_case = "micro",
-            expected_results = MinScoreOutput(score=min_score),
-            actual_results = MinScoreOutput(score=f1),
+            expected_results = MinScoreOutput(min_score=min_score),
+            actual_results = MinScoreOutput(min_score=f1),
 
             state = "done"
         )
@@ -285,8 +285,8 @@ def transform(y_true, y_pred, params):
             category = "Accuracy",
             test_type = "min__macro_f1_score",
             test_case = "macro",
-            expected_results = MinScoreOutput(score=min_score),
-            actual_results = MinScoreOutput(score=f1),
+            expected_results = MinScoreOutput(min_score=min_score),
+            actual_results = MinScoreOutput(min_score=f1),
             state = "done"
         )
 
@@ -327,8 +327,8 @@ def transform(y_true, y_pred, params):
             category = "Accuracy",
             test_type = "min_weighted_f1_score",
             test_case = "weighted",
-            expected_results = MinScoreOutput(score=min_score),
-            actual_results = MinScoreOutput(score=f1),
+            expected_results = MinScoreOutput(min_score=min_score),
+            actual_results = MinScoreOutput(min_score=f1),
             state = "done"
         )
 
diff --git a/nlptest/transform/fairness.py b/nlptest/transform/fairness.py
index f76b84ca1..e7483c86e 100644
--- a/nlptest/transform/fairness.py
+++ b/nlptest/transform/fairness.py
@@ -105,8 +105,8 @@ def transform(data: List[Sample], model, params):
                 category = "fairness",
                 test_type = "min_gender_f1_score",
                 test_case = key,
-                expected_results = MinScoreOutput(score=min_scores[key]),
-                actual_results = MinScoreOutput(score=macro_f1_score),
+                expected_results = MinScoreOutput(min_score=min_scores[key]),
+                actual_results = MinScoreOutput(min_score=macro_f1_score),
                 state = "done"
             )
 
@@ -178,10 +178,10 @@ def transform(data: List[Sample], model, params):
             sample = Sample(
                 original = "-",
                 category = "fairness",
-                test_type = "min_gender_f1_score",
+                test_type = "max_gender_f1_score",
                 test_case = key,
-                expected_results = MaxScoreOutput(score=max_scores[key]),
-                actual_results = MaxScoreOutput(score=macro_f1_score),
+                expected_results = MaxScoreOutput(max_score=max_scores[key]),
+                actual_results = MaxScoreOutput(max_score=macro_f1_score),
                 state = "done"
             )
 
diff --git a/nlptest/transform/representation.py b/nlptest/transform/representation.py
index 38b824acc..b1bf24a1a 100644
--- a/nlptest/transform/representation.py
+++ b/nlptest/transform/representation.py
@@ -88,8 +88,8 @@ def transform(test, data, params):
                     category = "representation",
                     test_type = "min_gender_representation_count",
                     test_case = k,
-                    expected_results = MinScoreOutput(score=v) ,
-                    actual_results = MinScoreOutput(score=gender_counts[k]),
+                    expected_results = MinScoreOutput(min_score=v) ,
+                    actual_results = MinScoreOutput(min_score=gender_counts[k]),
                     state = "done"
                 )
                 samples.append(sample)
@@ -112,8 +112,8 @@ def transform(test, data, params):
                     category = "representation",
                     test_type = "min_gender_representation_proportion",
                     test_case = k,
-                    expected_results = MinScoreOutput(score=v) ,
-                    actual_results = MinScoreOutput(score=gender_counts[k]/total_samples),
+                    expected_results = MinScoreOutput(min_score=v) ,
+                    actual_results = MinScoreOutput(min_score=gender_counts[k]/total_samples),
                     state = "done"
                 )
                 samples.append(sample)
@@ -175,8 +175,8 @@ def transform(test,data,params):
                     category = "representation",
                     test_type = "min_ethnicity_name_representation_count",
                     test_case = key,
-                    expected_results = MinScoreOutput(score=value) ,
-                    actual_results = MinScoreOutput(score=actual_representation[key]),
+                    expected_results = MinScoreOutput(min_score=value) ,
+                    actual_results = MinScoreOutput(min_score=actual_representation[key]),
                     state = "done"
                 )
                 sample_list.append(sample)
@@ -211,8 +211,8 @@ def transform(test,data,params):
                         category = "representation",
                         test_type = "min_ethnicity_name_representation_proportion",
                         test_case = key,
-                        expected_results = MinScoreOutput(score=value),
-                        actual_results = MinScoreOutput(score=actual_representation[key]),
+                        expected_results = MinScoreOutput(min_score=value),
+                        actual_results = MinScoreOutput(min_score=actual_representation[key]),
                         state = "done"
                     )
                     sample_list.append(sample)
@@ -272,8 +272,8 @@ def transform(test,data,params):
                     category = "representation",
                     test_type = "min_label_representation_count",
                     test_case = key,
-                    expected_results = MinScoreOutput(score=value) ,
-                    actual_results = MinScoreOutput(score=actual_representation[key]),
+                    expected_results = MinScoreOutput(min_score=value) ,
+                    actual_results = MinScoreOutput(min_score=actual_representation[key]),
                     state = "done"
                 )
                 sample_list.append(sample)
@@ -309,8 +309,8 @@ def transform(test,data,params):
                         category = "representation",
                         test_type = "min_label_representation_proportion",
                         test_case = key,
-                        expected_results = MinScoreOutput(score=value),
-                        actual_results = MinScoreOutput(score=actual_representation[key]),
+                        expected_results = MinScoreOutput(min_score=value),
+                        actual_results = MinScoreOutput(min_score=actual_representation[key]),
                         state = "done"
                     )
                     sample_list.append(sample)
@@ -371,8 +371,8 @@ def transform(test,data,params):
                     category = "representation",
                     test_type = "min_religion_name_representation_count",
                     test_case = key,
-                    expected_results = MinScoreOutput(score=value) ,
-                    actual_results = MinScoreOutput(score=actual_representation[key]),
+                    expected_results = MinScoreOutput(min_score=value) ,
+                    actual_results = MinScoreOutput(min_score=actual_representation[key]),
                     state = "done"
                 )
                 sample_list.append(sample)
@@ -406,8 +406,8 @@ def transform(test,data,params):
                         category = "representation",
                         test_type = "min_religion_name_representation_proportion",
                         test_case = key,
-                        expected_results = MinScoreOutput(score=value),
-                        actual_results = MinScoreOutput(score=actual_representation[key]),
+                        expected_results = MinScoreOutput(min_score=value),
+                        actual_results = MinScoreOutput(min_score=actual_representation[key]),
                         state = "done"
                     )
                     sample_list.append(sample)
@@ -465,8 +465,8 @@ def transform(test,data,params):
                     category = "representation",
                     test_type = "min_country_economic_representation_count",
                     test_case = key,
-                    expected_results = MinScoreOutput(score=value) ,
-                    actual_results = MinScoreOutput(score=actual_representation[key]),
+                    expected_results = MinScoreOutput(min_score=value) ,
+                    actual_results = MinScoreOutput(min_score=actual_representation[key]),
                     state = "done"
                 )
                 sample_list.append(sample)
@@ -500,8 +500,8 @@ def transform(test,data,params):
                         category = "representation",
                         test_type = "min_country_economic_representation_proportion",
                         test_case = key,
-                        expected_results = MinScoreOutput(score=value),
-                        actual_results = MinScoreOutput(score=actual_representation[key]),
+                        expected_results = MinScoreOutput(min_score=value),
+                        actual_results = MinScoreOutput(min_score=actual_representation[key]),
                         state = "done"
                     )
                     sample_list.append(sample)
diff --git a/nlptest/utils/custom_types.py b/nlptest/utils/custom_types.py
index f1672787f..927467312 100644
--- a/nlptest/utils/custom_types.py
+++ b/nlptest/utils/custom_types.py
@@ -175,33 +175,30 @@ def __eq__(self, other):
     
 class MinScoreOutput(BaseModel):
     """Output for accuracy/representation tests."""
-    score: float
+    min_score: float
 
     def to_str_list(self) -> float:
-        return self.score
+        return self.min_score
     
     def __repr__(self) -> str:
-        return f"{self.score}"
+        return f"{self.min_score}"
     def __str__(self) -> str:
-        return f"{self.score}"
+        return f"{self.min_score}"
 
 class MaxScoreOutput(BaseModel):
     """Output for accuracy/representation tests."""
-    score: float
+    max_score: float
 
     def to_str_list(self) -> float:
-        return self.score
+        return self.max_score
     
     def __repr__(self) -> str:
-        return f"{self.score}"
+        return f"{self.max_score}"
     def __str__(self) -> str:
-        return f"{self.score}"
+        return f"{self.max_score}"
 
-class AccuracyOutput(BaseModel):
-    """Output for accuracy tests."""
-    score: float
 
-Result = TypeVar("Result", NEROutput, SequenceClassificationOutput, MinScoreOutput)
+Result = TypeVar("Result", NEROutput, SequenceClassificationOutput, MinScoreOutput, MaxScoreOutput)
 
 class Transformation(BaseModel):
     original_span: Span
@@ -384,9 +381,9 @@ def is_pass(self) -> bool:
             expected_preds = [j.entity for j in self.expected_results.predictions]
             return actual_preds == expected_preds
         elif isinstance(self.actual_results, MinScoreOutput):
-            return self.actual_results.score >= self.expected_results.score
+            return self.actual_results.min_score >= self.expected_results.min_score
         elif isinstance(self.actual_results, MaxScoreOutput):
-            return self.actual_results.score <= self.expected_results.score
+            return self.actual_results.max_score <= self.expected_results.max_score
 
         else:
             filtered_actual_results = self.actual_results

From 0b32c414e88084ee4158a6d5993737f49f380049 Mon Sep 17 00:00:00 2001
From: Ali Tarik <alimirik55@gmail.com>
Date: Thu, 23 Mar 2023 19:11:36 +0300
Subject: [PATCH 6/8] fix checkpoints path

---
 checkpoints.ckpt => nlptest/utils/checkpoints.ckpt | Bin
 nlptest/utils/gender_classifier.py                 |   6 +++++-
 2 files changed, 5 insertions(+), 1 deletion(-)
 rename checkpoints.ckpt => nlptest/utils/checkpoints.ckpt (100%)

diff --git a/checkpoints.ckpt b/nlptest/utils/checkpoints.ckpt
similarity index 100%
rename from checkpoints.ckpt
rename to nlptest/utils/checkpoints.ckpt
diff --git a/nlptest/utils/gender_classifier.py b/nlptest/utils/gender_classifier.py
index a389a2816..9097af957 100644
--- a/nlptest/utils/gender_classifier.py
+++ b/nlptest/utils/gender_classifier.py
@@ -1,5 +1,6 @@
 import torch
 import logging
+import os
 from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
 
 class GenderClassifier():
@@ -7,7 +8,10 @@ def __init__(self) -> None:
         logging.getLogger("transformers").setLevel(logging.ERROR)
         tokenizer = AutoTokenizer.from_pretrained("microsoft/xtremedistil-l6-h256-uncased")
         model = AutoModelForSequenceClassification.from_pretrained("microsoft/xtremedistil-l6-h256-uncased", num_labels=3)
-        ckpts = torch.load("checkpoints.ckpt")
+
+        curr_dir = os.path.dirname(__file__)
+        ckpt_path = os.path.join(curr_dir, 'checkpoints.ckpt')
+        ckpts = torch.load(ckpt_path)
         model.load_state_dict(ckpts)
         self.pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
 

From dbd3c0590a10238d16d38e4388b387464d249b82 Mon Sep 17 00:00:00 2001
From: Ali Tarik <alimirik55@gmail.com>
Date: Thu, 23 Mar 2023 19:39:04 +0300
Subject: [PATCH 7/8] include checkpoints in setup

---
 setup.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 4b101ac82..1e3c01018 100644
--- a/setup.py
+++ b/setup.py
@@ -157,9 +157,9 @@
     # },
     # If there are data files included in your packages that need to be
     # installed, specify them here.
-    # package_data={  # Optional
-    #     "sample": ["package_data.dat"],
-    # },
+    package_data={  # Optional
+        "nlptest": ["utils/checkpoints.ckpt"],
+    },
     # Although 'package_data' is the preferred approach, in some case you may
     # need to place data files outside of your packages. See:
     # http://docs.python.org/distutils/setupscript.html#installing-additional-files

From df3b4ad419cdcebcf9a0e2c8f6a0c9c1e7ee5be0 Mon Sep 17 00:00:00 2001
From: Ali Tarik <alimirik55@gmail.com>
Date: Thu, 23 Mar 2023 19:55:02 +0300
Subject: [PATCH 8/8] cleanup warnings

---
 nlptest/transform/fairness.py | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/nlptest/transform/fairness.py b/nlptest/transform/fairness.py
index e7483c86e..02c73a2a5 100644
--- a/nlptest/transform/fairness.py
+++ b/nlptest/transform/fairness.py
@@ -78,12 +78,13 @@ def transform(data: List[Sample], model, params):
         samples = []
 
         for key, val in gendered_data.items():
+            val = pd.Series(val, dtype="object")
             try:
-                y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions])
+                y_true = val.apply(lambda x: [y.entity for y in x.expected_results.predictions])
             except:
-                y_true = pd.Series(val).apply(lambda x: [y.label for y in x.expected_results.predictions])
+                y_true = val.apply(lambda x: [y.label for y in x.expected_results.predictions])
 
-            X_test = pd.Series(val).apply(lambda x: x.original)
+            X_test = val.apply(lambda x: x.original)
             y_pred = X_test.apply(model.predict_raw)
             
             valid_indices = y_true.apply(len) == y_pred.apply(len)
@@ -95,9 +96,10 @@ def transform(data: List[Sample], model, params):
 
             y_true = y_true.explode().apply(lambda x: x.split("-")[-1])
             y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1])
-
-            macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0)
-            if np.isnan(macro_f1_score):
+            
+            if len(y_true)>0:
+                macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0)
+            else:
                 macro_f1_score = 1
 
             sample = Sample(
@@ -153,11 +155,13 @@ def transform(data: List[Sample], model, params):
         samples = []
 
         for key, val in gendered_data.items():
+            val = pd.Series(val, dtype="object")
+
             try:
-                y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions])
+                y_true = val.apply(lambda x: [y.entity for y in x.expected_results.predictions])
             except:
-                y_true = pd.Series(val).apply(lambda x: [y.label for y in x.expected_results.predictions])
-            X_test = pd.Series(val).apply(lambda x: x.original)
+                y_true = val.apply(lambda x: [y.label for y in x.expected_results.predictions])
+            X_test = val.apply(lambda x: x.original)
             y_pred = X_test.apply(model.predict_raw)
             
             valid_indices = y_true.apply(len) == y_pred.apply(len)
@@ -170,9 +174,9 @@ def transform(data: List[Sample], model, params):
             y_true = y_true.explode().apply(lambda x: x.split("-")[-1])
             y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1])
 
-            macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0)
-
-            if np.isnan(macro_f1_score):
+            if len(y_true)>0:
+                macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0)
+            else:
                 macro_f1_score = 0
             
             sample = Sample(