Merged
3 changes: 2 additions & 1 deletion nlptest/modelhandler/jsl_modelhandler.py
@@ -232,7 +232,8 @@ def __init__(
# in order to overwrite configs, light pipeline should be reinitialized.
self.model = LightPipeline(model)

- def load_model(self, path) -> 'NLUPipeline':
+ @classmethod
+ def load_model(cls, path) -> 'NLUPipeline':
"""Load the NER model into the `model` attribute.
Args:
path (str): Path to pretrained local or NLP Models Hub SparkNLP model
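For context, a minimal sketch of what the @classmethod change allows (the wrapper class name PretrainedModelForNER is an assumption for illustration; only load_model itself appears in this diff):

# Hypothetical usage: with load_model as a @classmethod, a pretrained pipeline
# can be loaded straight from the class, without constructing an instance first.
model = PretrainedModelForNER.load_model("path_or_models_hub_name")  # class name assumed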
6 changes: 5 additions & 1 deletion nlptest/transform/__init__.py
@@ -643,7 +643,11 @@ def transform(self):
for test_name, params in self.tests.items():
data_handler_copy = [x.copy() for x in self._data_handler]

- y_true = pd.Series(data_handler_copy).apply(lambda x: [y.entity for y in x.expected_results.predictions])
+ try:
+     y_true = pd.Series(data_handler_copy).apply(lambda x: [y.entity for y in x.expected_results.predictions])
+ except:
+     y_true = pd.Series(data_handler_copy).apply(lambda x: [y.label for y in x.expected_results.predictions])

X_test = pd.Series(data_handler_copy).apply(lambda x: x.original)
y_pred = X_test.apply(self._model_handler.predict_raw)

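As an aside, a minimal sketch of the fallback this try/except implements (the helper name gold_labels and the use of AttributeError are illustrative, not from the repo): NER predictions carry an .entity attribute while text-classification predictions carry .label, so the same transform loop can serve both task types.

def gold_labels(sample):
    # Collect the gold labels from a sample, whichever prediction type it holds.
    preds = sample.expected_results.predictions
    try:
        return [p.entity for p in preds]   # NER-style predictions
    except AttributeError:
        return [p.label for p in preds]    # classification-style predictions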
36 changes: 18 additions & 18 deletions nlptest/transform/accuracy.py
@@ -76,7 +76,7 @@ def transform(y_true, y_pred, params):
label:params["min_score"] for label in labels
}

- df_metrics = classification_report(y_true, y_pred, output_dict=True)
+ df_metrics = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
df_metrics.pop("accuracy")
df_metrics.pop("macro avg")
df_metrics.pop("weighted avg")
@@ -90,8 +90,8 @@ def transform(y_true, y_pred, params):
category = "Accuracy",
test_type = "min_precision_score",
test_case = k,
- expected_results = MinScoreOutput(score=min_scores[k]),
- actual_results = MinScoreOutput(score=v["precision"]),
+ expected_results = MinScoreOutput(min_score=min_scores[k]),
+ actual_results = MinScoreOutput(min_score=v["precision"]),
state = "done"
)
precision_samples.append(sample)
@@ -133,7 +133,7 @@ def transform(y_true, y_pred, params):
label:params["min_score"] for label in labels
}

- df_metrics = classification_report(y_true, y_pred, output_dict=True)
+ df_metrics = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
df_metrics.pop("accuracy")
df_metrics.pop("macro avg")
df_metrics.pop("weighted avg")
@@ -147,8 +147,8 @@ def transform(y_true, y_pred, params):
category = "Accuracy",
test_type = "min_recall_score",
test_case = k,
- expected_results = MinScoreOutput(score=min_scores[k]),
- actual_results = MinScoreOutput(score=v["recall"]),
+ expected_results = MinScoreOutput(min_score=min_scores[k]),
+ actual_results = MinScoreOutput(min_score=v["recall"]),
state = "done"
)
rec_samples.append(sample)
@@ -191,7 +191,7 @@ def transform(y_true, y_pred, params):
label:params["min_score"] for label in labels
}

- df_metrics = classification_report(y_true, y_pred, output_dict=True)
+ df_metrics = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
df_metrics.pop("accuracy")
df_metrics.pop("macro avg")
df_metrics.pop("weighted avg")
@@ -205,8 +205,8 @@ def transform(y_true, y_pred, params):
category = "Accuracy",
test_type = "min_f1_score",
test_case = k,
- expected_results = MinScoreOutput(score=min_scores[k]),
- actual_results = MinScoreOutput(score=v["f1-score"]),
+ expected_results = MinScoreOutput(min_score=min_scores[k]),
+ actual_results = MinScoreOutput(min_score=v["f1-score"]),
state = "done"
)
f1_samples.append(sample)
@@ -241,15 +241,15 @@ def transform(y_true, y_pred, params):

min_score = params["min_score"]

f1 = f1_score(y_true, y_pred, average="micro")
f1 = f1_score(y_true, y_pred, average="micro", zero_division=0)

sample = Sample(
original = "-",
category = "Accuracy",
test_type = "min_micro_f1_score",
test_case = "micro",
- expected_results = MinScoreOutput(score=min_score),
- actual_results = MinScoreOutput(score=f1),
+ expected_results = MinScoreOutput(min_score=min_score),
+ actual_results = MinScoreOutput(min_score=f1),

state = "done"
)
@@ -284,15 +284,15 @@ def transform(y_true, y_pred, params):
"""

min_score = params["min_score"]
f1 = f1_score(y_true, y_pred, average="macro")
f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)

sample = Sample(
original = "-",
category = "Accuracy",
test_type = "min__macro_f1_score",
test_case = "macro",
- expected_results = MinScoreOutput(score=min_score),
- actual_results = MinScoreOutput(score=f1),
+ expected_results = MinScoreOutput(min_score=min_score),
+ actual_results = MinScoreOutput(min_score=f1),
state = "done"
)

@@ -326,15 +326,15 @@ def transform(y_true, y_pred, params):
"""

min_score = params["min_score"]
f1 = f1_score(y_true, y_pred, average="weighted")
f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)

sample = Sample(
original = "-",
category = "Accuracy",
test_type = "min_weighted_f1_score",
test_case = "weighted",
- expected_results = MinScoreOutput(score=min_score),
- actual_results = MinScoreOutput(score=f1),
+ expected_results = MinScoreOutput(min_score=min_score),
+ actual_results = MinScoreOutput(min_score=f1),
state = "done"
)

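For reference, a toy sketch (made-up labels, not project data) of what zero_division=0 changes in scikit-learn: precision is ill-defined for a label that never appears in the predictions, and the default behaviour scores it 0 while emitting an UndefinedMetricWarning; zero_division=0 keeps the 0 but drops the warning, so the report and the F1 helpers run cleanly.

from sklearn.metrics import classification_report, f1_score

y_true = ["PER", "ORG", "LOC"]
y_pred = ["PER", "PER", "PER"]  # ORG and LOC are never predicted

# Without zero_division=0, the ill-defined precision for ORG/LOC triggers
# UndefinedMetricWarning; with it, those entries are quietly reported as 0.0.
report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)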
37 changes: 25 additions & 12 deletions nlptest/transform/fairness.py
@@ -80,8 +80,13 @@ def transform(data: List[Sample], model, params):
for key, val in gendered_data.items():
if key not in min_scores.keys():
continue
- y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions])
- X_test = pd.Series(val).apply(lambda x: x.original)
+ val = pd.Series(val, dtype="object")
+ try:
+     y_true = val.apply(lambda x: [y.entity for y in x.expected_results.predictions])
+ except:
+     y_true = val.apply(lambda x: [y.label for y in x.expected_results.predictions])
+ X_test = val.apply(lambda x: x.original)

y_pred = X_test.apply(model.predict_raw)

valid_indices = y_true.apply(len) == y_pred.apply(len)
@@ -93,18 +98,19 @@

y_true = y_true.explode().apply(lambda x: x.split("-")[-1])
y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1])

- macro_f1_score = f1_score(y_true, y_pred, average="macro")
- if np.isnan(macro_f1_score):

+ if len(y_true)>0:
+     macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0)
+ else:
macro_f1_score = 1

sample = Sample(
original = "-",
category = "fairness",
test_type = "min_gender_f1_score",
test_case = key,
- expected_results = MinScoreOutput(score=min_scores[key]),
- actual_results = MinScoreOutput(score=macro_f1_score),
+ expected_results = MinScoreOutput(min_score=min_scores[key]),
+ actual_results = MinScoreOutput(min_score=macro_f1_score),
state = "done"
)

@@ -152,6 +158,13 @@ def transform(data: List[Sample], model, params):
for key, val in gendered_data.items():
if key not in max_scores.keys():
continue
+ val = pd.Series(val, dtype="object")
+
+ try:
+     y_true = val.apply(lambda x: [y.entity for y in x.expected_results.predictions])
+ except:
+     y_true = val.apply(lambda x: [y.label for y in x.expected_results.predictions])
+ X_test = val.apply(lambda x: x.original)
y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions])
X_test = pd.Series(val).apply(lambda x: x.original)
y_pred = X_test.apply(model.predict_raw)
@@ -166,18 +179,18 @@
y_true = y_true.explode().apply(lambda x: x.split("-")[-1])
y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1])

- macro_f1_score = f1_score(y_true, y_pred, average="macro")
-
- if np.isnan(macro_f1_score):
+ if len(y_true)>0:
+     macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0)
+ else:
macro_f1_score = 0

sample = Sample(
original = "-",
category = "fairness",
test_type = "max_gender_f1_score",
test_case = key,
- expected_results = MaxScoreOutput(score=max_scores[key]),
- actual_results = MaxScoreOutput(score=macro_f1_score),
+ expected_results = MaxScoreOutput(max_score=max_scores[key]),
+ actual_results = MaxScoreOutput(max_score=macro_f1_score),
state = "done"
)

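To close, a small sketch of the guard pattern used in both fairness tests (the helper name gendered_macro_f1 and the default argument are illustrative, not from the repo): instead of computing the score and then testing it for NaN, the gender subset is checked for emptiness first, and an empty subset falls back to a fixed score (1 for the min-score test, 0 for the max-score test in the diff above).

from sklearn.metrics import f1_score

def gendered_macro_f1(y_true, y_pred, default):
    # Empty groups (e.g. no samples matched a gender key) would leave the metric
    # undefined, so return the caller-supplied default instead.
    if len(y_true) > 0:
        return f1_score(y_true, y_pred, average="macro", zero_division=0)
    return default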