diff --git a/nlptest/modelhandler/jsl_modelhandler.py b/nlptest/modelhandler/jsl_modelhandler.py
index 9672492e7..81319af6b 100644
--- a/nlptest/modelhandler/jsl_modelhandler.py
+++ b/nlptest/modelhandler/jsl_modelhandler.py
@@ -232,7 +232,8 @@ def __init__(
         # in order to overwrite configs, light pipeline should be reinitialized.
         self.model = LightPipeline(model)
 
-    def load_model(self, path) -> 'NLUPipeline':
+    @classmethod
+    def load_model(cls, path) -> 'NLUPipeline':
         """Load the NER model into the `model` attribute.
 
         Args:
             path (str): Path to pretrained local or NLP Models Hub SparkNLP model
diff --git a/nlptest/transform/__init__.py b/nlptest/transform/__init__.py
index a9a3f4ce4..645a70ea7 100644
--- a/nlptest/transform/__init__.py
+++ b/nlptest/transform/__init__.py
@@ -643,7 +643,11 @@ def transform(self):
         for test_name, params in self.tests.items():
             data_handler_copy = [x.copy() for x in self._data_handler]
 
-            y_true = pd.Series(data_handler_copy).apply(lambda x: [y.entity for y in x.expected_results.predictions])
+            try:
+                y_true = pd.Series(data_handler_copy).apply(lambda x: [y.entity for y in x.expected_results.predictions])
+            except:
+                y_true = pd.Series(data_handler_copy).apply(lambda x: [y.label for y in x.expected_results.predictions])
+
             X_test = pd.Series(data_handler_copy).apply(lambda x: x.original)
             y_pred = X_test.apply(self._model_handler.predict_raw)
diff --git a/nlptest/transform/accuracy.py b/nlptest/transform/accuracy.py
index 46edd599e..cc1c4b1db 100644
--- a/nlptest/transform/accuracy.py
+++ b/nlptest/transform/accuracy.py
@@ -76,7 +76,7 @@ def transform(y_true, y_pred, params):
                 label:params["min_score"] for label in labels
             }
 
-        df_metrics = classification_report(y_true, y_pred, output_dict=True)
+        df_metrics = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
         df_metrics.pop("accuracy")
         df_metrics.pop("macro avg")
         df_metrics.pop("weighted avg")
@@ -90,8 +90,8 @@ def transform(y_true, y_pred, params):
                 category = "Accuracy",
                 test_type = "min_precision_score",
                 test_case = k,
-                expected_results = MinScoreOutput(score=min_scores[k]),
-                actual_results = MinScoreOutput(score=v["precision"]),
+                expected_results = MinScoreOutput(min_score=min_scores[k]),
+                actual_results = MinScoreOutput(min_score=v["precision"]),
                 state = "done"
             )
             precision_samples.append(sample)
@@ -133,7 +133,7 @@ def transform(y_true, y_pred, params):
                 label:params["min_score"] for label in labels
             }
 
-        df_metrics = classification_report(y_true, y_pred, output_dict=True)
+        df_metrics = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
         df_metrics.pop("accuracy")
         df_metrics.pop("macro avg")
         df_metrics.pop("weighted avg")
@@ -147,8 +147,8 @@ def transform(y_true, y_pred, params):
                 category = "Accuracy",
                 test_type = "min_recall_score",
                 test_case = k,
-                expected_results = MinScoreOutput(score=min_scores[k]),
-                actual_results = MinScoreOutput(score=v["recall"]),
+                expected_results = MinScoreOutput(min_score=min_scores[k]),
+                actual_results = MinScoreOutput(min_score=v["recall"]),
                 state = "done"
             )
             rec_samples.append(sample)
@@ -191,7 +191,7 @@ def transform(y_true, y_pred, params):
                 label:params["min_score"] for label in labels
             }
 
-        df_metrics = classification_report(y_true, y_pred, output_dict=True)
+        df_metrics = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
         df_metrics.pop("accuracy")
         df_metrics.pop("macro avg")
         df_metrics.pop("weighted avg")
@@ -205,8 +205,8 @@ def transform(y_true, y_pred, params):
                 category = "Accuracy",
                 test_type = "min_f1_score",
                 test_case = k,
-                expected_results = MinScoreOutput(score=min_scores[k]),
-                actual_results = MinScoreOutput(score=v["f1-score"]),
+                expected_results = MinScoreOutput(min_score=min_scores[k]),
+                actual_results = MinScoreOutput(min_score=v["f1-score"]),
                 state = "done"
             )
             f1_samples.append(sample)
@@ -241,15 +241,15 @@ def transform(y_true, y_pred, params):
         min_score = params["min_score"]
 
-        f1 = f1_score(y_true, y_pred, average="micro")
+        f1 = f1_score(y_true, y_pred, average="micro", zero_division=0)
 
         sample = Sample(
             original = "-",
             category = "Accuracy",
             test_type = "min_micro_f1_score",
             test_case = "micro",
-            expected_results = MinScoreOutput(score=min_score),
-            actual_results = MinScoreOutput(score=f1),
+            expected_results = MinScoreOutput(min_score=min_score),
+            actual_results = MinScoreOutput(min_score=f1),
             state = "done"
         )
@@ -284,15 +284,15 @@ def transform(y_true, y_pred, params):
         """
         min_score = params["min_score"]
 
-        f1 = f1_score(y_true, y_pred, average="macro")
+        f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
 
         sample = Sample(
             original = "-",
             category = "Accuracy",
             test_type = "min__macro_f1_score",
             test_case = "macro",
-            expected_results = MinScoreOutput(score=min_score),
-            actual_results = MinScoreOutput(score=f1),
+            expected_results = MinScoreOutput(min_score=min_score),
+            actual_results = MinScoreOutput(min_score=f1),
             state = "done"
         )
@@ -326,15 +326,15 @@ def transform(y_true, y_pred, params):
         """
         min_score = params["min_score"]
 
-        f1 = f1_score(y_true, y_pred, average="weighted")
+        f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)
 
         sample = Sample(
             original = "-",
             category = "Accuracy",
             test_type = "min_weighted_f1_score",
             test_case = "weighted",
-            expected_results = MinScoreOutput(score=min_score),
-            actual_results = MinScoreOutput(score=f1),
+            expected_results = MinScoreOutput(min_score=min_score),
+            actual_results = MinScoreOutput(min_score=f1),
             state = "done"
         )
diff --git a/nlptest/transform/fairness.py b/nlptest/transform/fairness.py
index 3f28fb8c4..e229592c2 100644
--- a/nlptest/transform/fairness.py
+++ b/nlptest/transform/fairness.py
@@ -80,8 +80,13 @@ def transform(data: List[Sample], model, params):
         for key, val in gendered_data.items():
             if key not in min_scores.keys():
                 continue
-            y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions])
-            X_test = pd.Series(val).apply(lambda x: x.original)
+            val = pd.Series(val, dtype="object")
+            try:
+                y_true = val.apply(lambda x: [y.entity for y in x.expected_results.predictions])
+            except:
+                y_true = val.apply(lambda x: [y.label for y in x.expected_results.predictions])
+            X_test = val.apply(lambda x: x.original)
+
             y_pred = X_test.apply(model.predict_raw)
 
             valid_indices = y_true.apply(len) == y_pred.apply(len)
@@ -93,9 +98,10 @@ def transform(data: List[Sample], model, params):
 
             y_true = y_true.explode().apply(lambda x: x.split("-")[-1])
             y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1])
-
-            macro_f1_score = f1_score(y_true, y_pred, average="macro")
-            if np.isnan(macro_f1_score):
+
+            if len(y_true)>0:
+                macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0)
+            else:
                 macro_f1_score = 1
 
             sample = Sample(
@@ -103,8 +109,8 @@ def transform(data: List[Sample], model, params):
                 category = "fairness",
                 test_type = "min_gender_f1_score",
                 test_case = key,
-                expected_results = MinScoreOutput(score=min_scores[key]),
-                actual_results = MinScoreOutput(score=macro_f1_score),
+                expected_results = MinScoreOutput(min_score=min_scores[key]),
+                actual_results = MinScoreOutput(min_score=macro_f1_score),
                 state = "done"
             )
@@ -152,6 +158,13 @@ def transform(data: List[Sample], model, params):
         for key, val in gendered_data.items():
             if key not in max_scores.keys():
                 continue
+            val = pd.Series(val, dtype="object")
+
+            try:
+                y_true = val.apply(lambda x: [y.entity for y in x.expected_results.predictions])
+            except:
+                y_true = val.apply(lambda x: [y.label for y in x.expected_results.predictions])
+            X_test = val.apply(lambda x: x.original)
             y_true = pd.Series(val).apply(lambda x: [y.entity for y in x.expected_results.predictions])
             X_test = pd.Series(val).apply(lambda x: x.original)
             y_pred = X_test.apply(model.predict_raw)
@@ -166,9 +179,9 @@ def transform(data: List[Sample], model, params):
             y_true = y_true.explode().apply(lambda x: x.split("-")[-1])
             y_pred = y_pred.explode().apply(lambda x: x.split("-")[-1])
 
-            macro_f1_score = f1_score(y_true, y_pred, average="macro")
-
-            if np.isnan(macro_f1_score):
+            if len(y_true)>0:
+                macro_f1_score = f1_score(y_true, y_pred, average="macro", zero_division=0)
+            else:
                 macro_f1_score = 0
 
             sample = Sample(
@@ -176,8 +189,8 @@ def transform(data: List[Sample], model, params):
                 category = "fairness",
                 test_type = "max_gender_f1_score",
                 test_case = key,
-                expected_results = MaxScoreOutput(score=max_scores[key]),
-                actual_results = MaxScoreOutput(score=macro_f1_score),
+                expected_results = MaxScoreOutput(max_score=max_scores[key]),
+                actual_results = MaxScoreOutput(max_score=macro_f1_score),
                 state = "done"
             )
diff --git a/nlptest/transform/representation.py b/nlptest/transform/representation.py
index 9ca873c47..b1bf24a1a 100644
--- a/nlptest/transform/representation.py
+++ b/nlptest/transform/representation.py
@@ -1,9 +1,8 @@
-
-
 from abc import ABC, abstractmethod
 from typing import List
 import pandas as pd
 from nlptest.utils.custom_types import Sample, MinScoreOutput
+from nlptest.utils.gender_classifier import GenderClassifier
 from .utils import default_label_representation ,default_ehtnicity_representation,default_economic_country_representation, default_religion_representation, get_label_representation_dict, get_country_economic_representation_dict, get_religion_name_representation_dict, get_ethnicity_representation_dict, get_entity_representation_proportions
 
 class BaseRepresentation(ABC):
@@ -38,15 +37,90 @@ def transform(self):
 
 
 class GenderRepresentation(BaseRepresentation):
+    """
+    Subclass of BaseRepresentation that implements the gender representation test.
+
+    Attributes:
+        alias_name (List[str]): The list of test names that identify the representation measure.
+    """
 
     alias_name = [
         "min_gender_representation_count",
         "min_gender_representation_proportion"
     ]
 
-    def transform(data: List[Sample]):
-        return super().transform()
+    def transform(test, data, params):
+        """
+        Args:
+            test (str): name of the test
+            data (List[Sample]): The input data to be evaluated for representation test.
+            params : parameters specified in config.
+
+        Raises:
+            ValueError: If sum of specified proportions in config is greater than 1
+
+        Returns:
+            List[Sample]: Ethnicity Representation test results.
+        """
+        classifier = GenderClassifier()
+        genders = [classifier.predict(sample.original) for sample in data]
+
+        gender_counts = {
+            "male": len([x for x in genders if x == "male"]),
+            "female": len([x for x in genders if x == "female"]),
+            "unknown": len([x for x in genders if x == "unknown"])
+        }
+
+        samples = []
+        if test == "min_gender_representation_count":
+            if isinstance(params["min_count"], dict):
+                min_counts = params["min_count"]
+            else:
+                min_counts = {
+                    "male": params["min_count"],
+                    "female": params["min_count"],
+                    "unknown": params["min_count"]
+                }
+            for k, v in min_counts.items():
+                sample = Sample(
+                    original = "-",
+                    category = "representation",
+                    test_type = "min_gender_representation_count",
+                    test_case = k,
+                    expected_results = MinScoreOutput(min_score=v) ,
+                    actual_results = MinScoreOutput(min_score=gender_counts[k]),
+                    state = "done"
+                )
+                samples.append(sample)
+        elif test == "min_gender_representation_proportion":
+            min_proportions = {
+                "male": 0.26,
+                "female": 0.26,
+                "unknown": 0.26
+            }
+
+            if isinstance(params["min_proportion"], dict):
+                min_proportions = params["min_proportion"]
+                if sum(min_proportions.values()) > 1:
+                    raise ValueError("Sum of proportions cannot be greater than 1. So min_gender_representation_proportion test cannot run.")
+
+            total_samples = len(data)
+            for k, v in min_proportions.items():
+                sample = Sample(
+                    original = "-",
+                    category = "representation",
+                    test_type = "min_gender_representation_proportion",
+                    test_case = k,
+                    expected_results = MinScoreOutput(min_score=v) ,
+                    actual_results = MinScoreOutput(min_score=gender_counts[k]/total_samples),
+                    state = "done"
+                )
+                samples.append(sample)
+        return samples
+
+
+
 
 class EthnicityRepresentation(BaseRepresentation):
     """
@@ -101,8 +175,8 @@ def transform(test,data,params):
                 category = "representation",
                 test_type = "min_ethnicity_name_representation_count",
                 test_case = key,
-                expected_results = MinScoreOutput(score=value) ,
-                actual_results = MinScoreOutput(score=actual_representation[key]),
+                expected_results = MinScoreOutput(min_score=value) ,
+                actual_results = MinScoreOutput(min_score=actual_representation[key]),
                 state = "done"
             )
             sample_list.append(sample)
@@ -137,8 +211,8 @@ def transform(test,data,params):
                 category = "representation",
                 test_type = "min_ethnicity_name_representation_proportion",
                 test_case = key,
-                expected_results = MinScoreOutput(score=value),
-                actual_results = MinScoreOutput(score=actual_representation[key]),
+                expected_results = MinScoreOutput(min_score=value),
+                actual_results = MinScoreOutput(min_score=actual_representation[key]),
                 state = "done"
             )
             sample_list.append(sample)
@@ -198,8 +272,8 @@ def transform(test,data,params):
                 category = "representation",
                 test_type = "min_label_representation_count",
                 test_case = key,
-                expected_results = MinScoreOutput(score=value) ,
-                actual_results = MinScoreOutput(score=actual_representation[key]),
+                expected_results = MinScoreOutput(min_score=value) ,
+                actual_results = MinScoreOutput(min_score=actual_representation[key]),
                 state = "done"
             )
             sample_list.append(sample)
@@ -235,8 +309,8 @@ def transform(test,data,params):
                 category = "representation",
                 test_type = "min_label_representation_proportion",
                 test_case = key,
-                expected_results = MinScoreOutput(score=value),
-                actual_results = MinScoreOutput(score=actual_representation[key]),
+                expected_results = MinScoreOutput(min_score=value),
+                actual_results = MinScoreOutput(min_score=actual_representation[key]),
                 state = "done"
            )
             sample_list.append(sample)
@@ -297,8 +371,8 @@ def transform(test,data,params):
                 category = "representation",
                 test_type = "min_religion_name_representation_count",
                 test_case = key,
-                expected_results = MinScoreOutput(score=value) ,
-                actual_results = MinScoreOutput(score=actual_representation[key]),
+                expected_results = MinScoreOutput(min_score=value) ,
+                actual_results = MinScoreOutput(min_score=actual_representation[key]),
                 state = "done"
             )
             sample_list.append(sample)
@@ -332,8 +406,8 @@ def transform(test,data,params):
                 category = "representation",
                 test_type = "min_religion_name_representation_proportion",
                 test_case = key,
-                expected_results = MinScoreOutput(score=value),
-                actual_results = MinScoreOutput(score=actual_representation[key]),
+                expected_results = MinScoreOutput(min_score=value),
+                actual_results = MinScoreOutput(min_score=actual_representation[key]),
                 state = "done"
             )
             sample_list.append(sample)
@@ -391,8 +465,8 @@ def transform(test,data,params):
                 category = "representation",
                 test_type = "min_country_economic_representation_count",
                 test_case = key,
-                expected_results = MinScoreOutput(score=value) ,
-                actual_results = MinScoreOutput(score=actual_representation[key]),
+                expected_results = MinScoreOutput(min_score=value) ,
+                actual_results = MinScoreOutput(min_score=actual_representation[key]),
                 state = "done"
            )
             sample_list.append(sample)
@@ -426,8 +500,8 @@ def transform(test,data,params):
                 category = "representation",
                 test_type = "min_country_economic_representation_proportion",
                 test_case = key,
-                expected_results = MinScoreOutput(score=value),
-                actual_results = MinScoreOutput(score=actual_representation[key]),
+                expected_results = MinScoreOutput(min_score=value),
+                actual_results = MinScoreOutput(min_score=actual_representation[key]),
                 state = "done"
            )
             sample_list.append(sample)
diff --git a/checkpoints.ckpt b/nlptest/utils/checkpoints.ckpt
similarity index 100%
rename from checkpoints.ckpt
rename to nlptest/utils/checkpoints.ckpt
diff --git a/nlptest/utils/custom_types.py b/nlptest/utils/custom_types.py
index f1672787f..927467312 100644
--- a/nlptest/utils/custom_types.py
+++ b/nlptest/utils/custom_types.py
@@ -175,33 +175,30 @@ def __eq__(self, other):
 
 class MinScoreOutput(BaseModel):
     """Output for accuracy/representation tests."""
-    score: float
+    min_score: float
 
     def to_str_list(self) -> float:
-        return self.score
+        return self.min_score
 
     def __repr__(self) -> str:
-        return f"{self.score}"
+        return f"{self.min_score}"
 
     def __str__(self) -> str:
-        return f"{self.score}"
+        return f"{self.min_score}"
 
 
 class MaxScoreOutput(BaseModel):
     """Output for accuracy/representation tests."""
-    score: float
+    max_score: float
 
     def to_str_list(self) -> float:
-        return self.score
+        return self.max_score
 
     def __repr__(self) -> str:
-        return f"{self.score}"
+        return f"{self.max_score}"
 
     def __str__(self) -> str:
-        return f"{self.score}"
+        return f"{self.max_score}"
 
 
-class AccuracyOutput(BaseModel):
-    """Output for accuracy tests."""
-    score: float
 
-Result = TypeVar("Result", NEROutput, SequenceClassificationOutput, MinScoreOutput)
+Result = TypeVar("Result", NEROutput, SequenceClassificationOutput, MinScoreOutput, MaxScoreOutput)
 
 class Transformation(BaseModel):
     original_span: Span
@@ -384,9 +381,9 @@ def is_pass(self) -> bool:
             expected_preds = [j.entity for j in self.expected_results.predictions]
             return actual_preds == expected_preds
         elif isinstance(self.actual_results, MinScoreOutput):
-            return self.actual_results.score >= self.expected_results.score
+            return self.actual_results.min_score >= self.expected_results.min_score
         elif isinstance(self.actual_results, MaxScoreOutput):
-            return self.actual_results.score <= self.expected_results.score
+            return self.actual_results.max_score <= self.expected_results.max_score
         else:
             filtered_actual_results = self.actual_results
diff --git a/nlptest/utils/gender_classifier.py b/nlptest/utils/gender_classifier.py
index a389a2816..9097af957 100644
--- a/nlptest/utils/gender_classifier.py
+++ b/nlptest/utils/gender_classifier.py
@@ -1,5 +1,6 @@
 import torch
 import logging
+import os
 from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
 
 class GenderClassifier():
@@ -7,7 +8,10 @@ def __init__(self) -> None:
         logging.getLogger("transformers").setLevel(logging.ERROR)
         tokenizer = AutoTokenizer.from_pretrained("microsoft/xtremedistil-l6-h256-uncased")
         model = AutoModelForSequenceClassification.from_pretrained("microsoft/xtremedistil-l6-h256-uncased", num_labels=3)
-        ckpts = torch.load("checkpoints.ckpt")
+
+        curr_dir = os.path.dirname(__file__)
+        ckpt_path = os.path.join(curr_dir, 'checkpoints.ckpt')
+        ckpts = torch.load(ckpt_path)
         model.load_state_dict(ckpts)
 
         self.pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
diff --git a/setup.py b/setup.py
index 4b101ac82..1e3c01018 100644
--- a/setup.py
+++ b/setup.py
@@ -157,9 +157,9 @@
     # },
     # If there are data files included in your packages that need to be
    # installed, specify them here.
-    # package_data={ # Optional
-    #     "sample": ["package_data.dat"],
-    # },
+    package_data={ # Optional
+        "nlptest": ["utils/checkpoints.ckpt"],
+    },
     # Although 'package_data' is the preferred approach, in some case you may
     # need to place data files outside of your packages. See:
     # http://docs.python.org/distutils/setupscript.html#installing-additional-files
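
Several hunks above pass zero_division=0 to scikit-learn's classification_report and f1_score. The following standalone sketch is not part of the patch (the labels and values are invented for illustration, and it assumes scikit-learn >= 0.22, where the zero_division parameter was introduced). It shows what the flag changes: when a label from y_true is never predicted, its precision is 0/0, and the default zero_division="warn" fills in 0.0 while emitting an UndefinedMetricWarning; zero_division=0 keeps the 0.0 but stays silent, which suits metrics computed inside an automated test loop.

    # Standalone illustration; label names and values are made up for this example.
    from sklearn.metrics import classification_report, f1_score

    y_true = ["PER", "ORG", "PER", "LOC"]
    y_pred = ["PER", "PER", "PER", "PER"]  # "ORG" and "LOC" are never predicted

    # Undefined per-label precision is pinned to 0 instead of triggering a warning.
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)

    print(report["ORG"]["precision"], macro_f1)  # 0.0 and the macro-averaged F1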