From 7f339fde8590b0c9b7af6d282450b49a0914a1dd Mon Sep 17 00:00:00 2001 From: sborms Date: Thu, 5 Aug 2021 18:17:35 +0200 Subject: [PATCH 1/4] added model_type parameter, different statistical test for regression models, included a simple test --- .gitignore | 5 +- .../categorical_data_processor.py | 106 ++++++++++++------ .../test_categorical_data_processor.py | 21 +++- 3 files changed, 91 insertions(+), 41 deletions(-) diff --git a/.gitignore b/.gitignore index 4bb934a..4ec86e3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ -#Ignored directories in root folder - +# Ignored directories in root folder # Byte-compiled / optimized / DLL files __pycache__/ @@ -107,7 +106,7 @@ ENV/ # vscode settings .vscode/ -# Other ignore files +# Other ignored files *.pptx *.ppt .idea/ diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index c24a550..1ebde44 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -14,6 +14,7 @@ - Jan Benisek (implementation) - Matthias Roels (implementation) """ + # standard lib imports import re from typing import Optional @@ -38,33 +39,41 @@ class CategoricalDataProcessor(BaseEstimator): Attributes ---------- category_size_threshold : int - minimal size of a category to keep it as a separate category + Minimal size of a category to keep it as a separate category. forced_categories : dict Map to prevent certain categories from being group into ``Other`` - for each colum - dict of the form ``{col:[forced vars]}``. + for each column - dict of the form ``{col:[forced vars]}``. keep_missing : bool - Whether or not to keep missing as a separate category + Whether or not to keep missing as a separate category. + model_type : str + Model type ("classification" or "regression"). p_value_threshold : float Significance threshold for regrouping. 
regroup : bool - Whether or not to regroup categories + Whether or not to regroup categories. regroup_name : str New name of the non-significant regrouped variables scale_contingency_table : bool - Whether contingency table should be scaled before chi^2.' + Whether contingency table should be scaled before chi^2. """ - valid_keys = ["regroup", "regroup_name", "keep_missing", + valid_keys = ["model_type", "regroup", "regroup_name", "keep_missing", "category_size_threshold", "p_value_threshold", "scale_contingency_table", "forced_categories"] - def __init__(self, regroup: bool = True, regroup_name: str = "Other", + def __init__(self, + model_type: str = "classification", + regroup: bool = True, + regroup_name: str = "Other", keep_missing: bool = True, category_size_threshold: int = 5, p_value_threshold: float = 0.001, scale_contingency_table: bool = True, forced_categories: dict = {}): + # assert model_type in ["classification", "regression"], "define an appropriate model type" + + self.model_type = model_type self.regroup = regroup self.regroup_name = regroup_name self.keep_missing = keep_missing @@ -136,12 +145,12 @@ def fit(self, data: pd.DataFrame, column_names: list, Parameters ---------- data : pd.DataFrame - data used to compute the mapping to encode the categorical + Data used to compute the mapping to encode the categorical variables with. column_names : list - Columns of data to be processed + Columns of data to be processed. target_column : str - Column name of the target + Column name of the target. 
""" if not self.regroup: @@ -168,8 +177,8 @@ def fit(self, data: pd.DataFrame, column_names: list, def _fit_column(self, data: pd.DataFrame, column_name: str, target_column) -> set: - """Compute which categories to regroup into "Other" for a particular - column + """Compute which categories to regroup into "Other" + for a particular column Parameters ---------- @@ -183,13 +192,18 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, list list of categories to combine into a category "Other" """ + model_type = self.model_type + if len(data[column_name].unique()) == 1: log.warning(f"Predictor {column_name} is constant" " and will be ignored in computation.") return set(data[column_name].unique()) y = data[target_column] - incidence = y.mean() + if model_type == "classification": + incidence = y.mean() + else: + incidence = None combined_categories = set() @@ -201,13 +215,14 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, unique_categories = list(X.unique()) # do not merge categories in case of dummies, i.e. 0 and 1 - # (and possibly "Missings") + # (and possibly "Missing") if (len(unique_categories) == 2 or (len(unique_categories) == 3 and "Missing" in unique_categories)): return set(unique_categories) # get small categories and add them to the merged category list + # does not apply incidence factor when model_type = "regression" small_categories = (CategoricalDataProcessor ._get_small_categories( X, @@ -221,6 +236,7 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, pval = (CategoricalDataProcessor ._compute_p_value(X, y, category, + model_type, self.scale_contingency_table)) # if not significant, add it to the list @@ -348,24 +364,27 @@ def _get_small_categories(predictor_series: pd.Series, incidence: float, category_size_threshold: int) -> set: """Fetch categories with a size below a certain threshold. 
- Note that we use an additional weighting with the overall incidence + Note that we use an additional weighting with the overall incidence. Parameters ---------- predictor_series : pd.Series - Description + Variables data. incidence : float - global train incidence + Global train incidence. category_size_threshold : int - minimal size of a category to keep it as a separate category + Minimal size of a category to keep it as a separate category. Returns ------- set - List a categories with a count below a certain threshold + List a categories with a count below a certain threshold. """ category_counts = predictor_series.groupby(predictor_series).size() - factor = max(incidence, 1 - incidence) + if incidence is not None: + factor = max(incidence, 1 - incidence) + else: + factor = 1 # Get all categories with a count below a threshold bool_mask = (category_counts*factor) <= category_size_threshold @@ -404,10 +423,14 @@ def _replace_missings(data: pd.DataFrame, @staticmethod def _compute_p_value(X: pd.Series, y: pd.Series, category: str, + model_type: str, scale_contingency_table: bool) -> float: - """Calculates p-value in contingency table (chi-square test) in - order to evaluate whether category of interest is significantly - different from the rest of the categories, given the target variable. + """Calculates p-value in order to evaluate whether category of + interest is significantly different from the rest of the + categories, given the target variable. + + In case model_type is "classification", chi-squared test based on a contingency table. + In case model_type is "regression", Kruskal-Wallis test. Parameters ---------- @@ -416,31 +439,42 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str, y : pd.Series Target data. category : str - Category for which we carry out the test + Category for which we carry out the test. + model_type : str + Model type ("classification" or "regression"). 
scale_contingency_table : bool - Whether we scale contingency table with incidence rate + Whether we scale contingency table with incidence rate. + Only used when model_type = "classification". Returns ------- float - p-value of chi-square test + p-value of applied statistical test """ df = pd.concat([X, y], axis=1) + df.columns = ["X", "y"] df["other_categories"] = np.where(X == category, 0, 1) - contigency_table = pd.crosstab(index=df['other_categories'], columns=y, - margins=False) + if model_type == "classification": + contingency_table = pd.crosstab(index=df["other_categories"], columns=df["y"], + margins=False) + + # if true, we scale the "other" categories + if scale_contingency_table: + size_other_cats = contingency_table.iloc[1].sum() + incidence_mean = y.mean() + + contingency_table.iloc[1, 0] = (1-incidence_mean) * size_other_cats + contingency_table.iloc[1, 1] = incidence_mean * size_other_cats + contingency_table = contingency_table.values.astype(np.int64) - # if true, we scale the "other" categories - if scale_contingency_table: - size_other_cats = contigency_table.iloc[1].sum() - incidence_mean = y.mean() + pval = stats.chi2_contingency(contingency_table, correction=False)[1] - contigency_table.iloc[1, 0] = (1-incidence_mean) * size_other_cats - contigency_table.iloc[1, 1] = incidence_mean * size_other_cats - contigency_table = contigency_table.values.astype(np.int64) + elif model_type == "regression": + pval = stats.kruskal(df.y[df.other_categories == 0], + df.y[df.other_categories == 1])[1] - return stats.chi2_contingency(contigency_table, correction=False)[1] + return pval @staticmethod def _replace_categories(data: pd.Series, categories: set, diff --git a/tests/preprocessing/test_categorical_data_processor.py b/tests/preprocessing/test_categorical_data_processor.py index 5b4ec3f..97191b7 100644 --- a/tests/preprocessing/test_categorical_data_processor.py +++ b/tests/preprocessing/test_categorical_data_processor.py @@ -20,6 +20,7 @@ def 
test_attributes_to_dict(self): actual = processor.attributes_to_dict() expected = { + "model_type": "classification", "regroup": True, "regroup_name": "Other", "keep_missing": True, @@ -79,7 +80,23 @@ def test_compute_p_value(self, scale_contingency_table, expected): category = "c1" actual = (CategoricalDataProcessor - ._compute_p_value(X, y, category, scale_contingency_table)) + ._compute_p_value(X, y, category, "classification", scale_contingency_table)) + + assert pytest.approx(actual, abs=1e-5) == expected + + @pytest.mark.parametrize("seed, expected", + [(505, 0.02222), + (603, 0.89230)]) + def test_compute_p_value_regression(self, seed, expected): + + np.random.seed(seed) + + X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10)) + y = pd.Series(data=np.random.uniform(0, 1, 100)*5) + category = "c1" + + actual = (CategoricalDataProcessor + ._compute_p_value(X, y, category, "regression", None)) assert pytest.approx(actual, abs=1e-5) == expected @@ -87,7 +104,7 @@ def test_get_small_categories(self): data = pd.Series(data=(["c1"]*50 + ["c2"]*25 + ["c3"]*15 + ["c4"]*5)) incidence = 0.35 - threshold = 10 # to make it easy to manualy compute + threshold = 10 # to make it easy to manualLy compute expected = {"c3", "c4"} actual = (CategoricalDataProcessor From 79871abbb665106c213266a5f7075827e77cacbf Mon Sep 17 00:00:00 2001 From: sborms Date: Fri, 6 Aug 2021 10:00:28 +0200 Subject: [PATCH 2/4] unit test renaming --- tests/preprocessing/test_categorical_data_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/preprocessing/test_categorical_data_processor.py b/tests/preprocessing/test_categorical_data_processor.py index 97191b7..6e026d3 100644 --- a/tests/preprocessing/test_categorical_data_processor.py +++ b/tests/preprocessing/test_categorical_data_processor.py @@ -73,7 +73,7 @@ def test_set_attributes_from_dict(self, attribute): @pytest.mark.parametrize("scale_contingency_table, expected", [(False, 0.01329), (True, 0.43437)]) - 
def test_compute_p_value(self, scale_contingency_table, expected): + def test_compute_p_value_classification(self, scale_contingency_table, expected): X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10)) y = pd.Series(data=([0]*35 + [1]*35 + [0]*15 + [1]*5 + [0]*8 + [1]*2)) From d4e0d713496ddb580bee506c1f8f75c74f630440 Mon Sep 17 00:00:00 2001 From: Sander Vanden Hautte <4201920+sandervh14@users.noreply.github.com> Date: Fri, 6 Aug 2021 09:20:40 +0000 Subject: [PATCH 3/4] Issue #65: model_type check in categorical data processor constructor. --- cobra/preprocessing/categorical_data_processor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 1ebde44..8a07331 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -71,7 +71,8 @@ def __init__(self, scale_contingency_table: bool = True, forced_categories: dict = {}): - # assert model_type in ["classification", "regression"], "define an appropriate model type" + if model_type not in ["classification", "regression"]: + raise ValueError("An unexpected model_type was provided. 
Valid model_types are either 'classification' or 'regression'.") self.model_type = model_type self.regroup = regroup From 07338fb213dd269abe104c12a6887c4dac0b6fb1 Mon Sep 17 00:00:00 2001 From: sborms Date: Fri, 6 Aug 2021 17:07:13 +0200 Subject: [PATCH 4/4] add model_type functionality in PreProcessor class & tests --- .../categorical_data_processor.py | 4 +- cobra/preprocessing/preprocessor.py | 60 +++++++++++-------- tests/preprocessing/test_preprocessor.py | 1 + 3 files changed, 39 insertions(+), 26 deletions(-) diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 8a07331..5993b5d 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -46,7 +46,7 @@ class CategoricalDataProcessor(BaseEstimator): keep_missing : bool Whether or not to keep missing as a separate category. model_type : str - Model type ("classification" or "regression"). + Model type (``classification`` or ``regression``). p_value_threshold : float Significance threshold for regrouping. regroup : bool @@ -442,7 +442,7 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str, category : str Category for which we carry out the test. model_type : str - Model type ("classification" or "regression"). + Model type (``classification`` or ``regression``). scale_contingency_table : bool Whether we scale contingency table with incidence rate. Only used when model_type = "classification". diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 0177e34..3338ec4 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -44,23 +44,27 @@ class PreProcessor(BaseEstimator): ---------- categorical_data_processor : CategoricalDataProcessor Instance of CategoricalDataProcessor to do the preprocessing of - categorical variables + categorical variables. 
The model_type variable is specified + here (``classification`` or ``regression``). discretizer : KBinsDiscretizer Instance of KBinsDiscretizer to do the prepocessing of continuous - variables by means of discretization + variables by means of discretization. serialization_path : str - path to save the pipeline to + Path to save the pipeline to. stratify_split : bool - Whether or not to stratify the train-test split + Whether or not to stratify the train-test split. target_encoder : TargetEncoder - Instance of TargetEncoder to do the incidence replacement + Instance of TargetEncoder to do the incidence replacement. """ - def __init__(self, categorical_data_processor: CategoricalDataProcessor, + def __init__(self, + categorical_data_processor: CategoricalDataProcessor, discretizer: KBinsDiscretizer, target_encoder: TargetEncoder, is_fitted: bool = False): + self.model_type = categorical_data_processor.model_type + self._categorical_data_processor = categorical_data_processor self._discretizer = discretizer self._target_encoder = target_encoder @@ -69,6 +73,7 @@ def __init__(self, categorical_data_processor: CategoricalDataProcessor, @classmethod def from_params(cls, + model_type: str = "classification", n_bins: int = 10, strategy: str = "quantile", closed: str = "right", @@ -91,16 +96,18 @@ def from_params(cls, Parameters ---------- + model_type : str + Model type (``classification`` or ``regression``). n_bins : int, optional Number of bins to produce. Raises ValueError if ``n_bins < 2``. strategy : str, optional Binning strategy. Currently only ``uniform`` and ``quantile`` - e.g. equifrequency is supported + e.g. equifrequency is supported. closed : str, optional - Whether to close the bins (intervals) from the left or right + Whether to close the bins (intervals) from the left or right. 
auto_adapt_bins : bool, optional - reduces the number of bins (starting from n_bins) as a function of - the number of missings + Reduces the number of bins (starting from n_bins) as a function of + the number of missings. starting_precision : int, optional Initial precision for the bin edges to start from, can also be negative. Given a list of bin edges, the class will @@ -110,33 +117,32 @@ def from_params(cls, will be made to round up the numbers of the bin edges e.g. ``5.55 -> 10``, ``146 -> 100``, ... label_format : str, optional - format string to display the bin labels + Format string to display the bin labels e.g. ``min - max``, ``(min, max]``, ... change_endpoint_format : bool, optional Whether or not to change the format of the lower and upper bins into ``< x`` and ``> y`` resp. regroup : bool - Whether or not to regroup categories + Whether or not to regroup categories. regroup_name : str - New name of the non-significant regrouped variables + New name of the non-significant regrouped variables. keep_missing : bool - Whether or not to keep missing as a separate category + Whether or not to keep missing as a separate category. category_size_threshold : int - minimal size of a category to keep it as a separate category + Minimal size of a category to keep it as a separate category. p_value_threshold : float Significance threshold for regrouping. forced_categories : dict Map to prevent certain categories from being group into ``Other`` for each column - dict of the form ``{col:[forced vars]}``. scale_contingency_table : bool - Whether contingency table should be scaled before chi^2.' + Whether contingency table should be scaled before chi^2. weight : float, optional Smoothing parameters (non-negative). The higher the value of the parameter, the bigger the contribution of the overall mean. - When set to zero, there is no smoothing - (e.g. the pure target incidence is used). + When set to zero, there is no smoothing (e.g. the pure target incidence is used). 
imputation_strategy : str, optional - in case there is a particular column which contains new categories, + In case there is a particular column which contains new categories, the encoding will lead to NULL values which should be imputed. Valid strategies are to replace with the global mean of the train set or the min (resp. max) incidence of the categories of that @@ -145,9 +151,11 @@ def from_params(cls, Returns ------- PreProcessor - Description + class encapsulating CategoricalDataProcessor, + KBinsDiscretizer, and TargetEncoder instances """ categorical_data_processor = CategoricalDataProcessor( + model_type, regroup, regroup_name, keep_missing, @@ -155,15 +163,17 @@ def from_params(cls, p_value_threshold, scale_contingency_table, forced_categories) + discretizer = KBinsDiscretizer(n_bins, strategy, closed, auto_adapt_bins, starting_precision, label_format, change_endpoint_format) - target_encoder = TargetEncoder(weight) + target_encoder = TargetEncoder(weight, imputation_strategy) - return cls(categorical_data_processor, discretizer, target_encoder) + return cls(model_type, + categorical_data_processor, discretizer, target_encoder) @classmethod def from_pipeline(cls, pipeline: dict): @@ -187,12 +197,13 @@ def from_pipeline(cls, pipeline: dict): """ if not PreProcessor._is_valid_pipeline(pipeline): - raise ValueError("Invalid pipeline") # To do: specify error + raise ValueError("Invalid pipeline") ## TODO: specify error categorical_data_processor = CategoricalDataProcessor() categorical_data_processor.set_attributes_from_dict( pipeline["categorical_data_processor"] ) + model_type = categorical_data_processor.model_type discretizer = KBinsDiscretizer() discretizer.set_attributes_from_dict(pipeline["discretizer"]) @@ -200,7 +211,8 @@ def from_pipeline(cls, pipeline: dict): target_encoder = TargetEncoder() target_encoder.set_attributes_from_dict(pipeline["target_encoder"]) - return cls(categorical_data_processor, discretizer, target_encoder, + return 
cls(categorical_data_processor,
+                   discretizer, target_encoder,