From f1e67748ad7ed11e82f0b495c7c9824a8d2f229a Mon Sep 17 00:00:00 2001 From: sborms Date: Thu, 12 Aug 2021 11:48:37 +0200 Subject: [PATCH] univariate preselection based on RMSE, added new unit testing --- cobra/model_building/univariate_selection.py | 152 ++++++++++++------ .../categorical_data_processor.py | 2 +- .../model_building/test_forward_selection.py | 1 - .../test_univariate_selection.py | 61 +++++++ 4 files changed, 162 insertions(+), 54 deletions(-) create mode 100644 tests/model_building/test_univariate_selection.py diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py index 60cf8ff..df20792 100644 --- a/cobra/model_building/univariate_selection.py +++ b/cobra/model_building/univariate_selection.py @@ -7,7 +7,8 @@ - Jan Benisek (initial implementation) """ import pandas as pd -from sklearn.metrics import roc_auc_score +from sklearn.metrics import roc_auc_score, mean_squared_error +from numpy import sqrt import cobra.utils as utils @@ -15,13 +16,17 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, target_enc_selection_data: pd.DataFrame, predictors: list, target_column: str, - preselect_auc_threshold: float=0.053, - preselect_overtrain_threshold: float=0.05 + model_type: str = "classification", + preselect_auc_threshold: float = 0.053, + preselect_rmse_threshold: float = 5, + preselect_overtrain_threshold: float = 0.05 ) -> pd.DataFrame: - """Perform a preselection of predictors based on an AUC threshold of - a univariate model on a train and selection dataset and return a datframe - containing for each variable the train and selection AUC along with a + """Perform a preselection of predictors based on an AUC (in case of + classification) or a RMSE (in case of regression) threshold of + a univariate model on a train and selection dataset and return a DataFrame + containing for each variable the train and selection AUC or RMSE along with a boolean "preselection" column. 
+ As the AUC just calculates the quality of a ranking, all monotonous transformations of a given ranking (i.e. transformations that do not alter the ranking itself) will lead to the same AUC. @@ -32,86 +37,129 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, the training set. Therefore, no univariate model is trained here as the target encoded train and selection data is/must be used as inputs for this function. These will - be used as predicted scores to compute the AUC with against the target + be used as predicted scores to compute the AUC with against the target. Parameters ---------- + model_type : str + Model type ("classification" or "regression"). target_enc_train_data : pd.DataFrame - Train data + Train data. target_enc_selection_data : pd.DataFrame - Selection data + Selection data. predictors : list - list of predictors (e.g. column names in the train set and selection - data sets) + List of predictors (e.g. column names in the train set and selection + data sets). target_column : str - name of the target column + Name of the target column. preselect_auc_threshold : float, optional - threshold on AUC to select predictor + Threshold on min. AUC to select predictor. Ignored if model_type is "regression". + preselect_rmse_threshold : float, optional + Threshold on max. RMSE to select predictor. Ignored if model_type is "classification". + It is important to note that the threshold depends heavily on the scale of + the target variable, and should be modified accordingly. preselect_overtrain_threshold : float, optional - threshold on the difference between train and selection AUC + Threshold on the difference between train and selection AUC or RMSE (in case + of the latter, as a proportion). 
Returns ------- pd.DataFrame - DataFrame containing for each variable the train auc and - selection auc allong with a boolean indicating whether or not it is - selected based on the criteria + DataFrame containing for each variable the train AUC or RMSE and + selection AUC or RMSE along with a boolean indicating whether or not it is + selected based on the criteria. """ result = [] - for predictor in predictors: + if model_type == "classification": + for predictor in predictors: + + cleaned_predictor = utils.clean_predictor_name(predictor) + + auc_train = roc_auc_score( + y_true=target_enc_train_data[target_column], + y_score=target_enc_train_data[predictor]) + + auc_selection = roc_auc_score( + y_true=target_enc_selection_data[target_column], + y_score=target_enc_selection_data[predictor]) + + result.append({"predictor": cleaned_predictor, + "AUC train": auc_train, + "AUC selection": auc_selection}) + + df_auc = pd.DataFrame(result) + + # Filter based on min. AUC + auc_thresh = df_auc.loc[:, "AUC selection"] > preselect_auc_threshold - cleaned_predictor = utils.clean_predictor_name(predictor) + # Identify those variables for which the AUC difference between train + # and selection is within a user-defined ratio + auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"]) + < preselect_overtrain_threshold) - auc_train = roc_auc_score( - y_true=target_enc_train_data[target_column], - y_score=target_enc_train_data[predictor]) + df_auc["preselection"] = auc_thresh & auc_overtrain - auc_selection = roc_auc_score( - y_true=target_enc_selection_data[target_column], - y_score=target_enc_selection_data[predictor] - ) + df_out = df_auc.sort_values(by="AUC selection", ascending=False).reset_index(drop=True) - result.append({"predictor": cleaned_predictor, - "AUC train": auc_train, - "AUC selection": auc_selection}) + elif model_type == "regression": + for predictor in predictors: + cleaned_predictor = utils.clean_predictor_name(predictor) - df_auc = 
pd.DataFrame(result) + rmse_train = sqrt(mean_squared_error( + y_true=target_enc_train_data[target_column], + y_pred=target_enc_train_data[predictor])) - # Filter based on min AUC - auc_thresh = df_auc.loc[:, "AUC selection"] > preselect_auc_threshold + rmse_selection = sqrt(mean_squared_error( + y_true=target_enc_selection_data[target_column], + y_pred=target_enc_selection_data[predictor])) - # Identify those variables for which the AUC difference between train - # and selection is within a user-defined ratio - auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"]) - < preselect_overtrain_threshold) + result.append({"predictor": cleaned_predictor, + "RMSE train": rmse_train, + "RMSE selection": rmse_selection}) - df_auc["preselection"] = auc_thresh & auc_overtrain + df_rmse = pd.DataFrame(result) - return (df_auc.sort_values(by='AUC selection', ascending=False) - .reset_index(drop=True)) + # Filter based on max. RMSE + rmse_thresh = df_rmse.loc[:, "RMSE selection"] < preselect_rmse_threshold + # Identify those variables for which the RMSE difference between train + # and selection is within a user-defined ratio + rmse_overtrain = ((df_rmse["RMSE selection"] - df_rmse["RMSE train"]) # flip subtraction vs. AUC + < preselect_overtrain_threshold) -def get_preselected_predictors(df_auc: pd.DataFrame) -> list: - """Wrapper function to extract a list of predictors from df_auc + df_rmse["preselection"] = rmse_thresh & rmse_overtrain + + df_out = df_rmse.sort_values(by="RMSE selection", ascending=True).reset_index(drop=True) # lower is better + + return df_out + + +def get_preselected_predictors(df_metric: pd.DataFrame) -> list: + """Wrapper function to extract a list of predictors from df_auc. 
Parameters ---------- - df_auc : pd.DataFrame - DataFrame containing for each variable the train auc and - test auc allong with a boolean indicating whether or not it is selected - based on the criteria + df_metric : pd.DataFrame + DataFrame containing for each variable the train AUC or RMSE and + test AUC or RMSE along with a boolean indicating whether or not it is selected + based on the criteria. Returns ------- list list of preselected predictors """ - predictor_list = (df_auc[df_auc["preselection"]] - .sort_values(by='AUC selection', ascending=False) - .predictor.tolist()) - return [col + "_enc" for col in predictor_list] + if "AUC selection" in df_metric.columns: + predictor_list = (df_metric[df_metric["preselection"]] + .sort_values(by="AUC selection", ascending=False) + .predictor.tolist()) + elif "RMSE selection" in df_metric.columns: + predictor_list = (df_metric[df_metric["preselection"]] + .sort_values(by="RMSE selection", ascending=True) # lower is better + .predictor.tolist()) + return [col + "_enc" for col in predictor_list] def compute_correlations(target_enc_train_data: pd.DataFrame, predictors: list) -> pd.DataFrame: @@ -121,15 +169,15 @@ def compute_correlations(target_enc_train_data: pd.DataFrame, Parameters ---------- target_enc_train_data : pd.DataFrame - data to compute correlation + Data to compute correlation. predictors : list List of column names of the DataFrame between which to compute - the correlation matrix + the correlation matrix. Returns ------- pd.DataFrame - The correlation matrix of the training set + The correlation matrix of the training set. 
""" correlations = target_enc_train_data[predictors].corr() diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 8a07331..703fc82 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -72,7 +72,7 @@ def __init__(self, forced_categories: dict = {}): if model_type not in ["classification", "regression"]: - raise ValueError("An unexpected model_type was provided. Valid model_types are either 'classification' or 'regression'.") + raise ValueError("An unexpected model_type was provided. A valid model_type is either 'classification' or 'regression'.") self.model_type = model_type self.regroup = regroup diff --git a/tests/model_building/test_forward_selection.py b/tests/model_building/test_forward_selection.py index 8c7c6ce..2641631 100644 --- a/tests/model_building/test_forward_selection.py +++ b/tests/model_building/test_forward_selection.py @@ -2,7 +2,6 @@ import pytest import pandas as pd -import numpy as np from cobra.model_building.models import LogisticRegressionModel from cobra.model_building.forward_selection import ForwardFeatureSelection diff --git a/tests/model_building/test_univariate_selection.py b/tests/model_building/test_univariate_selection.py new file mode 100644 index 0000000..55e924e --- /dev/null +++ b/tests/model_building/test_univariate_selection.py @@ -0,0 +1,61 @@ +import pytest + +import pandas as pd + +from cobra.model_building import univariate_selection + + +def mock_data(): + return pd.DataFrame({"var1_enc": [0.42] * 10, + "var2_enc": [0.94] * 10, + "var3_enc": [0.87] * 10}) + +class TestUnivariateSelection: + + def test_preselection_classification(self): + + X = mock_data() + y = pd.DataFrame([1] * 5 + [0] * 5, columns=["target"]) + + basetable = pd.concat([y, X], axis=1) + basetable["split"] = ["train"] * 3 + ["selection"] * 6 + ["train"] + + df_auc = univariate_selection.compute_univariate_preselection( + 
target_enc_train_data=basetable[basetable["split"] == "train"],
+            target_enc_selection_data=basetable[basetable["split"] == "selection"],
+            predictors=X.columns,
+            target_column="target",
+            model_type="classification",
+            preselect_auc_threshold=0.48,
+            preselect_overtrain_threshold=0.05)
+
+        assert all(c in df_auc.columns for c in ["AUC train", "AUC selection"])
+
+        preselected_predictors = (univariate_selection
+                                  .get_preselected_predictors(df_auc))
+
+        assert preselected_predictors == ["var1_enc", "var2_enc", "var3_enc"]
+
+    def test_preselection_regression(self):
+
+        X = mock_data()
+        y = pd.DataFrame([6.0, 9.0, 4.2, 5.5, 0.7, 1.9, 8.7, 8.0, 2.0, 7.2], columns=["target"])
+
+        basetable = pd.concat([y, X], axis=1)
+        basetable["split"] = ["train"] * 3 + ["selection"] * 6 + ["train"]
+
+        df_rmse = univariate_selection.compute_univariate_preselection(
+            target_enc_train_data=basetable[basetable["split"] == "train"],
+            target_enc_selection_data=basetable[basetable["split"] == "selection"],
+            predictors=X.columns,
+            target_column="target",
+            model_type="regression",
+            preselect_rmse_threshold=5,
+            preselect_overtrain_threshold=0.05)
+
+        assert all(c in df_rmse.columns for c in ["RMSE train", "RMSE selection"])
+
+        preselected_predictors = (univariate_selection
+                                  .get_preselected_predictors(df_rmse))
+
+        assert preselected_predictors == ["var2_enc", "var3_enc"]