From f1e67748ad7ed11e82f0b495c7c9824a8d2f229a Mon Sep 17 00:00:00 2001 From: sborms Date: Thu, 12 Aug 2021 11:48:37 +0200 Subject: [PATCH] univariate preselection based on RMSE, added new unit testing --- cobra/model_building/univariate_selection.py | 152 ++++++++++++------ .../categorical_data_processor.py | 2 +- .../model_building/test_forward_selection.py | 1 - .../test_univariate_selection.py | 61 +++++++ 4 files changed, 162 insertions(+), 54 deletions(-) create mode 100644 tests/model_building/test_univariate_selection.py diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py index 60cf8ff..df20792 100644 --- a/cobra/model_building/univariate_selection.py +++ b/cobra/model_building/univariate_selection.py @@ -7,7 +7,8 @@ - Jan Benisek (initial implementation) """ import pandas as pd -from sklearn.metrics import roc_auc_score +from sklearn.metrics import roc_auc_score, mean_squared_error +from numpy import sqrt import cobra.utils as utils @@ -15,13 +16,17 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, target_enc_selection_data: pd.DataFrame, predictors: list, target_column: str, - preselect_auc_threshold: float=0.053, - preselect_overtrain_threshold: float=0.05 + model_type: str = "classification", + preselect_auc_threshold: float = 0.053, + preselect_rmse_threshold: float = 5, + preselect_overtrain_threshold: float = 0.05 ) -> pd.DataFrame: - """Perform a preselection of predictors based on an AUC threshold of - a univariate model on a train and selection dataset and return a datframe - containing for each variable the train and selection AUC along with a + """Perform a preselection of predictors based on an AUC (in case of + classification) or a RMSE (in case of regression) threshold of + a univariate model on a train and selection dataset and return a DataFrame + containing for each variable the train and selection AUC or RMSE along with a boolean "preselection" column. 
+ As the AUC just calculates the quality of a ranking, all monotonous transformations of a given ranking (i.e. transformations that do not alter the ranking itself) will lead to the same AUC. @@ -32,86 +37,129 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, the training set. Therefore, no univariate model is trained here as the target encoded train and selection data is/must be used as inputs for this function. These will - be used as predicted scores to compute the AUC with against the target + be used as predicted scores to compute the AUC with against the target. Parameters ---------- + model_type : str + Model type ("classification" or "regression"). target_enc_train_data : pd.DataFrame - Train data + Train data. target_enc_selection_data : pd.DataFrame - Selection data + Selection data. predictors : list - list of predictors (e.g. column names in the train set and selection - data sets) + List of predictors (e.g. column names in the train set and selection + data sets). target_column : str - name of the target column + Name of the target column. preselect_auc_threshold : float, optional - threshold on AUC to select predictor + Threshold on min. AUC to select predictor. Ignored if model_type is "regression". + preselect_rmse_threshold : float, optional + Threshold on max. RMSE to select predictor. Ignored if model_type is "classification". + It is important to note that the threshold depends heavily on the scale of + the target variable, and should be modified accordingly. preselect_overtrain_threshold : float, optional - threshold on the difference between train and selection AUC + Threshold on the difference between train and selection AUC or RMSE (in case + of the latter, as a proportion). 
Returns ------- pd.DataFrame - DataFrame containing for each variable the train auc and - selection auc allong with a boolean indicating whether or not it is - selected based on the criteria + DataFrame containing for each variable the train AUC or RMSE and + selection AUC or RMSE along with a boolean indicating whether or not it is + selected based on the criteria. """ result = [] - for predictor in predictors: + if model_type == "classification": + for predictor in predictors: + + cleaned_predictor = utils.clean_predictor_name(predictor) + + auc_train = roc_auc_score( + y_true=target_enc_train_data[target_column], + y_score=target_enc_train_data[predictor]) + + auc_selection = roc_auc_score( + y_true=target_enc_selection_data[target_column], + y_score=target_enc_selection_data[predictor]) + + result.append({"predictor": cleaned_predictor, + "AUC train": auc_train, + "AUC selection": auc_selection}) + + df_auc = pd.DataFrame(result) + + # Filter based on min. AUC + auc_thresh = df_auc.loc[:, "AUC selection"] > preselect_auc_threshold - cleaned_predictor = utils.clean_predictor_name(predictor) + # Identify those variables for which the AUC difference between train + # and selection is within a user-defined ratio + auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"]) + < preselect_overtrain_threshold) - auc_train = roc_auc_score( - y_true=target_enc_train_data[target_column], - y_score=target_enc_train_data[predictor]) + df_auc["preselection"] = auc_thresh & auc_overtrain - auc_selection = roc_auc_score( - y_true=target_enc_selection_data[target_column], - y_score=target_enc_selection_data[predictor] - ) + df_out = df_auc.sort_values(by="AUC selection", ascending=False).reset_index(drop=True) - result.append({"predictor": cleaned_predictor, - "AUC train": auc_train, - "AUC selection": auc_selection}) + elif model_type == "regression": + for predictor in predictors: + cleaned_predictor = utils.clean_predictor_name(predictor) - df_auc = 
pd.DataFrame(result) + rmse_train = sqrt(mean_squared_error( + y_true=target_enc_train_data[target_column], + y_pred=target_enc_train_data[predictor])) - # Filter based on min AUC - auc_thresh = df_auc.loc[:, "AUC selection"] > preselect_auc_threshold + rmse_selection = sqrt(mean_squared_error( + y_true=target_enc_selection_data[target_column], + y_pred=target_enc_selection_data[predictor])) - # Identify those variables for which the AUC difference between train - # and selection is within a user-defined ratio - auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"]) - < preselect_overtrain_threshold) + result.append({"predictor": cleaned_predictor, + "RMSE train": rmse_train, + "RMSE selection": rmse_selection}) - df_auc["preselection"] = auc_thresh & auc_overtrain + df_rmse = pd.DataFrame(result) - return (df_auc.sort_values(by='AUC selection', ascending=False) - .reset_index(drop=True)) + # Filter based on max. RMSE + rmse_thresh = df_rmse.loc[:, "RMSE selection"] < preselect_rmse_threshold + # Identify those variables for which the RMSE difference between train + # and selection is within a user-defined ratio + rmse_overtrain = ((df_rmse["RMSE selection"] - df_rmse["RMSE train"]) # flip subtraction vs. AUC + < preselect_overtrain_threshold) -def get_preselected_predictors(df_auc: pd.DataFrame) -> list: - """Wrapper function to extract a list of predictors from df_auc + df_rmse["preselection"] = rmse_thresh & rmse_overtrain + + df_out = df_rmse.sort_values(by="RMSE selection", ascending=True).reset_index(drop=True) # lower is better + + return df_out + + +def get_preselected_predictors(df_metric: pd.DataFrame) -> list: + """Wrapper function to extract a list of predictors from df_auc. 
Parameters ---------- - df_auc : pd.DataFrame - DataFrame containing for each variable the train auc and - test auc allong with a boolean indicating whether or not it is selected - based on the criteria + df_metric : pd.DataFrame + DataFrame containing for each variable the train AUC or RMSE and + test AUC or RMSE along with a boolean indicating whether or not it is selected + based on the criteria. Returns ------- list list of preselected predictors """ - predictor_list = (df_auc[df_auc["preselection"]] - .sort_values(by='AUC selection', ascending=False) - .predictor.tolist()) - return [col + "_enc" for col in predictor_list] + if "AUC selection" in df_metric.columns: + predictor_list = (df_metric[df_metric["preselection"]] + .sort_values(by="AUC selection", ascending=False) + .predictor.tolist()) + elif "RMSE selection" in df_metric.columns: + predictor_list = (df_metric[df_metric["preselection"]] + .sort_values(by="RMSE selection", ascending=True) # lower is better + .predictor.tolist()) + return [col + "_enc" for col in predictor_list] def compute_correlations(target_enc_train_data: pd.DataFrame, predictors: list) -> pd.DataFrame: @@ -121,15 +169,15 @@ def compute_correlations(target_enc_train_data: pd.DataFrame, Parameters ---------- target_enc_train_data : pd.DataFrame - data to compute correlation + Data to compute correlation. predictors : list List of column names of the DataFrame between which to compute - the correlation matrix + the correlation matrix. Returns ------- pd.DataFrame - The correlation matrix of the training set + The correlation matrix of the training set. 
""" correlations = target_enc_train_data[predictors].corr() diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 8a07331..703fc82 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -72,7 +72,7 @@ def __init__(self, forced_categories: dict = {}): if model_type not in ["classification", "regression"]: - raise ValueError("An unexpected model_type was provided. Valid model_types are either 'classification' or 'regression'.") + raise ValueError("An unexpected model_type was provided. A valid model_type is either 'classification' or 'regression'.") self.model_type = model_type self.regroup = regroup diff --git a/tests/model_building/test_forward_selection.py b/tests/model_building/test_forward_selection.py index 8c7c6ce..2641631 100644 --- a/tests/model_building/test_forward_selection.py +++ b/tests/model_building/test_forward_selection.py @@ -2,7 +2,6 @@ import pytest import pandas as pd -import numpy as np from cobra.model_building.models import LogisticRegressionModel from cobra.model_building.forward_selection import ForwardFeatureSelection diff --git a/tests/model_building/test_univariate_selection.py b/tests/model_building/test_univariate_selection.py new file mode 100644 index 0000000..55e924e --- /dev/null +++ b/tests/model_building/test_univariate_selection.py @@ -0,0 +1,61 @@ +import pytest + +import pandas as pd + +from cobra.model_building import univariate_selection + + +def mock_data(): + return pd.DataFrame({"var1_enc": [0.42] * 10, + "var2_enc": [0.94] * 10, + "var3_enc": [0.87] * 10}) + +class TestUnivariateSelection: + + def test_preselection_classification(self): + + X = mock_data() + y = pd.DataFrame([1] * 5 + [0] * 5, columns=["target"]) + + basetable = pd.concat([y, X], axis=1) + basetable["split"] = ["train"] * 3 + ["selection"] * 6 + ["train"] + + df_auc = univariate_selection.compute_univariate_preselection( + 
target_enc_train_data=basetable[basetable["split"] == "train"],
+            target_enc_selection_data=basetable[basetable["split"] == "selection"],
+            predictors=X.columns,
+            target_column="target",
+            model_type="classification",
+            preselect_auc_threshold=0.48,
+            preselect_overtrain_threshold=0.05)
+
+        assert all(c in df_auc.columns for c in ["AUC train", "AUC selection"])
+
+        preselected_predictors = (univariate_selection
+                                  .get_preselected_predictors(df_auc))
+
+        assert preselected_predictors == ["var1_enc", "var2_enc", "var3_enc"]
+
+    def test_preselection_regression(self):
+
+        X = mock_data()
+        y = pd.DataFrame([6.0, 9.0, 4.2, 5.5, 0.7, 1.9, 8.7, 8.0, 2.0, 7.2], columns=["target"])
+
+        basetable = pd.concat([y, X], axis=1)
+        basetable["split"] = ["train"] * 3 + ["selection"] * 6 + ["train"]
+
+        df_rmse = univariate_selection.compute_univariate_preselection(
+            target_enc_train_data=basetable[basetable["split"] == "train"],
+            target_enc_selection_data=basetable[basetable["split"] == "selection"],
+            predictors=X.columns,
+            target_column="target",
+            model_type="regression",
+            preselect_rmse_threshold=5,
+            preselect_overtrain_threshold=0.05)
+
+        assert all(c in df_rmse.columns for c in ["RMSE train", "RMSE selection"])
+
+        preselected_predictors = (univariate_selection
+                                  .get_preselected_predictors(df_rmse))
+
+        assert preselected_predictors == ["var2_enc", "var3_enc"]