Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 100 additions & 52 deletions cobra/model_building/univariate_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,26 @@
- Jan Benisek (initial implementation)
"""
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_auc_score, mean_squared_error
from numpy import sqrt
import cobra.utils as utils


def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
                                    target_enc_selection_data: pd.DataFrame,
                                    predictors: list,
                                    target_column: str,
                                    model_type: str = "classification",
                                    preselect_auc_threshold: float = 0.053,
                                    preselect_rmse_threshold: float = 5,
                                    preselect_overtrain_threshold: float = 0.05
                                    ) -> pd.DataFrame:
    """Perform a preselection of predictors based on an AUC (in case of
    classification) or a RMSE (in case of regression) threshold of
    a univariate model on a train and selection dataset and return a
    DataFrame containing for each variable the train and selection AUC or
    RMSE along with a boolean "preselection" column.

    As the AUC just calculates the quality of a ranking, all monotonous
    transformations of a given ranking (i.e. transformations that do not
    alter the ranking itself) will lead to the same AUC.
    Therefore, no univariate model is trained here as the target encoded
    train and selection data is/must be used as inputs for this function.
    These will be used as predicted scores to compute the AUC with against
    the target.

    Parameters
    ----------
    target_enc_train_data : pd.DataFrame
        Train data.
    target_enc_selection_data : pd.DataFrame
        Selection data.
    predictors : list
        List of predictors (e.g. column names in the train set and selection
        data sets).
    target_column : str
        Name of the target column.
    model_type : str, optional
        Model type ("classification" or "regression").
    preselect_auc_threshold : float, optional
        Threshold on min. AUC to select predictor. Ignored if model_type is
        "regression".
    preselect_rmse_threshold : float, optional
        Threshold on max. RMSE to select predictor. Ignored if model_type is
        "classification". It is important to note that the threshold depends
        heavily on the scale of the target variable, and should be modified
        accordingly.
    preselect_overtrain_threshold : float, optional
        Threshold on the difference between train and selection AUC or RMSE.

    Returns
    -------
    pd.DataFrame
        DataFrame containing for each variable the train AUC or RMSE and
        selection AUC or RMSE along with a boolean indicating whether or not
        it is selected based on the criteria.

    Raises
    ------
    ValueError
        If model_type is neither "classification" nor "regression".
    """
    if model_type not in ("classification", "regression"):
        # Fail fast instead of silently returning an unbound result
        # (a NameError on df_out otherwise); message mirrors the
        # validation in CategoricalDataProcessor.__init__.
        raise ValueError("An unexpected model_type was provided. A valid "
                         "model_type is either 'classification' or "
                         "'regression'.")

    result = []

    if model_type == "classification":
        for predictor in predictors:
            cleaned_predictor = utils.clean_predictor_name(predictor)

            # The target encoded values act as the univariate "scores".
            auc_train = roc_auc_score(
                y_true=target_enc_train_data[target_column],
                y_score=target_enc_train_data[predictor])

            auc_selection = roc_auc_score(
                y_true=target_enc_selection_data[target_column],
                y_score=target_enc_selection_data[predictor])

            result.append({"predictor": cleaned_predictor,
                           "AUC train": auc_train,
                           "AUC selection": auc_selection})

        df_auc = pd.DataFrame(result)

        # Filter based on min. AUC
        auc_thresh = df_auc.loc[:, "AUC selection"] > preselect_auc_threshold

        # Identify those variables for which the AUC difference between train
        # and selection is within a user-defined threshold
        auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"])
                         < preselect_overtrain_threshold)

        df_auc["preselection"] = auc_thresh & auc_overtrain

        # Higher AUC is better, hence descending sort
        df_out = (df_auc.sort_values(by="AUC selection", ascending=False)
                  .reset_index(drop=True))
    else:  # regression
        for predictor in predictors:
            cleaned_predictor = utils.clean_predictor_name(predictor)

            rmse_train = sqrt(mean_squared_error(
                y_true=target_enc_train_data[target_column],
                y_pred=target_enc_train_data[predictor]))

            rmse_selection = sqrt(mean_squared_error(
                y_true=target_enc_selection_data[target_column],
                y_pred=target_enc_selection_data[predictor]))

            result.append({"predictor": cleaned_predictor,
                           "RMSE train": rmse_train,
                           "RMSE selection": rmse_selection})

        df_rmse = pd.DataFrame(result)

        # Filter based on max. RMSE
        rmse_thresh = (df_rmse.loc[:, "RMSE selection"]
                       < preselect_rmse_threshold)

        # Identify those variables for which the RMSE difference between
        # train and selection is within a user-defined threshold
        # (subtraction is flipped vs. AUC since lower RMSE is better)
        rmse_overtrain = ((df_rmse["RMSE selection"] - df_rmse["RMSE train"])
                          < preselect_overtrain_threshold)

        df_rmse["preselection"] = rmse_thresh & rmse_overtrain

        # Lower RMSE is better, hence ascending sort
        df_out = (df_rmse.sort_values(by="RMSE selection", ascending=True)
                  .reset_index(drop=True))

    return df_out


def get_preselected_predictors(df_metric: pd.DataFrame) -> list:
    """Extract the list of preselected predictors from the output of
    compute_univariate_preselection.

    Parameters
    ----------
    df_metric : pd.DataFrame
        DataFrame containing for each variable the train AUC or RMSE and
        test AUC or RMSE along with a boolean indicating whether or not it
        is selected based on the criteria.

    Returns
    -------
    list
        List of preselected predictors, with the "_enc" suffix restored.

    Raises
    ------
    ValueError
        If df_metric contains neither an "AUC selection" nor a
        "RMSE selection" column.
    """
    if "AUC selection" in df_metric.columns:
        # Higher AUC is better, hence descending sort
        predictor_list = (df_metric[df_metric["preselection"]]
                          .sort_values(by="AUC selection", ascending=False)
                          .predictor.tolist())
    elif "RMSE selection" in df_metric.columns:
        # Lower RMSE is better, hence ascending sort
        predictor_list = (df_metric[df_metric["preselection"]]
                          .sort_values(by="RMSE selection", ascending=True)
                          .predictor.tolist())
    else:
        # Guard against a DataFrame that was not produced by
        # compute_univariate_preselection; without this the function
        # would fail with a NameError on predictor_list.
        raise ValueError("df_metric must contain either an 'AUC selection' "
                         "or a 'RMSE selection' column.")

    return [col + "_enc" for col in predictor_list]

def compute_correlations(target_enc_train_data: pd.DataFrame,
predictors: list) -> pd.DataFrame:
Expand All @@ -121,15 +169,15 @@ def compute_correlations(target_enc_train_data: pd.DataFrame,
Parameters
----------
target_enc_train_data : pd.DataFrame
data to compute correlation
Data to compute correlation.
predictors : list
List of column names of the DataFrame between which to compute
the correlation matrix
the correlation matrix.

Returns
-------
pd.DataFrame
The correlation matrix of the training set
The correlation matrix of the training set.
"""

correlations = target_enc_train_data[predictors].corr()
Expand Down
2 changes: 1 addition & 1 deletion cobra/preprocessing/categorical_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def __init__(self,
forced_categories: dict = {}):

if model_type not in ["classification", "regression"]:
raise ValueError("An unexpected model_type was provided. Valid model_types are either 'classification' or 'regression'.")
raise ValueError("An unexpected model_type was provided. A valid model_type is either 'classification' or 'regression'.")

self.model_type = model_type
self.regroup = regroup
Expand Down
1 change: 0 additions & 1 deletion tests/model_building/test_forward_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import pytest

import pandas as pd
import numpy as np

from cobra.model_building.models import LogisticRegressionModel
from cobra.model_building.forward_selection import ForwardFeatureSelection
Expand Down
61 changes: 61 additions & 0 deletions tests/model_building/test_univariate_selection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import pytest

import pandas as pd

from cobra.model_building import univariate_selection


def mock_data():
    """Return a small mock target-encoded predictor DataFrame (10 rows)."""
    encodings = {"var1_enc": 0.42, "var2_enc": 0.94, "var3_enc": 0.87}
    return pd.DataFrame({name: [value] * 10
                         for name, value in encodings.items()})

class TestUnivariateSelection:
    """Tests for univariate_selection.compute_univariate_preselection and
    get_preselected_predictors, for both model types."""

    def test_preselection_classification(self):

        X = mock_data()
        y = pd.DataFrame([1] * 5 + [0] * 5, columns=["target"])

        basetable = pd.concat([y, X], axis=1)
        basetable["split"] = ["train"] * 3 + ["selection"] * 6 + ["train"]

        df_auc = univariate_selection.compute_univariate_preselection(
            target_enc_train_data=basetable[basetable["split"] == "train"],
            target_enc_selection_data=basetable[basetable["split"] == "selection"],
            predictors=X.columns,
            target_column="target",
            model_type="classification",
            preselect_auc_threshold=0.48,
            preselect_overtrain_threshold=0.05)

        assert all(c in df_auc.columns for c in ["AUC train", "AUC selection"])

        preselected_predictors = (univariate_selection
                                  .get_preselected_predictors(df_auc))

        assert preselected_predictors == ["var1_enc", "var2_enc", "var3_enc"]

    def test_preselection_regression(self):

        X = mock_data()
        y = pd.DataFrame([6.0, 9.0, 4.2, 5.5, 0.7, 1.9, 8.7, 8.0, 2.0, 7.2],
                         columns=["target"])

        basetable = pd.concat([y, X], axis=1)
        basetable["split"] = ["train"] * 3 + ["selection"] * 6 + ["train"]

        df_rmse = univariate_selection.compute_univariate_preselection(
            target_enc_train_data=basetable[basetable["split"] == "train"],
            target_enc_selection_data=basetable[basetable["split"] == "selection"],
            predictors=X.columns,
            target_column="target",
            model_type="regression",
            # fixed: the RMSE threshold (not the AUC one, which is ignored
            # for regression) is the knob that applies here
            preselect_rmse_threshold=5,
            preselect_overtrain_threshold=0.05)

        assert all(c in df_rmse.columns for c in ["RMSE train", "RMSE selection"])

        preselected_predictors = (univariate_selection
                                  .get_preselected_predictors(df_rmse))

        assert preselected_predictors == ["var2_enc", "var3_enc"]