Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 18 additions & 17 deletions cobra/evaluation/pigs_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def plot_incidence(pig_tables: pd.DataFrame,
ax.set_xlabel('{} bins' ''.format(variable), fontsize=16)
ax.xaxis.set_tick_params(labelsize=14)
plt.setp(ax.get_xticklabels(),
rotation=45, ha="right", rotation_mode="anchor")
rotation=90, ha="right", rotation_mode="anchor")
ax.yaxis.set_tick_params(labelsize=14)

if model_type == "classification":
Expand All @@ -180,21 +180,19 @@ def plot_incidence(pig_tables: pd.DataFrame,
ax.yaxis.set_major_formatter(
FuncFormatter(lambda y, _: '{:.1%}'.format(y)))
elif model_type == "regression":
# If both the difference between the highest avg target of all bins
# versus the global avg target AND the difference between the
# lowest avg target versus the global avg target are both smaller
# than 25% of the global avg target itself, we increase the y
# axis range, to avoid that the minor avg target differences are
# spread out over the configure figure height, suggesting
# incorrectly that there are big differences in avg target across
# the bins and versus the global avg target.
# If the difference between the highest avg. target of all bins
# versus the global avg. target AND the difference between the
# lowest avg. target versus the global avg. target are both smaller
# than 25% of the global avg. target itself, we increase the
# y-axis range, to avoid that the minor avg. target differences are
# spread out over the configured figure height, suggesting
# incorrectly that there are big differences in avg. target across
# the bins and versus the global avg. target.
# (Motivation for the AND above: if on one end there IS enough
# difference, the effect that we discuss here does not occur.)
global_avg_target = max(df_plot['global_avg_target']) # series of same number, for every bin.
if (np.abs((max(df_plot['avg_target']) - global_avg_target))
/ global_avg_target < 0.25) \
and (np.abs((min(df_plot['avg_target']) - global_avg_target))
/ global_avg_target < 0.25):
global_avg_target = max(df_plot['global_avg_target']) # a series of the same number, repeated for every bin.
if ((np.abs((max(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25)
and (np.abs((min(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25)):
ax.set_ylim(global_avg_target * 0.75,
global_avg_target * 1.25)

Expand All @@ -213,7 +211,7 @@ def plot_incidence(pig_tables: pd.DataFrame,

# Set labels & ticks
ax2.set_xlabel('{} bins' ''.format(variable), fontsize=16)
ax2.xaxis.set_tick_params(rotation=45, labelsize=14)
ax2.xaxis.set_tick_params(rotation=90, labelsize=14)

ax2.yaxis.set_tick_params(labelsize=14)
ax2.yaxis.set_major_formatter(
Expand All @@ -234,9 +232,9 @@ def plot_incidence(pig_tables: pd.DataFrame,
title = "Incidence plot - " + variable
else:
title = "Mean target plot - " + variable
fig.suptitle(title, fontsize=22, y=1.02)
fig.suptitle(title, fontsize=22)
ax.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102),
loc=3, ncol=3, mode="expand", borderaxespad=0.,
loc=3, ncol=1, mode="expand", borderaxespad=0.,
prop={"size": 14})

# Set order of layers
Expand All @@ -245,5 +243,8 @@ def plot_incidence(pig_tables: pd.DataFrame,

del df_plot

plt.tight_layout()
plt.margins(0.01)

# Show
plt.show()
54 changes: 39 additions & 15 deletions cobra/model_building/forward_selection.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@

import logging
from typing import Callable, Optional

import pandas as pd
from tqdm.auto import tqdm
Expand Down Expand Up @@ -29,7 +30,7 @@ class ForwardFeatureSelection:
more or less with the maximum number of steps in the forward feature
selection.
pos_only : bool
Whether or not the model coefficients should all be positive.
Whether or not the model coefficients should all be positive (no sign flips).
self._fitted_models : list
List of fitted models.
"""
Expand Down Expand Up @@ -76,8 +77,8 @@ def get_model_from_step(self, step: int):

def compute_model_performances(self, data: pd.DataFrame,
target_column_name: str,
splits: list = ["train", "selection",
"validation"]
splits: list = ["train", "selection", "validation"],
metric: Optional[Callable] = None,
) -> pd.DataFrame:
"""Compute for each model the performance for different sets (e.g.
train-selection-validation) and return them along with a list of
Expand All @@ -93,6 +94,13 @@ def compute_model_performances(self, data: pd.DataFrame,
Name of the target column.
splits : list, optional
List of splits to compute performance on.
metric: Callable (function), optional
Function that computes an evaluation metric to evaluate the model's
performance, instead of the default metric (AUC for
classification, RMSE for regression).
The function should accept y_true and y_pred arguments.
Metric functions from sklearn can be used; for example, see
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.

Returns
-------
Expand All @@ -117,7 +125,8 @@ def compute_model_performances(self, data: pd.DataFrame,
f"{split}_performance": model.evaluate(
data[data["split"] == split],
data[data["split"] == split][target_column_name],
split=split # parameter used for caching
split=split, # parameter used for caching
metric=metric
)
for split in splits
})
Expand All @@ -139,7 +148,9 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
Parameters
----------
train_data : pd.DataFrame
Data on which to fit the model.
Data on which to fit the model. The "train" split is used to
train each model, while the "selection" split is used to evaluate
candidate models during the forward feature selection.
target_column_name : str
Name of the target column.
predictors : list
Expand Down Expand Up @@ -178,12 +189,14 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
filtered_predictors,
forced_predictors)

def _forward_selection(self, train_data: pd.DataFrame,
target_column_name: str, predictors: list,
def _forward_selection(self,
train_data: pd.DataFrame,
target_column_name: str,
predictors: list,
forced_predictors: list = []) -> list:
"""Perform the forward feature selection algorithm to compute a list
of models (with increasing performance). The length of the list,
i.e. the number of models is bounded by the max_predictors class
i.e. the number of models, is bounded by the max_predictors class
attribute.

Parameters
Expand All @@ -208,10 +221,11 @@ def _forward_selection(self, train_data: pd.DataFrame,

max_steps = 1 + min(self.max_predictors,
len(predictors) + len(forced_predictors))

for step in tqdm(range(1, max_steps), desc="Sequentially adding best "
"predictor..."):
if step <= len(forced_predictors):
# first, we go through forced predictors
# first, we go through the forced predictors
candidate_predictors = [var for var in forced_predictors
if var not in current_predictors]
else:
Expand All @@ -230,13 +244,19 @@ def _forward_selection(self, train_data: pd.DataFrame,
.union(set(model.predictors)))

fitted_models.append(model)
# else:
# # If model returns None for the first time,
# # one can in theory stop the feature selection process
# # but we let it run so that tqdm finishes cleanly
# break

if not fitted_models:
log.error("No models found in forward selection")
log.error("No models found in forward selection.")

return fitted_models

def _find_next_best_model(self, train_data: pd.DataFrame,
def _find_next_best_model(self,
train_data: pd.DataFrame,
target_column_name: str,
candidate_predictors: list,
current_predictors: list):
Expand Down Expand Up @@ -272,15 +292,19 @@ def _find_next_best_model(self, train_data: pd.DataFrame,
"for the given model_type specified as "
"ForwardFeatureSelection argument.")

fit_data = train_data[train_data["split"] == "train"] # data to fit the models with
sel_data = train_data[train_data["split"] == "selection"] # data to compare the models with

for pred in candidate_predictors:
# Train a model with an additional predictor
model = self._train_model(train_data, target_column_name,
model = self._train_model(fit_data, target_column_name,
(current_predictors + [pred]))

# Evaluate the model
performance = (model
.evaluate(train_data[current_predictors + [pred]],
train_data[target_column_name],
split="train"))
.evaluate(sel_data[current_predictors + [pred]],
sel_data[target_column_name],
split="selection"))

if self.pos_only and (not (model.get_coef() >= 0).all()):
continue
Expand Down
69 changes: 45 additions & 24 deletions cobra/model_building/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@

# third party imports
from typing import Callable, Optional

import numpy as np
import pandas as pd
from scipy import stats
Expand Down Expand Up @@ -83,12 +85,12 @@ def deserialize(self, model_dict: dict):
self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"]

def get_coef(self) -> np.array:
"""Returns the model coefficients
"""Returns the model coefficients.

Returns
-------
np.array
array of model coefficients
Array of model coefficients.
"""
return self.logit.coef_[0]

Expand Down Expand Up @@ -144,7 +146,8 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray:
return self.logit.predict_proba(X[self.predictors])[:, 1]

def evaluate(self, X: pd.DataFrame, y: pd.Series,
split: str=None) -> float:
split: str=None,
metric: Optional[Callable]=None) -> float:
"""Evaluate the model on a given data set (X, y). The optional split
parameter is to indicate that the data set belongs to
(train, selection, validation), so that the computation on these sets
Expand All @@ -157,19 +160,28 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series,
y : pd.Series
Dataset containing the target of each observation.
split : str, optional
Split of the dataset (e.g. train-selection-validation).
Split name of the dataset (e.g. "train", "selection", or "validation").
metric: Callable (function), optional
Function that computes an evaluation metric to evaluate the model's
performance, instead of the default metric (AUC).
The function should accept y_true and y_pred arguments.
Metric functions from sklearn can be used; for example, see
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.

Returns
-------
float
The performance score of the model (AUC).
The performance score of the model (AUC by default).
"""

if (split is None) or (split not in self._eval_metrics_by_split):

y_pred = self.score_model(X)

performance = roc_auc_score(y_true=y, y_score=y_pred)
if metric is None:
performance = roc_auc_score(y_true=y, y_score=y_pred)
else:
performance = metric(y_true=y, y_pred=y_pred)

if split is None:
return performance
Expand Down Expand Up @@ -249,12 +261,12 @@ def __init__(self):
self._eval_metrics_by_split = {}

def serialize(self) -> dict:
"""Serialize model as JSON
"""Serialize model as JSON.

Returns
-------
dict
dictionary containing the serialized JSON
Dictionary containing the serialized JSON.
"""
serialized_model = {
"meta": "linear-regression",
Expand All @@ -272,7 +284,7 @@ def serialize(self) -> dict:
return serialized_model

def deserialize(self, model_dict: dict):
"""Deserialize a model previously stored as JSON
"""Deserialize a model previously stored as JSON.

Parameters
----------
Expand All @@ -282,7 +294,7 @@ def deserialize(self, model_dict: dict):
Raises
------
ValueError
In case JSON file is no valid serialized model
In case the JSON file is not a valid serialized model.
"""

if not self._is_valid_dict(model_dict):
Expand All @@ -296,37 +308,37 @@ def deserialize(self, model_dict: dict):
self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"]

def get_coef(self) -> np.array:
"""Returns the model coefficients
"""Returns the model coefficients.

Returns
-------
np.array
array of model coefficients
Array of model coefficients.
"""
return self.linear.coef_[0]
return self.linear.coef_

def get_intercept(self) -> float:
"""Returns the intercept of the model
"""Returns the intercept of the model.

Returns
-------
float
intercept of the model
Intercept of the model.
"""
return self.linear.intercept_[0]

def get_coef_by_predictor(self) -> dict:
"""Returns a dictionary mapping predictor (key) to coefficient (value)
"""Returns a dictionary mapping predictor (key) to coefficient (value).

Returns
-------
dict
map ``{predictor: coefficient}``
A map ``{predictor: coefficient}``.
"""
return dict(zip(self.predictors, self.linear.coef_[0]))
return dict(zip(self.predictors, self.linear.coef_))

def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
"""Fit the model
"""Fit the model.

Parameters
----------
Expand Down Expand Up @@ -357,7 +369,8 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray:
return self.linear.predict(X[self.predictors])

def evaluate(self, X: pd.DataFrame, y: pd.Series,
split: str=None) -> float:
split: str=None,
metric: Optional[Callable]=None) -> float:
"""Evaluate the model on a given data set (X, y). The optional split
parameter is to indicate that the data set belongs to
(train, selection, validation), so that the computation on these sets
Expand All @@ -370,19 +383,27 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series,
y : pd.Series
Dataset containing the target of each observation.
split : str, optional
Split of the dataset (e.g. train-selection-validation).
Split name of the dataset (e.g. "train", "selection", or "validation").
metric: Callable (function), optional
Function that computes an evaluation metric to evaluate the model's
performance, instead of the default metric (RMSE).
The function should accept y_true and y_pred arguments.
Metric functions from sklearn can be used; for example, see
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.

Returns
-------
float
The performance score of the model (RMSE).
The performance score of the model (RMSE by default).
"""

if (split is None) or (split not in self._eval_metrics_by_split):

y_pred = self.score_model(X)

performance = sqrt(mean_squared_error(y_true=y, y_pred=y_pred))
if metric is None:
performance = sqrt(mean_squared_error(y_true=y, y_pred=y_pred))
else:
performance = metric(y_true=y, y_pred=y_pred)

if split is None:
return performance
Expand Down
Loading