Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions cobra/evaluation/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class ClassificationEvaluator():
cumulative_gains : tuple
Data for plotting cumulative gains curve.
evaluation_metrics : dict
Map containing various scalar evaluation metrics (precision, recall, accuracy, AUC, F1, etc.)
Map containing various scalar evaluation metrics (precision, recall, accuracy, AUC, F1, etc.).
lift_at : float
Parameter to determine at which top level percentage the lift of the
model should be computed.
Expand Down Expand Up @@ -191,7 +191,7 @@ def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)):
ax.set_xlabel("False Positive Rate", fontsize=15)
ax.set_ylabel("True Positive Rate", fontsize=15)
ax.legend(loc="lower right")
ax.set_title("ROC Curve", fontsize=20)
ax.set_title("ROC curve", fontsize=20)

if path:
plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
Expand Down Expand Up @@ -274,7 +274,7 @@ def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)):
ax.grid(False)

# Description
ax.set_title("Cumulative response", fontsize=20)
ax.set_title("Cumulative Response curve", fontsize=20)

if path is not None:
plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
Expand Down Expand Up @@ -323,7 +323,7 @@ def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)):
ax.grid(False)

# Description
ax.set_title("Cumulative Lift", fontsize=20)
ax.set_title("Cumulative Lift curve", fontsize=20)

if path is not None:
plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
Expand All @@ -350,7 +350,7 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)):
ax.plot(ax.get_xlim(), ax.get_ylim(), linewidth=3,
ls="--", color="darkorange", label="random selection")

ax.set_title("Cumulative Gains", fontsize=20)
ax.set_title("Cumulative Gains curve", fontsize=20)

# Format axes
ax.set_xlim([0, 100])
Expand Down Expand Up @@ -681,7 +681,7 @@ def plot_predictions(self, path: str=None, dim: tuple=(12, 8)):
ax.set_xlabel("Index", fontsize=15)
ax.set_ylabel("Value", fontsize=15)
ax.legend(loc="best")
ax.set_title("Prediction Plot", fontsize=20)
ax.set_title("Predictions vs. Actuals", fontsize=20)

if path:
plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
Expand Down Expand Up @@ -722,7 +722,7 @@ def plot_qq(self, path: str=None, dim: tuple=(12, 8)):
ax.set_yticks(range(int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")])))+1, 1))

ax.legend(loc="best")
ax.set_title("Q-Q Plot", fontsize=20)
ax.set_title("Q-Q plot", fontsize=20)

if path:
plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
Expand Down
31 changes: 16 additions & 15 deletions cobra/evaluation/pigs_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,21 +180,19 @@ def plot_incidence(pig_tables: pd.DataFrame,
ax.yaxis.set_major_formatter(
FuncFormatter(lambda y, _: '{:.1%}'.format(y)))
elif model_type == "regression":
# If both the difference between the highest avg target of all bins
# versus the global avg target AND the difference between the
# lowest avg target versus the global avg target are both smaller
# than 25% of the global avg target itself, we increase the y
# axis range, to avoid that the minor avg target differences are
# spread out over the configure figure height, suggesting
# incorrectly that there are big differences in avg target across
# the bins and versus the global avg target.
# If the difference between the highest avg. target of all bins
# versus the global avg. target AND the difference between the
# lowest avg. target versus the global avg. target are both smaller
# than 25% of the global avg. target itself, we increase the
# y-axis range, to avoid that the minor avg. target differences are
# spread out over the configured figure height, suggesting
# incorrectly that there are big differences in avg. target across
# the bins and versus the global avg. target.
# (Motivation for the AND above: if on one end there IS enough
# difference, the effect that we discuss here does not occur.)
global_avg_target = max(df_plot['global_avg_target']) # series of same number, for every bin.
if (np.abs((max(df_plot['avg_target']) - global_avg_target))
/ global_avg_target < 0.25) \
and (np.abs((min(df_plot['avg_target']) - global_avg_target))
/ global_avg_target < 0.25):
global_avg_target = max(df_plot['global_avg_target']) # series of same number, for every bin.
if ((np.abs((max(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25)
and (np.abs((min(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25)):
ax.set_ylim(global_avg_target * 0.75,
global_avg_target * 1.25)

Expand Down Expand Up @@ -234,9 +232,9 @@ def plot_incidence(pig_tables: pd.DataFrame,
title = "Incidence plot - " + variable
else:
title = "Mean target plot - " + variable
fig.suptitle(title, fontsize=22, y=1.02)
fig.suptitle(title, fontsize=22)
ax.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102),
loc=3, ncol=3, mode="expand", borderaxespad=0.,
loc=3, ncol=1, mode="expand", borderaxespad=0.,
prop={"size": 14})

# Set order of layers
Expand All @@ -245,5 +243,8 @@ def plot_incidence(pig_tables: pd.DataFrame,

del df_plot

plt.tight_layout()
plt.margins(0.01)

# Show
plt.show()
56 changes: 39 additions & 17 deletions cobra/model_building/forward_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class ForwardFeatureSelection:
more or less with the maximum number of steps in the forward feature
selection.
pos_only : bool
Whether or not the model coefficients should all be positive.
Whether or not the model coefficients should all be positive (no sign flips).
self._fitted_models : list
List of fitted models.
"""
Expand Down Expand Up @@ -76,8 +76,7 @@ def get_model_from_step(self, step: int):

def compute_model_performances(self, data: pd.DataFrame,
target_column_name: str,
splits: list = ["train", "selection",
"validation"]
splits: list=["train", "selection", "validation"]
) -> pd.DataFrame:
"""Compute for each model the performance for different sets (e.g.
train-selection-validation) and return them along with a list of
Expand Down Expand Up @@ -111,7 +110,7 @@ def compute_model_performances(self, data: pd.DataFrame,
"last_added_predictor": list(last_added_predictor)[0]
}

# Evaluate model on each data set split,
# Evaluate model on each dataset split,
# e.g. train-selection-validation
tmp.update({
f"{split}_performance": model.evaluate(
Expand Down Expand Up @@ -139,7 +138,11 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
Parameters
----------
train_data : pd.DataFrame
Data on which to fit the model.
Data on which to fit the model. Should include a "train"
and "selection" split for correct model selection! The
"train" split is used to train a model, the "selection"
split is used to evaluate which model to include in the
actual forward feature selection.
target_column_name : str
Name of the target column.
predictors : list
Expand All @@ -155,6 +158,12 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
In case the number of forced predictors is larger than the maximum
number of allowed predictors in the model.
"""

assert "split" in train_data.columns, "The train_data input df does not include a split column."
print(train_data["split"].unique())
assert len(set(["train", "selection"]).difference(set(train_data["split"].unique()))) == 0, \
"The train_data input df does not include a 'train' and 'selection' split."

# remove excluded predictors from predictor lists
filtered_predictors = [var for var in predictors
if (var not in excluded_predictors and
Expand All @@ -163,13 +172,13 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
# checks on predictor lists and self.max_predictors attr
if len(forced_predictors) > self.max_predictors:
raise ValueError("Size of forced_predictors cannot be bigger than "
"max_predictors")
"max_predictors.")
elif len(forced_predictors) == self.max_predictors:
log.info("Size of forced_predictors equals max_predictors "
"only one model will be trained...")
# train model with all forced_predictors (only)
(self._fitted_models
.append(self._train_model(train_data,
.append(self._train_model(train_data[train_data["split"] == "train"],
target_column_name,
forced_predictors)))
else:
Expand All @@ -178,12 +187,14 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
filtered_predictors,
forced_predictors)

def _forward_selection(self, train_data: pd.DataFrame,
target_column_name: str, predictors: list,
def _forward_selection(self,
train_data: pd.DataFrame,
target_column_name: str,
predictors: list,
forced_predictors: list = []) -> list:
"""Perform the forward feature selection algorithm to compute a list
of models (with increasing performance). The length of the list,
i.e. the number of models is bounded by the max_predictors class
i.e. the number of models, is bounded by the max_predictors class
attribute.

Parameters
Expand All @@ -208,10 +219,11 @@ def _forward_selection(self, train_data: pd.DataFrame,

max_steps = 1 + min(self.max_predictors,
len(predictors) + len(forced_predictors))

for step in tqdm(range(1, max_steps), desc="Sequentially adding best "
"predictor..."):
if step <= len(forced_predictors):
# first, we go through forced predictors
# first, we go through the forced predictors
candidate_predictors = [var for var in forced_predictors
if var not in current_predictors]
else:
Expand All @@ -230,13 +242,19 @@ def _forward_selection(self, train_data: pd.DataFrame,
.union(set(model.predictors)))

fitted_models.append(model)
# else:
# # If model returns None for the first time,
# # one can in theory stop the feature selection process
# # but we let it run so that tqdm finishes cleanly
# break

if not fitted_models:
log.error("No models found in forward selection")
log.error("No models found in forward selection.")

return fitted_models

def _find_next_best_model(self, train_data: pd.DataFrame,
def _find_next_best_model(self,
train_data: pd.DataFrame,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just calling it "basetable" or "basetable_parts" might be better; "train_data" could suggest to people - when they're not scrolling down - that only train data is used for feature selection.

Copy link
Copy Markdown
Contributor Author

@sborms sborms Oct 1, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. I also thought about changing it, but instead I emphasized in the documentation of ForwardFeatureSelection.fit() what the train_data argument should be, and added an assertion to make sure it includes both a train and a selection split.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok.

target_column_name: str,
candidate_predictors: list,
current_predictors: list):
Expand Down Expand Up @@ -272,15 +290,19 @@ def _find_next_best_model(self, train_data: pd.DataFrame,
"for the given model_type specified as "
"ForwardFeatureSelection argument.")

fit_data = train_data[train_data["split"] == "train"] # data to fit the models with
sel_data = train_data[train_data["split"] == "selection"] # data to compare the models with

for pred in candidate_predictors:
# Train a model with an additional predictor
model = self._train_model(train_data, target_column_name,
model = self._train_model(fit_data, target_column_name,
(current_predictors + [pred]))

# Evaluate the model
performance = (model
.evaluate(train_data[current_predictors + [pred]],
train_data[target_column_name],
split="train"))
.evaluate(sel_data[current_predictors + [pred]],
sel_data[target_column_name],
split="selection"))
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that, since sel_data is now passed instead of train_data, split="selection" is redundant: the default argument of evaluate is apparently split=None, and the evaluation will then run fine on only the selection split, since that is the only data split available in sel_data. That said, if you want to explicitly leave the split="selection" argument there for absolute clarity, I can follow that reasoning.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, I put it there for clarity and that way it also populates the _eval_metrics_by_split model attribute. But your reasoning is correct.


if self.pos_only and (not (model.get_coef() >= 0).all()):
continue
Expand Down
34 changes: 17 additions & 17 deletions cobra/model_building/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,12 @@ def deserialize(self, model_dict: dict):
self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"]

def get_coef(self) -> np.array:
"""Returns the model coefficients
"""Returns the model coefficients.

Returns
-------
np.array
array of model coefficients
Array of model coefficients.
"""
return self.logit.coef_[0]

Expand Down Expand Up @@ -157,7 +157,7 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series,
y : pd.Series
Dataset containing the target of each observation.
split : str, optional
Split of the dataset (e.g. train-selection-validation).
Split name of the dataset (e.g. "train", "selection", or "validation").

Returns
-------
Expand Down Expand Up @@ -249,12 +249,12 @@ def __init__(self):
self._eval_metrics_by_split = {}

def serialize(self) -> dict:
"""Serialize model as JSON
"""Serialize model as JSON.

Returns
-------
dict
dictionary containing the serialized JSON
Dictionary containing the serialized JSON.
"""
serialized_model = {
"meta": "linear-regression",
Expand All @@ -272,7 +272,7 @@ def serialize(self) -> dict:
return serialized_model

def deserialize(self, model_dict: dict):
"""Deserialize a model previously stored as JSON
"""Deserialize a model previously stored as JSON.

Parameters
----------
Expand All @@ -282,7 +282,7 @@ def deserialize(self, model_dict: dict):
Raises
------
ValueError
In case JSON file is no valid serialized model
In case JSON file is no valid serialized model.
"""

if not self._is_valid_dict(model_dict):
Expand All @@ -296,37 +296,37 @@ def deserialize(self, model_dict: dict):
self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"]

def get_coef(self) -> np.array:
"""Returns the model coefficients
"""Returns the model coefficients.

Returns
-------
np.array
array of model coefficients
Array of model coefficients.
"""
return self.linear.coef_[0]
return self.linear.coef_

def get_intercept(self) -> float:
"""Returns the intercept of the model
"""Returns the intercept of the model.

Returns
-------
float
intercept of the model
Intercept of the model.
"""
return self.linear.intercept_[0]

def get_coef_by_predictor(self) -> dict:
"""Returns a dictionary mapping predictor (key) to coefficient (value)
"""Returns a dictionary mapping predictor (key) to coefficient (value).

Returns
-------
dict
map ``{predictor: coefficient}``
A map ``{predictor: coefficient}``.
"""
return dict(zip(self.predictors, self.linear.coef_[0]))
return dict(zip(self.predictors, self.linear.coef_))

def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
"""Fit the model
"""Fit the model.

Parameters
----------
Expand Down Expand Up @@ -370,7 +370,7 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series,
y : pd.Series
Dataset containing the target of each observation.
split : str, optional
Split of the dataset (e.g. train-selection-validation).
Split name of the dataset (e.g. "train", "selection", or "validation").

Returns
-------
Expand Down
Loading