Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 18 additions & 17 deletions cobra/evaluation/pigs_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def plot_incidence(pig_tables: pd.DataFrame,
ax.set_xlabel('{} bins' ''.format(variable), fontsize=16)
ax.xaxis.set_tick_params(labelsize=14)
plt.setp(ax.get_xticklabels(),
rotation=45, ha="right", rotation_mode="anchor")
rotation=90, ha="right", rotation_mode="anchor")
ax.yaxis.set_tick_params(labelsize=14)

if model_type == "classification":
Expand All @@ -180,21 +180,19 @@ def plot_incidence(pig_tables: pd.DataFrame,
ax.yaxis.set_major_formatter(
FuncFormatter(lambda y, _: '{:.1%}'.format(y)))
elif model_type == "regression":
# If both the difference between the highest avg target of all bins
# versus the global avg target AND the difference between the
# lowest avg target versus the global avg target are both smaller
# than 25% of the global avg target itself, we increase the y
# axis range, to avoid that the minor avg target differences are
# spread out over the configure figure height, suggesting
# incorrectly that there are big differences in avg target across
# the bins and versus the global avg target.
# If the difference between the highest avg. target of all bins
# versus the global avg. target AND the difference between the
# lowest avg. target versus the global avg. target are both smaller
# than 25% of the global avg. target itself, we increase the
# y-axis range, to avoid that the minor avg. target differences are
# spread out over the configured figure height, suggesting
# incorrectly that there are big differences in avg. target across
# the bins and versus the global avg. target.
# (Motivation for the AND above: if on one end there IS enough
# difference, the effect that we discuss here does not occur.)
global_avg_target = max(df_plot['global_avg_target']) # series of same number, for every bin.
if (np.abs((max(df_plot['avg_target']) - global_avg_target))
/ global_avg_target < 0.25) \
and (np.abs((min(df_plot['avg_target']) - global_avg_target))
/ global_avg_target < 0.25):
global_avg_target = max(df_plot['global_avg_target']) # a series of the same number, repeated for every bin.
if ((np.abs((max(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25)
and (np.abs((min(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25)):
ax.set_ylim(global_avg_target * 0.75,
global_avg_target * 1.25)

Expand All @@ -213,7 +211,7 @@ def plot_incidence(pig_tables: pd.DataFrame,

# Set labels & ticks
ax2.set_xlabel('{} bins' ''.format(variable), fontsize=16)
ax2.xaxis.set_tick_params(rotation=45, labelsize=14)
ax2.xaxis.set_tick_params(rotation=90, labelsize=14)

ax2.yaxis.set_tick_params(labelsize=14)
ax2.yaxis.set_major_formatter(
Expand All @@ -234,9 +232,9 @@ def plot_incidence(pig_tables: pd.DataFrame,
title = "Incidence plot - " + variable
else:
title = "Mean target plot - " + variable
fig.suptitle(title, fontsize=22, y=1.02)
fig.suptitle(title, fontsize=22)
ax.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102),
loc=3, ncol=3, mode="expand", borderaxespad=0.,
loc=3, ncol=1, mode="expand", borderaxespad=0.,
prop={"size": 14})

# Set order of layers
Expand All @@ -245,5 +243,8 @@ def plot_incidence(pig_tables: pd.DataFrame,

del df_plot

plt.tight_layout()
plt.margins(0.01)

# Show
plt.show()
54 changes: 39 additions & 15 deletions cobra/model_building/forward_selection.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@

import logging
from typing import Callable, Optional

import pandas as pd
from tqdm.auto import tqdm
Expand Down Expand Up @@ -29,7 +30,7 @@ class ForwardFeatureSelection:
more or less with the maximum number of steps in the forward feature
selection.
pos_only : bool
Whether or not the model coefficients should all be positive.
Whether or not the model coefficients should all be positive (no sign flips).
self._fitted_models : list
List of fitted models.
"""
Expand Down Expand Up @@ -76,8 +77,8 @@ def get_model_from_step(self, step: int):

def compute_model_performances(self, data: pd.DataFrame,
target_column_name: str,
splits: list = ["train", "selection",
"validation"]
splits: list = ["train", "selection", "validation"],
metric: Optional[Callable] = None,
) -> pd.DataFrame:
"""Compute for each model the performance for different sets (e.g.
train-selection-validation) and return them along with a list of
Expand All @@ -93,6 +94,13 @@ def compute_model_performances(self, data: pd.DataFrame,
Name of the target column.
splits : list, optional
List of splits to compute performance on.
metric: Callable (function), optional
Function that computes an evaluation metric to evaluate the model's
performance, instead of the default metric (AUC for
classification, RMSE for regression).
The function should accept y_true and y_pred arguments.
Metric functions from sklearn can be used; for example, see
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.

Returns
-------
Expand All @@ -117,7 +125,8 @@ def compute_model_performances(self, data: pd.DataFrame,
f"{split}_performance": model.evaluate(
data[data["split"] == split],
data[data["split"] == split][target_column_name],
split=split # parameter used for caching
split=split, # parameter used for caching
metric=metric
)
for split in splits
})
Expand All @@ -139,7 +148,9 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
Parameters
----------
train_data : pd.DataFrame
Data on which to fit the model.
Data on which to fit the model. The "train" split is used to
train each model, while the "selection" split is used to evaluate
candidate models during the forward feature selection.
target_column_name : str
Name of the target column.
predictors : list
Expand Down Expand Up @@ -178,12 +189,14 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
filtered_predictors,
forced_predictors)

def _forward_selection(self, train_data: pd.DataFrame,
target_column_name: str, predictors: list,
def _forward_selection(self,
train_data: pd.DataFrame,
target_column_name: str,
predictors: list,
forced_predictors: list = []) -> list:
"""Perform the forward feature selection algorithm to compute a list
of models (with increasing performance). The length of the list,
i.e. the number of models is bounded by the max_predictors class
i.e. the number of models, is bounded by the max_predictors class
attribute.

Parameters
Expand All @@ -208,10 +221,11 @@ def _forward_selection(self, train_data: pd.DataFrame,

max_steps = 1 + min(self.max_predictors,
len(predictors) + len(forced_predictors))

for step in tqdm(range(1, max_steps), desc="Sequentially adding best "
"predictor..."):
if step <= len(forced_predictors):
# first, we go through forced predictors
# first, we go through the forced predictors
candidate_predictors = [var for var in forced_predictors
if var not in current_predictors]
else:
Expand All @@ -230,13 +244,19 @@ def _forward_selection(self, train_data: pd.DataFrame,
.union(set(model.predictors)))

fitted_models.append(model)
# else:
# # If model returns None for the first time,
# # one can in theory stop the feature selection process
# # but we let it run so that tqdm finishes cleanly
# break

if not fitted_models:
log.error("No models found in forward selection")
log.error("No models found in forward selection.")

return fitted_models

def _find_next_best_model(self, train_data: pd.DataFrame,
def _find_next_best_model(self,
train_data: pd.DataFrame,
target_column_name: str,
candidate_predictors: list,
current_predictors: list):
Expand Down Expand Up @@ -272,15 +292,19 @@ def _find_next_best_model(self, train_data: pd.DataFrame,
"for the given model_type specified as "
"ForwardFeatureSelection argument.")

fit_data = train_data[train_data["split"] == "train"] # data to fit the models with
sel_data = train_data[train_data["split"] == "selection"] # data to compare the models with

for pred in candidate_predictors:
# Train a model with an additional predictor
model = self._train_model(train_data, target_column_name,
model = self._train_model(fit_data, target_column_name,
(current_predictors + [pred]))

# Evaluate the model
performance = (model
.evaluate(train_data[current_predictors + [pred]],
train_data[target_column_name],
split="train"))
.evaluate(sel_data[current_predictors + [pred]],
sel_data[target_column_name],
split="selection"))

if self.pos_only and (not (model.get_coef() >= 0).all()):
continue
Expand Down
69 changes: 45 additions & 24 deletions cobra/model_building/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@

# third party imports
from typing import Callable, Optional

import numpy as np
import pandas as pd
from scipy import stats
Expand Down Expand Up @@ -83,12 +85,12 @@ def deserialize(self, model_dict: dict):
self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"]

def get_coef(self) -> np.array:
"""Returns the model coefficients
"""Returns the model coefficients.

Returns
-------
np.array
array of model coefficients
Array of model coefficients.
"""
return self.logit.coef_[0]

Expand Down Expand Up @@ -144,7 +146,8 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray:
return self.logit.predict_proba(X[self.predictors])[:, 1]

def evaluate(self, X: pd.DataFrame, y: pd.Series,
split: str=None) -> float:
split: str=None,
metric: Optional[Callable]=None) -> float:
"""Evaluate the model on a given data set (X, y). The optional split
parameter is to indicate that the data set belongs to
(train, selection, validation), so that the computation on these sets
Expand All @@ -157,19 +160,28 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series,
y : pd.Series
Dataset containing the target of each observation.
split : str, optional
Split of the dataset (e.g. train-selection-validation).
Split name of the dataset (e.g. "train", "selection", or "validation").
metric: Callable (function), optional
Function that computes an evaluation metric to evaluate the model's
performance, instead of the default metric (AUC).
The function should accept y_true and y_pred arguments.
Metric functions from sklearn can be used; for example, see
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.

Returns
-------
float
The performance score of the model (AUC).
The performance score of the model (AUC by default).
"""

if (split is None) or (split not in self._eval_metrics_by_split):

y_pred = self.score_model(X)

performance = roc_auc_score(y_true=y, y_score=y_pred)
if metric is None:
performance = roc_auc_score(y_true=y, y_score=y_pred)
else:
performance = metric(y_true=y, y_pred=y_pred)

if split is None:
return performance
Expand Down Expand Up @@ -249,12 +261,12 @@ def __init__(self):
self._eval_metrics_by_split = {}

def serialize(self) -> dict:
"""Serialize model as JSON
"""Serialize model as JSON.

Returns
-------
dict
dictionary containing the serialized JSON
Dictionary containing the serialized JSON.
"""
serialized_model = {
"meta": "linear-regression",
Expand All @@ -272,7 +284,7 @@ def serialize(self) -> dict:
return serialized_model

def deserialize(self, model_dict: dict):
"""Deserialize a model previously stored as JSON
"""Deserialize a model previously stored as JSON.

Parameters
----------
Expand All @@ -282,7 +294,7 @@ def deserialize(self, model_dict: dict):
Raises
------
ValueError
In case JSON file is no valid serialized model
In case the JSON file is not a valid serialized model.
"""

if not self._is_valid_dict(model_dict):
Expand All @@ -296,37 +308,37 @@ def deserialize(self, model_dict: dict):
self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"]

def get_coef(self) -> np.array:
"""Returns the model coefficients
"""Returns the model coefficients.

Returns
-------
np.array
array of model coefficients
Array of model coefficients.
"""
return self.linear.coef_[0]
return self.linear.coef_

def get_intercept(self) -> float:
"""Returns the intercept of the model
"""Returns the intercept of the model.

Returns
-------
float
intercept of the model
Intercept of the model.
"""
return self.linear.intercept_[0]

def get_coef_by_predictor(self) -> dict:
"""Returns a dictionary mapping predictor (key) to coefficient (value)
"""Returns a dictionary mapping predictor (key) to coefficient (value).

Returns
-------
dict
map ``{predictor: coefficient}``
A map ``{predictor: coefficient}``.
"""
return dict(zip(self.predictors, self.linear.coef_[0]))
return dict(zip(self.predictors, self.linear.coef_))

def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
"""Fit the model
"""Fit the model.

Parameters
----------
Expand Down Expand Up @@ -357,7 +369,8 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray:
return self.linear.predict(X[self.predictors])

def evaluate(self, X: pd.DataFrame, y: pd.Series,
split: str=None) -> float:
split: str=None,
metric: Optional[Callable]=None) -> float:
"""Evaluate the model on a given data set (X, y). The optional split
parameter is to indicate that the data set belongs to
(train, selection, validation), so that the computation on these sets
Expand All @@ -370,19 +383,27 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series,
y : pd.Series
Dataset containing the target of each observation.
split : str, optional
Split of the dataset (e.g. train-selection-validation).
Split name of the dataset (e.g. "train", "selection", or "validation").
metric: Callable (function), optional
Function that computes an evaluation metric to evaluate the model's
performance, instead of the default metric (RMSE).
The function should accept y_true and y_pred arguments.
Metric functions from sklearn can be used; for example, see
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.

Returns
-------
float
The performance score of the model (RMSE).
The performance score of the model (RMSE by default).
"""

if (split is None) or (split not in self._eval_metrics_by_split):

y_pred = self.score_model(X)

performance = sqrt(mean_squared_error(y_true=y, y_pred=y_pred))
if metric is None:
performance = sqrt(mean_squared_error(y_true=y, y_pred=y_pred))
else:
performance = metric(y_true=y, y_pred=y_pred)

if split is None:
return performance
Expand Down
Loading