PythonPredictions · sborms · Oct 1, 2021 · Sep 30, 2021 · Sep 30, 2021 · Sep 30, 2021
diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py
@@ -40,7 +40,7 @@ class ClassificationEvaluator():
     cumulative_gains : tuple
         Data for plotting cumulative gains curve.
     evaluation_metrics : dict
-        Map containing various scalar evaluation metrics (precision, recall, accuracy, AUC, F1, etc.)
+        Map containing various scalar evaluation metrics (precision, recall, accuracy, AUC, F1, etc.).
     lift_at : float
         Parameter to determine at which top level percentage the lift of the
         model should be computed.
@@ -191,7 +191,7 @@ def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)):
             ax.set_xlabel("False Positive Rate", fontsize=15)
             ax.set_ylabel("True Positive Rate", fontsize=15)
             ax.legend(loc="lower right")
-            ax.set_title("ROC Curve", fontsize=20)
+            ax.set_title("ROC curve", fontsize=20)
 
             if path:
                 plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
@@ -274,7 +274,7 @@ def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)):
             ax.grid(False)
 
             # Description
-            ax.set_title("Cumulative response", fontsize=20)
+            ax.set_title("Cumulative Response curve", fontsize=20)
 
             if path is not None:
                 plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
@@ -323,7 +323,7 @@ def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)):
             ax.grid(False)
 
             # Description
-            ax.set_title("Cumulative Lift", fontsize=20)
+            ax.set_title("Cumulative Lift curve", fontsize=20)
 
             if path is not None:
                 plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
@@ -350,7 +350,7 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)):
             ax.plot(ax.get_xlim(), ax.get_ylim(), linewidth=3,
                     ls="--", color="darkorange", label="random selection")
 
-            ax.set_title("Cumulative Gains", fontsize=20)
+            ax.set_title("Cumulative Gains curve", fontsize=20)
 
             # Format axes
             ax.set_xlim([0, 100])
@@ -681,7 +681,7 @@ def plot_predictions(self, path: str=None, dim: tuple=(12, 8)):
             ax.set_xlabel("Index", fontsize=15)
             ax.set_ylabel("Value", fontsize=15)
             ax.legend(loc="best")
-            ax.set_title("Prediction Plot", fontsize=20)
+            ax.set_title("Predictions vs. Actuals", fontsize=20)
 
             if path:
                 plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
@@ -722,7 +722,7 @@ def plot_qq(self, path: str=None, dim: tuple=(12, 8)):
             ax.set_yticks(range(int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")])))+1, 1))
 
             ax.legend(loc="best")
-            ax.set_title("Q-Q Plot", fontsize=20)
+            ax.set_title("Q-Q plot", fontsize=20)
 
             if path:
                 plt.savefig(path, format="png", dpi=300, bbox_inches="tight")

diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py
@@ -180,21 +180,19 @@ def plot_incidence(pig_tables: pd.DataFrame,
             ax.yaxis.set_major_formatter(
                 FuncFormatter(lambda y, _: '{:.1%}'.format(y)))
         elif model_type == "regression":
-            # If both the difference between the highest avg target of all bins
-            # versus the global avg target AND the difference between the
-            # lowest avg target versus the global avg target are both smaller
-            # than 25% of the global avg target itself, we increase the y
-            # axis range, to avoid that the minor avg target differences are
-            # spread out over the configure figure height, suggesting
-            # incorrectly that there are big differences in avg target across
-            # the bins and versus the global avg target.
+            # If the difference between the highest avg. target of all bins
+            # and the global avg. target AND the difference between the
+            # lowest avg. target and the global avg. target are both smaller
+            # than 25% of the global avg. target itself, we widen the
+            # y-axis range. Otherwise the minor avg. target differences would
+            # be stretched over the configured figure height, incorrectly
+            # suggesting that there are big differences in avg. target across
+            # the bins and versus the global avg. target.
             # (Motivation for the AND above: if on one end there IS enough
             # difference, the effect that we discuss here does not occur.)
-            global_avg_target = max(df_plot['global_avg_target']) # series of same number, for every bin.
-            if (np.abs((max(df_plot['avg_target']) - global_avg_target))
-                    / global_avg_target < 0.25) \
-                and (np.abs((min(df_plot['avg_target']) - global_avg_target))
-                    / global_avg_target < 0.25):
+            global_avg_target = max(df_plot['global_avg_target'])  # series of same number, for every bin.
+            if ((np.abs((max(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25)
+                    and (np.abs((min(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25)):
                 ax.set_ylim(global_avg_target * 0.75,
                             global_avg_target * 1.25)
 
@@ -234,9 +232,9 @@ def plot_incidence(pig_tables: pd.DataFrame,
             title = "Incidence plot - " + variable
         else:
             title = "Mean target plot - " + variable
-        fig.suptitle(title, fontsize=22, y=1.02)
+        fig.suptitle(title, fontsize=22)
         ax.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102),
-                  loc=3, ncol=3, mode="expand", borderaxespad=0.,
+                  loc=3, ncol=1, mode="expand", borderaxespad=0.,
                   prop={"size": 14})
 
         # Set order of layers
@@ -245,5 +243,8 @@ def plot_incidence(pig_tables: pd.DataFrame,
 
         del df_plot
 
+        plt.tight_layout()
+        plt.margins(0.01)
+
         # Show
         plt.show()
diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py
@@ -29,7 +29,7 @@ class ForwardFeatureSelection:
         more or less with the maximum number of steps in the forward feature
         selection.
     pos_only : bool
-        Whether or not the model coefficients should all be positive.
+        Whether or not the model coefficients should all be positive (no sign flips).
     self._fitted_models : list
         List of fitted models.
     """
@@ -76,8 +76,7 @@ def get_model_from_step(self, step: int):
 
     def compute_model_performances(self, data: pd.DataFrame,
                                    target_column_name: str,
-                                   splits: list = ["train", "selection",
-                                                   "validation"]
+                                   splits: list=["train", "selection", "validation"]
                                    ) -> pd.DataFrame:
         """Compute for each model the performance for different sets (e.g.
         train-selection-validation) and return them along with a list of
@@ -111,7 +110,7 @@ def compute_model_performances(self, data: pd.DataFrame,
                 "last_added_predictor": list(last_added_predictor)[0]
             }
 
-            # Evaluate model on each data set split,
+            # Evaluate model on each dataset split,
             # e.g. train-selection-validation
             tmp.update({
                 f"{split}_performance": model.evaluate(
@@ -139,7 +138,11 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
         Parameters
         ----------
         train_data : pd.DataFrame
-            Data on which to fit the model.
+            Data on which to fit the model. Must contain a "split" column
+            with both "train" and "selection" values. The "train" rows
+            are used to fit each candidate model; the "selection" rows
+            are used to evaluate the candidates and decide which model
+            to keep in each step of the forward feature selection.
         target_column_name : str
             Name of the target column.
         predictors : list
@@ -155,6 +158,12 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
             In case the number of forced predictors is larger than the maximum
             number of allowed predictors in the model.
         """
+
+        assert "split" in train_data.columns, "The train_data input df does not include a split column."
+        print(train_data["split"].unique())
+        assert len(set(["train", "selection"]).difference(set(train_data["split"].unique()))) == 0, \
+            "The train_data input df does not include a 'train' and 'selection' split."
+
         # remove excluded predictors from predictor lists
         filtered_predictors = [var for var in predictors
                                if (var not in excluded_predictors and
@@ -163,13 +172,13 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
         # checks on predictor lists and self.max_predictors attr
         if len(forced_predictors) > self.max_predictors:
             raise ValueError("Size of forced_predictors cannot be bigger than "
-                             "max_predictors")
+                             "max_predictors.")
         elif len(forced_predictors) == self.max_predictors:
             log.info("Size of forced_predictors equals max_predictors "
                      "only one model will be trained...")
             # train model with all forced_predictors (only)
             (self._fitted_models
-             .append(self._train_model(train_data,
+             .append(self._train_model(train_data[train_data["split"] == "train"],
                                        target_column_name,
                                        forced_predictors)))
         else:
@@ -178,12 +187,14 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
                                                           filtered_predictors,
                                                           forced_predictors)
 
-    def _forward_selection(self, train_data: pd.DataFrame,
-                           target_column_name: str, predictors: list,
+    def _forward_selection(self,
+                           train_data: pd.DataFrame,
+                           target_column_name: str,
+                           predictors: list,
                            forced_predictors: list = []) -> list:
         """Perform the forward feature selection algorithm to compute a list
         of models (with increasing performance). The length of the list,
-        i.e. the number of models is bounded by the max_predictors class
+        i.e. the number of models, is bounded by the max_predictors class
         attribute.
 
         Parameters
@@ -208,10 +219,11 @@ def _forward_selection(self, train_data: pd.DataFrame,
 
         max_steps = 1 + min(self.max_predictors,
                             len(predictors) + len(forced_predictors))
+
         for step in tqdm(range(1, max_steps), desc="Sequentially adding best "
                                                    "predictor..."):
             if step <= len(forced_predictors):
-                # first, we go through forced predictors
+                # first, we go through the forced predictors
                 candidate_predictors = [var for var in forced_predictors
                                         if var not in current_predictors]
             else:
@@ -230,13 +242,19 @@ def _forward_selection(self, train_data: pd.DataFrame,
                                           .union(set(model.predictors)))
 
                 fitted_models.append(model)
+            # else:
+            #     # If model returns None for the first time,
+            #     # one can in theory stop the feature selection process
+            #     # but we let it run so that tqdm finishes cleanly
+            #     break
 
         if not fitted_models:
-            log.error("No models found in forward selection")
+            log.error("No models found in forward selection.")
 
         return fitted_models
 
-    def _find_next_best_model(self, train_data: pd.DataFrame,
+    def _find_next_best_model(self,
+                              train_data: pd.DataFrame,
                               target_column_name: str,
                               candidate_predictors: list,
                               current_predictors: list):
@@ -272,15 +290,19 @@ def _find_next_best_model(self, train_data: pd.DataFrame,
                              "for the given model_type specified as "
                              "ForwardFeatureSelection argument.")
 
+        fit_data = train_data[train_data["split"] == "train"]  # data to fit the models with
+        sel_data = train_data[train_data["split"] == "selection"]  # data to compare the models with
+
         for pred in candidate_predictors:
             # Train a model with an additional predictor
-            model = self._train_model(train_data, target_column_name,
+            model = self._train_model(fit_data, target_column_name,
                                       (current_predictors + [pred]))
+
             # Evaluate the model
             performance = (model
-                           .evaluate(train_data[current_predictors + [pred]],
-                                     train_data[target_column_name],
-                                     split="train"))
+                           .evaluate(sel_data[current_predictors + [pred]],
+                                     sel_data[target_column_name],
+                                     split="selection"))
 
             if self.pos_only and (not (model.get_coef() >= 0).all()):
                 continue

diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py
@@ -83,12 +83,12 @@ def deserialize(self, model_dict: dict):
         self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"]
 
     def get_coef(self) -> np.array:
-        """Returns the model coefficients
+        """Returns the model coefficients.
 
         Returns
         -------
         np.array
-            array of model coefficients
+            Array of model coefficients.
         """
         return self.logit.coef_[0]
 
@@ -157,7 +157,7 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series,
         y : pd.Series
             Dataset containing the target of each observation.
         split : str, optional
-            Split of the dataset (e.g. train-selection-validation).
+            Split name of the dataset (e.g. "train", "selection", or "validation").
 
         Returns
         -------
@@ -249,12 +249,12 @@ def __init__(self):
         self._eval_metrics_by_split = {}
 
     def serialize(self) -> dict:
-        """Serialize model as JSON
+        """Serialize model as JSON.
 
         Returns
         -------
         dict
-            dictionary containing the serialized JSON
+            Dictionary containing the serialized JSON.
         """
         serialized_model = {
             "meta": "linear-regression",
@@ -272,7 +272,7 @@ def serialize(self) -> dict:
         return serialized_model
 
     def deserialize(self, model_dict: dict):
-        """Deserialize a model previously stored as JSON
+        """Deserialize a model previously stored as JSON.
 
         Parameters
         ----------
@@ -282,7 +282,7 @@ def deserialize(self, model_dict: dict):
         Raises
         ------
         ValueError
-            In case JSON file is no valid serialized model
+            In case the JSON file is not a valid serialized model.
         """
 
         if not self._is_valid_dict(model_dict):
@@ -296,37 +296,37 @@ def deserialize(self, model_dict: dict):
         self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"]
 
     def get_coef(self) -> np.array:
-        """Returns the model coefficients
+        """Returns the model coefficients.
 
         Returns
         -------
         np.array
-            array of model coefficients
+            Array of model coefficients.
         """
-        return self.linear.coef_[0]
+        return self.linear.coef_
 
     def get_intercept(self) -> float:
-        """Returns the intercept of the model
+        """Returns the intercept of the model.
 
         Returns
         -------
         float
-            intercept of the model
+            Intercept of the model.
         """
         return self.linear.intercept_[0]
 
     def get_coef_by_predictor(self) -> dict:
-        """Returns a dictionary mapping predictor (key) to coefficient (value)
+        """Returns a dictionary mapping predictor (key) to coefficient (value).
 
         Returns
         -------
         dict
-            map ``{predictor: coefficient}``
+            A map ``{predictor: coefficient}``.
         """
-        return dict(zip(self.predictors, self.linear.coef_[0]))
+        return dict(zip(self.predictors, self.linear.coef_))
 
     def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
-        """Fit the model
+        """Fit the model.
 
         Parameters
         ----------
@@ -370,7 +370,7 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series,
         y : pd.Series
             Dataset containing the target of each observation.
         split : str, optional
-            Split of the dataset (e.g. train-selection-validation).
+            Split name of the dataset (e.g. "train", "selection", or "validation").
 
         Returns
         -------