Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cobra/model_building/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
from .univariate_selection import get_preselected_predictors
from .univariate_selection import compute_correlations

from .models import LogisticRegressionModel
from .models import LogisticRegressionModel, LinearRegressionModel
from .forward_selection import ForwardFeatureSelection

__all__ = ['compute_univariate_preselection',
'get_preselected_predictors',
'compute_correlations',
'LogisticRegressionModel',
'LinearRegressionModel',
'ForwardFeatureSelection']
95 changes: 53 additions & 42 deletions cobra/model_building/forward_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
import pandas as pd
from tqdm.auto import tqdm

from cobra.model_building import LogisticRegressionModel as MLModel
# from cobra.model_building import LogisticRegressionModel as MLModel
from cobra.model_building import LogisticRegressionModel, LinearRegressionModel

log = logging.getLogger(__name__)

Expand All @@ -15,36 +16,46 @@ class ForwardFeatureSelection:

Attributes
----------
model_type : str
Model type (``classification`` or ``regression``).
max_predictors : int
maximum number of predictors allowed in any model. This corresponds
Maximum number of predictors allowed in any model. This corresponds
more or less with the maximum number of steps in the forward feature
selection
selection.
model_name : str
name of the model to use for forward feature selection
Name of the model to use for forward feature selection.
pos_only : bool
whether or not the model coefficients should all be positive
Whether or not the model coefficients should all be positive.
"""

def __init__(self, max_predictors: int=50,
model_name: str="logistic-regression", pos_only: bool=True):
def __init__(self,
model_type: str="classification",
max_predictors: int=50,
model_name: str="logistic-regression",
pos_only: bool=True):

if model_type == "classification":
self.MLModel = LogisticRegressionModel
elif model_type == "regression":
self.MLModel = LinearRegressionModel

self.pos_only = pos_only
self.max_predictors = max_predictors
self.model_name = model_name
self.pos_only = pos_only

self._fitted_models = []

def get_model_from_step(self, step: int) -> MLModel:
def get_model_from_step(self, step: int):
"""Get fitted model from a particular step

Parameters
----------
step : int
Particular step in the forward selection
Particular step in the forward selection.

Returns
-------
MLModel
self.MLModel
Fitted model from the given step

Raises
Expand All @@ -59,8 +70,7 @@ def get_model_from_step(self, step: int) -> MLModel:

def compute_model_performances(self, data: pd.DataFrame,
target_column_name: str,
splits: list=["train", "selection",
"validation"]
splits: list = ["train", "selection", "validation"]
) -> pd.DataFrame:
"""Compute for each model the performance for different sets (e.g.
train-selection-validation) and return them along with a list of
Expand All @@ -71,11 +81,11 @@ def compute_model_performances(self, data: pd.DataFrame,
Parameters
----------
data : pd.DataFrame
dataset for which to compute performance of each model
Dataset for which to compute performance of each model.
target_column_name : str
name of the target column
Name of the target column.
splits : list, optional
list of splits to compute performance on
List of splits to compute performance on.

Returns
-------
Expand Down Expand Up @@ -112,27 +122,27 @@ def compute_model_performances(self, data: pd.DataFrame,
return pd.DataFrame(results)

def fit(self, train_data: pd.DataFrame, target_column_name: str,
predictors: list, forced_predictors: list=[],
excluded_predictors: list=[]):
predictors: list, forced_predictors: list = [],
excluded_predictors: list = []):
"""Fit the forward feature selection estimator

Parameters
----------
train_data : pd.DataFrame
Data on which to fit the model
Data on which to fit the model.
target_column_name : str
Name of the target column
Name of the target column.
predictors : list
List of predictors on which to train the estimator
List of predictors on which to train the estimator.
forced_predictors : list, optional
List of predictors to force in the estimator
List of predictors to force in the estimator.
excluded_predictors : list, optional
List of predictors to exclude from the estimator
List of predictors to exclude from the estimator.

Raises
------
ValueError
In case the number of forced predictors is larger than the maximum
in case the number of forced predictors is larger than the maximum
number of allowed predictors in the model
"""
# remove excluded predictors from predictor lists
Expand Down Expand Up @@ -169,13 +179,13 @@ def _forward_selection(self, train_data: pd.DataFrame,
Parameters
----------
train_data : pd.DataFrame
Data on which to fit the model
Data on which to fit the model.
target_column_name : str
Name of the target column
Name of the target column.
predictors : list
List of predictors on which to train the models
List of predictors on which to train the models.
forced_predictors : list, optional
List of predictors to force in the models
List of predictors to force in the models.

Returns
-------
Expand Down Expand Up @@ -219,26 +229,26 @@ def _forward_selection(self, train_data: pd.DataFrame,
def _find_next_best_model(self, train_data: pd.DataFrame,
target_column_name: str,
candidate_predictors: list,
current_predictors: list) -> MLModel:
"""Given a list of current predictors which are already to selected to
current_predictors: list):
"""Given a list of current predictors which are already selected to
be included in the model, find amongst a list of candidate predictors
the predictor to add to the selected list so that the resulting model
has the best performance.

Parameters
----------
train_data : pd.DataFrame
Data on which to fit the model
Data on which to fit the model.
target_column_name : str
Name of the target column
Name of the target column.
candidate_predictors : list
List of candidate predictors to test
List of candidate predictors to test.
current_predictors : list
List of predictors on which to train the models
List of predictors on which to train the models.

Returns
-------
MLModel
self.MLModel
Best performing model
"""
# placeholders
Expand Down Expand Up @@ -268,24 +278,25 @@ def _find_next_best_model(self, train_data: pd.DataFrame,
return best_model

def _train_model(self, train_data: pd.DataFrame, target_column_name: str,
predictors: list) -> MLModel:
"""Train the model with a given set of predictors
predictors: list):
"""Train the model with a given set of predictors.

Parameters
----------
train_data : pd.DataFrame
Data on which to fit the model
Data on which to fit the model.
target_column_name : str
Name of the target column
Name of the target column.
predictors : list
List of predictors on which to train the models
List of predictors on which to train the models.

Returns
-------
MLModel
self.MLModel
trained model
"""
model = MLModel()
# model = MLModel()
model = self.MLModel()

model.fit(train_data[predictors], train_data[target_column_name])

Expand Down
Loading