diff --git a/cobra/model_building/__init__.py b/cobra/model_building/__init__.py index 63d074b..7a646c3 100644 --- a/cobra/model_building/__init__.py +++ b/cobra/model_building/__init__.py @@ -2,11 +2,12 @@ from .univariate_selection import get_preselected_predictors from .univariate_selection import compute_correlations -from .models import LogisticRegressionModel +from .models import LogisticRegressionModel, LinearRegressionModel from .forward_selection import ForwardFeatureSelection __all__ = ['compute_univariate_preselection', 'get_preselected_predictors', 'compute_correlations', 'LogisticRegressionModel', + 'LinearRegressionModel', 'ForwardFeatureSelection'] diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index 58b7620..5f35385 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -3,7 +3,8 @@ import pandas as pd from tqdm.auto import tqdm -from cobra.model_building import LogisticRegressionModel as MLModel +# from cobra.model_building import LogisticRegressionModel as MLModel +from cobra.model_building import LogisticRegressionModel, LinearRegressionModel log = logging.getLogger(__name__) @@ -15,36 +16,46 @@ class ForwardFeatureSelection: Attributes ---------- + model_type : str + Model type (``classification`` or ``regression``). max_predictors : int - maximum number of predictors allowed in any model. This corresponds + Maximum number of predictors allowed in any model. This corresponds more or less with the maximum number of steps in the forward feature - selection + selection. model_name : str - name of the model to use for forward feature selection + Name of the model to use for forward feature selection. pos_only : bool - whether or not the model coefficients should all be positive + Whether or not the model coefficients should all be positive. 
""" - def __init__(self, max_predictors: int=50, - model_name: str="logistic-regression", pos_only: bool=True): + def __init__(self, + model_type: str="classification", + max_predictors: int=50, + model_name: str="logistic-regression", + pos_only: bool=True): + + if model_type == "classification": + self.MLModel = LogisticRegressionModel + elif model_type == "regression": + self.MLModel = LinearRegressionModel - self.pos_only = pos_only self.max_predictors = max_predictors self.model_name = model_name + self.pos_only = pos_only self._fitted_models = [] - def get_model_from_step(self, step: int) -> MLModel: + def get_model_from_step(self, step: int): """Get fitted model from a particular step Parameters ---------- step : int - Particular step in the forward selection + Particular step in the forward selection. Returns ------- - MLModel + self.MLModel Fitted model from the given step Raises @@ -59,8 +70,7 @@ def get_model_from_step(self, step: int) -> MLModel: def compute_model_performances(self, data: pd.DataFrame, target_column_name: str, - splits: list=["train", "selection", - "validation"] + splits: list = ["train", "selection", "validation"] ) -> pd.DataFrame: """Compute for each model the performance for different sets (e.g. train-selection-validation) and return them along with a list of @@ -71,11 +81,11 @@ def compute_model_performances(self, data: pd.DataFrame, Parameters ---------- data : pd.DataFrame - dataset for which to compute performance of each model + Dataset for which to compute performance of each model. target_column_name : str - name of the target column + Name of the target column. splits : list, optional - list of splits to compute performance on + List of splits to compute performance on. 
Returns ------- @@ -112,27 +122,27 @@ def compute_model_performances(self, data: pd.DataFrame, return pd.DataFrame(results) def fit(self, train_data: pd.DataFrame, target_column_name: str, - predictors: list, forced_predictors: list=[], - excluded_predictors: list=[]): + predictors: list, forced_predictors: list = [], + excluded_predictors: list = []): """Fit the forward feature selection estimator Parameters ---------- data : pd.DataFrame - Data on which to fit the model + Data on which to fit the model. target_column_name : str - Name of the target column + Name of the target column. predictors : list - List of predictors on which to train the estimator + List of predictors on which to train the estimator. forced_predictors : list, optional - List of predictors to force in the estimator + List of predictors to force in the estimator. excluded_predictors : list, optional - List of predictors to exclude from the estimator + List of predictors to exclude from the estimator. Raises ------ ValueError - In case the number of forced predictors is larger than the maximum + in case the number of forced predictors is larger than the maximum number of allowed predictors in the model """ # remove excluded predictors from predictor lists @@ -169,13 +179,13 @@ def _forward_selection(self, train_data: pd.DataFrame, Parameters ---------- train_data : pd.DataFrame - Data on which to fit the model + Data on which to fit the model. target_column_name : str - Name of the target column + Name of the target column. predictors : list - List of predictors on which to train the models + List of predictors on which to train the models. forced_predictors : list, optional - List of predictors to force in the models + List of predictors to force in the models. 
Returns ------- @@ -219,8 +229,8 @@ def _forward_selection(self, train_data: pd.DataFrame, def _find_next_best_model(self, train_data: pd.DataFrame, target_column_name: str, candidate_predictors: list, - current_predictors: list) -> MLModel: - """Given a list of current predictors which are already to selected to + current_predictors: list): + """Given a list of current predictors which are already selected to be include in the model, Find amongst a list candidate predictors the predictor to add to the selected list so that the resulting model has the best performance. @@ -228,17 +238,17 @@ def _find_next_best_model(self, train_data: pd.DataFrame, Parameters ---------- train_data : pd.DataFrame - Data on which to fit the model + Data on which to fit the model. target_column_name : str - Name of the target column + Name of the target column. candidate_predictors : list - List of candidate predictors to test + List of candidate predictors to test. current_predictors : list - List of predictors on which to train the models + List of predictors on which to train the models. Returns ------- - MLModel + self.MLModel Best performing model """ # placeholders @@ -268,24 +278,25 @@ def _find_next_best_model(self, train_data: pd.DataFrame, return best_model def _train_model(self, train_data: pd.DataFrame, target_column_name: str, - predictors: list) -> MLModel: - """Train the model with a given set of predictors + predictors: list): + """Train the model with a given set of predictors. Parameters ---------- train_data : pd.DataFrame - Data on which to fit the model + Data on which to fit the model. target_column_name : str - Name of the target column + Name of the target column. predictors : list - List of predictors on which to train the models + List of predictors on which to train the models. 
Returns ------- - MLModel + self.MLModel trained model """ - model = MLModel() + # model = MLModel() + model = self.MLModel() model.fit(train_data[predictors], train_data[target_column_name]) diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index b326111..3207231 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -2,8 +2,9 @@ import numpy as np import pandas as pd from scipy import stats -from sklearn.metrics import roc_auc_score -from sklearn.linear_model import LogisticRegression +from sklearn.metrics import roc_auc_score, mean_squared_error +from numpy import sqrt +from sklearn.linear_model import LogisticRegression, LinearRegression # custom imports import cobra.utils as utils @@ -11,15 +12,15 @@ class LogisticRegressionModel: """Wrapper around the LogisticRegression class, with additional methods - implemented such as evaluation (using auc), getting a list of coefficients, - a ditionary of coefficients per predictor, ... for convenience + implemented such as evaluation (using AUC), getting a list of coefficients, + a dictionary of coefficients per predictor, ... for convenience. Attributes ---------- logit : LogisticRegression - scikit-learn logistic regression model + scikit-learn logistic regression model. predictors : list - List of predictors used in the model + List of predictors used in the model. """ def __init__(self): @@ -61,7 +62,7 @@ def deserialize(self, model_dict: dict): Parameters ---------- model_dict : dict - Serialized JSON file as a dict + Serialized JSON file as a dict. 
Raises ------ @@ -74,10 +75,10 @@ def deserialize(self, model_dict: dict): self.logit = LogisticRegression() self.logit.set_params(**model_dict["params"]) - self.logit.classes_ = np.array(model_dict['classes_']) - self.logit.coef_ = np.array(model_dict['coef_']) - self.logit.intercept_ = np.array(model_dict['intercept_']) - self.logit.n_iter_ = np.array(model_dict['intercept_']) + self.logit.classes_ = np.array(model_dict["classes_"]) + self.logit.coef_ = np.array(model_dict["coef_"]) + self.logit.intercept_ = np.array(model_dict["intercept_"]) + self.logit.n_iter_ = np.array(model_dict["intercept_"]) self.predictors = model_dict["predictors"] self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"] @@ -117,9 +118,9 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series): Parameters ---------- X_train : pd.DataFrame - predictors of train data + Predictors of train data. y_train : pd.Series - target of train data + Target of train data. """ self.predictors = list(X_train.columns) self.logit.fit(X_train, y_train) @@ -131,7 +132,7 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: Parameters ---------- X : pd.DataFrame - dataset of predictors to score the model + Dataset of predictors to score the model. Returns ------- @@ -152,16 +153,16 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, Parameters ---------- X : pd.DataFrame - dataset containing the predictor values for each observation + Dataset containing the predictor values for each observation. y : pd.Series - dataset containig the target of each observation + Dataset containing the target of each observation. split : str, optional - split of the dataset (e.g. train-selection-validation) + Split of the dataset (e.g. train-selection-validation). Returns ------- float - the performance score of the model (e.g. 
class LinearRegressionModel:
    """Wrapper around scikit-learn's ``LinearRegression``.

    Adds convenience methods on top of the estimator: evaluation (using
    RMSE), getting the coefficients as an array or as a dictionary per
    predictor, variable importance and (de)serialization to a
    JSON-compatible dictionary.

    Attributes
    ----------
    linear : LinearRegression
        scikit-learn linear regression model.
    predictors : list
        List of predictors used in the model.
    """

    def __init__(self):
        # ``normalize`` is intentionally not passed: it was deprecated in
        # scikit-learn 1.0 and removed in 1.2, where passing it raises a
        # TypeError. Leaving it out keeps the "no normalization" behaviour.
        self.linear = LinearRegression(fit_intercept=True)
        self._is_fitted = False
        # placeholder to keep track of a list of predictors
        self.predictors = []
        # cache of performance scores, keyed by split name
        self._eval_metrics_by_split = {}

    def serialize(self) -> dict:
        """Serialize model as JSON

        Returns
        -------
        dict
            dictionary containing the serialized JSON. The keys ``coef_``
            and ``intercept_`` are only present once the model is fitted.
        """
        serialized_model = {
            "meta": "linear-regression",
            "predictors": self.predictors,
            "_eval_metrics_by_split": self._eval_metrics_by_split,
            "params": self.linear.get_params()
        }

        if self._is_fitted:
            # np.asarray makes ``tolist`` available whether the fitted
            # attribute is an array, a numpy scalar or a plain float.
            serialized_model.update({
                "coef_": np.asarray(self.linear.coef_).tolist(),
                "intercept_": np.asarray(self.linear.intercept_).tolist()
            })

        return serialized_model

    def deserialize(self, model_dict: dict):
        """Deserialize a model previously stored as JSON

        Parameters
        ----------
        model_dict : dict
            Serialized JSON file as a dict.

        Raises
        ------
        ValueError
            In case JSON file is no valid serialized model
        """
        if not self._is_valid_dict(model_dict):
            raise ValueError("No valid serialized model")

        self.linear = LinearRegression()
        self.linear.set_params(**model_dict["params"])
        self.linear.coef_ = np.array(model_dict["coef_"])
        self.linear.intercept_ = np.array(model_dict["intercept_"])
        self.predictors = model_dict["predictors"]
        self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"]
        # A valid dict always carries coef_ and intercept_, so the restored
        # model is a fitted one; without this flag a later serialize() would
        # silently drop the coefficients again.
        self._is_fitted = True

    def get_coef(self) -> np.ndarray:
        """Returns the model coefficients

        Returns
        -------
        np.ndarray
            array of model coefficients, one per predictor
        """
        # coef_ is 1-D when fitted on a single target and may be 2-D
        # (one row) after deserialization; atleast_2d handles both so that
        # the full coefficient vector is returned, not just its first entry.
        return np.atleast_2d(self.linear.coef_)[0]

    def get_intercept(self) -> float:
        """Returns the intercept of the model

        Returns
        -------
        float
            intercept of the model
        """
        # intercept_ can be a scalar or a one-element array; ravel makes
        # indexing safe for both.
        return float(np.ravel(self.linear.intercept_)[0])

    def get_coef_by_predictor(self) -> dict:
        """Returns a dictionary mapping predictor (key) to coefficient (value)

        Returns
        -------
        dict
            map ``{predictor: coefficient}``
        """
        return dict(zip(self.predictors, self.get_coef()))

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        """Fit the model

        Parameters
        ----------
        X_train : pd.DataFrame
            Predictors of train data.
        y_train : pd.Series
            Target of train data.
        """
        self.predictors = list(X_train.columns)
        self.linear.fit(X_train, y_train)
        self._is_fitted = True

    def score_model(self, X: pd.DataFrame) -> np.ndarray:
        """Score a model on a (new) dataset

        Parameters
        ----------
        X : pd.DataFrame
            Dataset of predictors to score the model.

        Returns
        -------
        np.ndarray
            score of the model for each observation
        """
        # We select predictor columns (self.predictors) here to
        # ensure we have the proper predictors and the proper order.
        # The prediction is returned as-is: there is no per-class column to
        # select as there would be with predict_proba.
        return self.linear.predict(X[self.predictors])

    def evaluate(self, X: pd.DataFrame, y: pd.Series,
                 split: str = None) -> float:
        """Evaluate the model on a given data set (X, y). The optional split
        parameter is to indicate that the data set belongs to
        (train, selection, validation), so that the computation on these sets
        can be cached!

        Parameters
        ----------
        X : pd.DataFrame
            Dataset containing the predictor values for each observation.
        y : pd.Series
            Dataset containing the target of each observation.
        split : str, optional
            Split of the dataset (e.g. train-selection-validation).

        Returns
        -------
        float
            the performance score of the model (RMSE)
        """
        # Serve the cached score when this split was evaluated before.
        if split is not None and split in self._eval_metrics_by_split:
            return self._eval_metrics_by_split[split]

        y_pred = self.score_model(X)
        performance = np.sqrt(mean_squared_error(y_true=y, y_pred=y_pred))

        # Only named splits are cached; ad-hoc evaluations are not.
        if split is not None:
            self._eval_metrics_by_split[split] = performance

        return performance

    def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame:
        """Compute the importance of each predictor in the model and return
        it as a DataFrame

        The importance of a predictor is the Pearson correlation between
        the predictor column and the model score.

        Parameters
        ----------
        data : pd.DataFrame
            Data to score the model.

        Returns
        -------
        pd.DataFrame
            DataFrame containing columns predictor and importance, sorted
            by decreasing importance
        """
        y_pred = self.score_model(data)

        importance_by_variable = {
            utils.clean_predictor_name(predictor): stats.pearsonr(
                data[predictor],
                y_pred
            )[0]
            for predictor in self.predictors
        }

        df = pd.DataFrame.from_dict(importance_by_variable,
                                    orient="index").reset_index()
        df.columns = ["predictor", "importance"]

        return (df.sort_values(by="importance", ascending=False)
                .reset_index(drop=True))

    def _is_valid_dict(self, model_dict: dict) -> bool:
        """Check whether a dictionary is a serialized linear regression

        Parameters
        ----------
        model_dict : dict
            Candidate serialized model.

        Returns
        -------
        bool
            True if all required keys are present with a usable type,
            False otherwise (a missing key never raises)
        """
        if model_dict.get("meta") != "linear-regression":
            return False

        for key in ("coef_", "predictors"):
            if not isinstance(model_dict.get(key), list):
                return False

        # serialize() stores the intercept as a plain number for a single
        # target, so both a number and a list are accepted here.
        if not isinstance(model_dict.get("intercept_"), (list, int, float)):
            return False

        return ("params" in model_dict
                and "_eval_metrics_by_split" in model_dict)
model = LogisticRegressionModel() - model.predictors = predictor_list - - return model - - -def mock_data(add_split_col: bool=False): +def mock_data(add_split_col: bool=False, model_type="classification"): data = pd.DataFrame({"var1_enc": [0.42] * 10, "var2_enc": [0.94] * 10, - "var3_enc": [0.87] * 10, - "target": ([0] * 5 + [1] * 2 + [0] * 2 + [1])}) + "var3_enc": [0.87] * 10}) + + if model_type == "classification": + data["target"] = ([0] * 5 + [1] * 2 + [0] * 2 + [1]) + elif model_type == "regression": + data["target"] = [7, 2, 2, 9, 7, 3, 1, 4, 8, 5] if add_split_col: data.loc[:, "split"] = (["train"] * 7 + ["selection"] * 3) return data +def mock_model_num_pred(n_predictors, model_type="classification"): + predictors = [f"var{i + 1}_enc" for i in range(n_predictors)] + return mock_model(predictors, model_type) + +def mock_model(predictor_list, model_type="classification"): + if model_type == "classification": + model = LogisticRegressionModel() + elif model_type == "regression": + model = LinearRegressionModel() + + model.predictors = predictor_list + + return model + class TestForwardFeatureSelection: @@ -46,27 +50,30 @@ def test_get_model_from_step(self): with pytest.raises(ValueError): forward_selection.get_model_from_step(2) - def test_compute_model_performances(self, mocker): + @pytest.mark.parametrize("model_type", ["classification", "regression"]) + def test_compute_model_performances(self, mocker, model_type): - data = mock_data(add_split_col=True) + data = mock_data(add_split_col=True, model_type=model_type) - fw_selection = ForwardFeatureSelection() + fw_selection = ForwardFeatureSelection(model_type=model_type) fw_selection._fitted_models = [ - mock_model_num_pred(1), - mock_model_num_pred(2), - mock_model_num_pred(3) + mock_model_num_pred(1, model_type=model_type), + mock_model_num_pred(2, model_type=model_type), + mock_model_num_pred(3, model_type=model_type) ] - def mock_evaluate(self, X, y, split): + def mock_evaluate(self, X, y, split): # on 
AUC scale, but gives the same for RMSE as it is a mock if split == "train": return 0.612 else: return 0.609 - (mocker - .patch(("cobra.model_building.forward_selection" - ".MLModel.evaluate"), - mock_evaluate)) + if model_type == "classification": + patch_fct = "cobra.model_building.forward_selection.LogisticRegressionModel.evaluate" + elif model_type == "regression": + patch_fct = "cobra.model_building.forward_selection.LinearRegressionModel.evaluate" + + mocker.patch(patch_fct, mock_evaluate) actual = (fw_selection .compute_model_performances(data, "target", @@ -84,42 +91,46 @@ def mock_evaluate(self, X, y, split): pd.testing.assert_frame_equal(actual, expected) - @pytest.mark.parametrize("max_predictors, expectation", - [(2, pytest.raises(ValueError)), - (3, does_not_raise()), - (5, does_not_raise()), - (10, does_not_raise()), - (15, does_not_raise())]) - def test_fit(self, mocker, max_predictors: int, expectation): - - # create list of elements [var1_enc, var2_c, ..., var10_enc] + @pytest.mark.parametrize("model_type, max_predictors, expectation", + [("classification", 2, pytest.raises(ValueError)), + ("classification", 3, does_not_raise()), + ("classification", 5, does_not_raise()), + ("classification", 10, does_not_raise()), + ("classification", 15, does_not_raise()), + ("regression", 2, pytest.raises(ValueError)), + ("regression", 3, does_not_raise()), + ("regression", 5, does_not_raise()), + ("regression", 10, does_not_raise()), + ("regression", 15, does_not_raise()) + ]) + def test_fit(self, mocker, model_type, max_predictors: int, expectation): + + # create list of elements [var1_enc, var2_enc, ..., var10_enc] predictors_list = [f"var{i+1}_enc" for i in range(10)] - # extract sublist [var1_enc, var5_enc, var9_enc]: + # extract sublist [var1_enc, var5_enc, var9_enc] forced_predictors_list = predictors_list[::4] ordered_output_list = (forced_predictors_list + [pred for pred in predictors_list if pred not in forced_predictors_list]) - fw_selection = 
ForwardFeatureSelection(max_predictors=max_predictors) + fw_selection = ForwardFeatureSelection(model_type=model_type, max_predictors=max_predictors) def mock_train_model(self, train_data, target_column_name, predictors): - return mock_model(predictors) + return mock_model(predictors, model_type=model_type) def mock_forward_selection(self, train_data, target_column_name, predictors, forced_predictors): - n_models = min(max_predictors, - len(predictors) + len(forced_predictors)) + n_models = min(max_predictors, len(predictors) + len(forced_predictors)) - return [mock_model(ordered_output_list[:i+1]) + return [mock_model(ordered_output_list[:i+1], model_type=model_type) for i in range(n_models)] - (mocker - .patch("cobra.model_building.ForwardFeatureSelection._train_model", - mock_train_model)) + mocker.patch("cobra.model_building.ForwardFeatureSelection._train_model", + mock_train_model) - mocker.patch(("cobra.model_building.ForwardFeatureSelection" - "._forward_selection"), mock_forward_selection) + mocker.patch("cobra.model_building.ForwardFeatureSelection._forward_selection", + mock_forward_selection) with expectation: fw_selection.fit(pd.DataFrame(), "target", @@ -140,8 +151,14 @@ def mock_forward_selection(self, train_data, target_column_name, assert actual == expected - @pytest.mark.parametrize("max_predictors", [5, 10, 15]) - def test_forward_selection(self, mocker, max_predictors: int): + @pytest.mark.parametrize("model_type, max_predictors", [("classification", 5), + ("classification", 10), + ("classification", 15), + ("regression", 5), + ("regression", 10), + ("regression", 15) + ]) + def test_forward_selection(self, mocker, model_type, max_predictors: int): # create list of elements [var1_enc, var2_c, ..., var10_enc] predictors_list = [f"var{i+1}_enc" for i in range(10)] @@ -157,12 +174,12 @@ def test_forward_selection(self, mocker, max_predictors: int): def mock_find_next_best_model(self, train_data, target_column_name, candidate_predictors, 
current_predictors): - return mock_model(current_predictors + candidate_predictors[0:1]) + return mock_model(current_predictors + candidate_predictors[0:1], model_type=model_type) mocker.patch(("cobra.model_building.ForwardFeatureSelection." "_find_next_best_model"), mock_find_next_best_model) - fw_selection = ForwardFeatureSelection(max_predictors=max_predictors) + fw_selection = ForwardFeatureSelection(model_type=model_type, max_predictors=max_predictors) fitted_models = (fw_selection. _forward_selection(pd.DataFrame(), "target", diff --git a/tests/model_building/test_models.py b/tests/model_building/test_models.py index 80623f5..80425b7 100644 --- a/tests/model_building/test_models.py +++ b/tests/model_building/test_models.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -from cobra.model_building.models import LogisticRegressionModel +from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel def mock_data(): @@ -10,9 +10,11 @@ def mock_data(): "var3_enc": [0.87] * 10}) -def mock_score_model(self, data): +def mock_score_model_classification(self, data): return np.array([0.5, 0.8, 0.2, 0.9, 0.1, 0.7, 0.3, 0.6, 0.4, 0.5]) +def mock_score_model_regression(self, data): + return np.array([0.7, 0.2, 0.2, 0.9, 0.7, 0.3, 0.1, 0.4, 0.8, 0.5])*15 class TestLogisticRegressionModel: @@ -26,7 +28,7 @@ def mock_roc_auc_score(y_true, y_score): (mocker .patch("cobra.model_building.LogisticRegressionModel.score_model", - mock_score_model)) + mock_score_model_classification)) (mocker .patch("cobra.model_building.models.roc_auc_score", @@ -45,8 +47,7 @@ def test_evaluate_cached(self): model = LogisticRegressionModel() model._eval_metrics_by_split["train"] = expected - actual = model.evaluate(pd.DataFrame(), pd.Series(dtype="float64"), - split) + actual = model.evaluate(pd.DataFrame(), pd.Series(dtype="float64"), split) assert actual == expected @@ -57,7 +58,7 @@ def mock_pearsonr(ypred, ytrue): (mocker 
class TestLinearRegressionModel:
    """Unit tests for the ``LinearRegressionModel`` wrapper."""

    def test_evaluate(self, mocker):
        """evaluate() returns the square root of the (mocked) MSE."""
        X = mock_data()
        y = pd.Series(
            np.array([0.6, 0.1, 0.2, 0.9, 0.8, 0.3, 0.2, 0.4, 0.9, 0.5])*12
        )

        def mock_mean_squared_error(y_true, y_pred):
            return 1.23

        mocker.patch(
            "cobra.model_building.LinearRegressionModel.score_model",
            mock_score_model_regression)

        mocker.patch(
            "cobra.model_building.models.mean_squared_error",
            mock_mean_squared_error)

        model = LinearRegressionModel()
        actual = model.evaluate(X, y)

        assert actual == np.sqrt(1.23)

    def test_evaluate_cached(self):
        """evaluate() serves a previously stored score for a known split."""
        split = "train"
        expected = np.sqrt(1.23)

        model = LinearRegressionModel()
        model._eval_metrics_by_split["train"] = expected

        actual = model.evaluate(pd.DataFrame(),
                                pd.Series(dtype="float64"),
                                split)

        assert actual == expected

    def test_compute_variable_importance(self, mocker):
        """Importances are reported per cleaned predictor name, sorted."""

        # The mocked correlation simply echoes the (constant) value of the
        # predictor column, which makes the expected output easy to build.
        def mock_pearsonr(ypred, ytrue):
            return [ypred.unique()[0]]

        mocker.patch(
            "cobra.model_building.LinearRegressionModel.score_model",
            mock_score_model_regression)

        mocker.patch(
            "cobra.model_building.models.stats.pearsonr",
            mock_pearsonr)

        model = LinearRegressionModel()
        model.predictors = ["var1_enc", "var2_enc", "var3_enc"]

        data = mock_data()

        actual = model.compute_variable_importance(data)

        expected = pd.DataFrame([
            {"predictor": "var1", "importance": data["var1_enc"].unique()[0]},
            {"predictor": "var2", "importance": data["var2_enc"].unique()[0]},
            {"predictor": "var3", "importance": data["var3_enc"].unique()[0]}
        ]).sort_values(by="importance", ascending=False).reset_index(drop=True)

        pd.testing.assert_frame_equal(actual, expected)

    def test_serialize(self):
        """An unfitted model serializes without coef_ / intercept_."""
        model = LinearRegressionModel()
        actual = model.serialize()

        # The estimator parameters are checked against the estimator itself
        # rather than a hard-coded dict, because the set of LinearRegression
        # parameters differs between scikit-learn versions.
        params = actual.pop("params")

        assert actual == {
            "meta": "linear-regression",
            "predictors": [],
            "_eval_metrics_by_split": {}
        }
        assert params == model.linear.get_params()
        assert params["fit_intercept"] is True

    def test_deserialize(self):
        """deserialize() restores parameters, coefficients and intercept."""
        model = LinearRegressionModel()

        model_dict = {
            "meta": "linear-regression",
            "predictors": [],
            "_eval_metrics_by_split": {},
            # parameters valid for the installed scikit-learn version
            "params": model.linear.get_params(),
            "coef_": [[0.5, 0.75]],
            "intercept_": [-3]
        }

        model.deserialize(model_dict)

        linear = model.linear
        assert linear.get_params() == model_dict["params"]
        assert model.predictors == model_dict["predictors"]
        # Compare the values element-wise; comparing ``x.all() == y.all()``
        # only compares two booleans and passes for any non-zero values.
        np.testing.assert_array_equal(linear.intercept_,
                                      np.array(model_dict["intercept_"]))
        np.testing.assert_array_equal(linear.coef_,
                                      np.array(model_dict["coef_"]))