From 07338fb213dd269abe104c12a6887c4dac0b6fb1 Mon Sep 17 00:00:00 2001 From: sborms Date: Fri, 6 Aug 2021 17:07:13 +0200 Subject: [PATCH 1/3] add model_type functionality in PreProcessor class & tests --- .../categorical_data_processor.py | 4 +- cobra/preprocessing/preprocessor.py | 60 +++++++++++-------- tests/preprocessing/test_preprocessor.py | 1 + 3 files changed, 39 insertions(+), 26 deletions(-) diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 8a07331..5993b5d 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -46,7 +46,7 @@ class CategoricalDataProcessor(BaseEstimator): keep_missing : bool Whether or not to keep missing as a separate category. model_type : str - Model type ("classification" or "regression"). + Model type (``classification`` or ``regression``). p_value_threshold : float Significance threshold for regrouping. regroup : bool @@ -442,7 +442,7 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str, category : str Category for which we carry out the test. model_type : str - Model type ("classification" or "regression"). + Model type (``classification`` or ``regression``). scale_contingency_table : bool Whether we scale contingency table with incidence rate. Only used when model_type = "classification". diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 0177e34..3338ec4 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -44,23 +44,27 @@ class PreProcessor(BaseEstimator): ---------- categorical_data_processor : CategoricalDataProcessor Instance of CategoricalDataProcessor to do the preprocessing of - categorical variables + categorical variables. The model_type variable is specified + here (``classification`` or ``regression``). 
discretizer : KBinsDiscretizer Instance of KBinsDiscretizer to do the prepocessing of continuous - variables by means of discretization + variables by means of discretization. serialization_path : str - path to save the pipeline to + Path to save the pipeline to. stratify_split : bool - Whether or not to stratify the train-test split + Whether or not to stratify the train-test split. target_encoder : TargetEncoder - Instance of TargetEncoder to do the incidence replacement + Instance of TargetEncoder to do the incidence replacement. """ - def __init__(self, categorical_data_processor: CategoricalDataProcessor, + def __init__(self, + categorical_data_processor: CategoricalDataProcessor, discretizer: KBinsDiscretizer, target_encoder: TargetEncoder, is_fitted: bool = False): + self.model_type = categorical_data_processor.model_type + self._categorical_data_processor = categorical_data_processor self._discretizer = discretizer self._target_encoder = target_encoder @@ -69,6 +73,7 @@ def __init__(self, categorical_data_processor: CategoricalDataProcessor, @classmethod def from_params(cls, + model_type: str = "classification", n_bins: int = 10, strategy: str = "quantile", closed: str = "right", @@ -91,16 +96,18 @@ def from_params(cls, Parameters ---------- + model_type : str + Model type (``classification`` or ``regression``). n_bins : int, optional Number of bins to produce. Raises ValueError if ``n_bins < 2``. strategy : str, optional Binning strategy. Currently only ``uniform`` and ``quantile`` - e.g. equifrequency is supported + e.g. equifrequency is supported. closed : str, optional - Whether to close the bins (intervals) from the left or right + Whether to close the bins (intervals) from the left or right. auto_adapt_bins : bool, optional - reduces the number of bins (starting from n_bins) as a function of - the number of missings + Reduces the number of bins (starting from n_bins) as a function of + the number of missings. 
starting_precision : int, optional Initial precision for the bin edges to start from, can also be negative. Given a list of bin edges, the class will @@ -110,33 +117,32 @@ def from_params(cls, will be made to round up the numbers of the bin edges e.g. ``5.55 -> 10``, ``146 -> 100``, ... label_format : str, optional - format string to display the bin labels + Format string to display the bin labels e.g. ``min - max``, ``(min, max]``, ... change_endpoint_format : bool, optional Whether or not to change the format of the lower and upper bins into ``< x`` and ``> y`` resp. regroup : bool - Whether or not to regroup categories + Whether or not to regroup categories. regroup_name : str - New name of the non-significant regrouped variables + New name of the non-significant regrouped variables. keep_missing : bool - Whether or not to keep missing as a separate category + Whether or not to keep missing as a separate category. category_size_threshold : int - minimal size of a category to keep it as a separate category + Minimal size of a category to keep it as a separate category. p_value_threshold : float Significance threshold for regrouping. forced_categories : dict Map to prevent certain categories from being group into ``Other`` for each column - dict of the form ``{col:[forced vars]}``. scale_contingency_table : bool - Whether contingency table should be scaled before chi^2.' + Whether contingency table should be scaled before chi^2. weight : float, optional Smoothing parameters (non-negative). The higher the value of the parameter, the bigger the contribution of the overall mean. - When set to zero, there is no smoothing - (e.g. the pure target incidence is used). + When set to zero, there is no smoothing (e.g. the pure target incidence is used). 
imputation_strategy : str, optional - in case there is a particular column which contains new categories, + In case there is a particular column which contains new categories, the encoding will lead to NULL values which should be imputed. Valid strategies are to replace with the global mean of the train set or the min (resp. max) incidence of the categories of that @@ -145,9 +151,11 @@ def from_params(cls, Returns ------- PreProcessor - Description + class encapsulating CategoricalDataProcessor, + KBinsDiscretizer, and TargetEncoder instances """ categorical_data_processor = CategoricalDataProcessor( + model_type, regroup, regroup_name, keep_missing, @@ -155,15 +163,17 @@ def from_params(cls, p_value_threshold, scale_contingency_table, forced_categories) + discretizer = KBinsDiscretizer(n_bins, strategy, closed, auto_adapt_bins, starting_precision, label_format, change_endpoint_format) - target_encoder = TargetEncoder(weight) + target_encoder = TargetEncoder(weight, imputation_strategy) - return cls(categorical_data_processor, discretizer, target_encoder) + return cls(model_type, + categorical_data_processor, discretizer, target_encoder) @classmethod def from_pipeline(cls, pipeline: dict): @@ -187,12 +197,13 @@ def from_pipeline(cls, pipeline: dict): """ if not PreProcessor._is_valid_pipeline(pipeline): - raise ValueError("Invalid pipeline") # To do: specify error + raise ValueError("Invalid pipeline") ## TODO: specify error categorical_data_processor = CategoricalDataProcessor() categorical_data_processor.set_attributes_from_dict( pipeline["categorical_data_processor"] ) + model_type = categorical_data_processor.model_type discretizer = KBinsDiscretizer() discretizer.set_attributes_from_dict(pipeline["discretizer"]) @@ -200,7 +211,8 @@ def from_pipeline(cls, pipeline: dict): target_encoder = TargetEncoder() target_encoder.set_attributes_from_dict(pipeline["target_encoder"]) - return cls(categorical_data_processor, discretizer, target_encoder, + return 
cls(model_type, + categorical_data_processor, discretizer, target_encoder, is_fitted=pipeline["_is_fitted"]) def fit(self, train_data: pd.DataFrame, continuous_vars: list, diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py index 80f6d73..2e42759 100644 --- a/tests/preprocessing/test_preprocessor.py +++ b/tests/preprocessing/test_preprocessor.py @@ -97,6 +97,7 @@ def test_is_valid_pipeline(self, injection_location: str, # is_valid_pipeline only checks for relevant keys atm pipeline_dict = { "categorical_data_processor": { + "model_type": None, "regroup": None, "regroup_name": None, "keep_missing": None, From 5a05f407fc82654646605ec736d31bb0082978a7 Mon Sep 17 00:00:00 2001 From: Sander Vanden Hautte Date: Wed, 11 Aug 2021 14:53:02 +0200 Subject: [PATCH 2/3] Issue #75: PIGs for regression. --- cobra/evaluation/pigs_tables.py | 145 +++++++++++++++++++------------- cobra/utils.py | 23 +++-- 2 files changed, 101 insertions(+), 67 deletions(-) diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py index 3e39411..d22ad17 100644 --- a/cobra/evaluation/pigs_tables.py +++ b/cobra/evaluation/pigs_tables.py @@ -7,52 +7,55 @@ import cobra.utils as utils -def generate_pig_tables(data: pd.DataFrame, +def generate_pig_tables(basetable: pd.DataFrame, id_column_name: str, target_column_name: str, preprocessed_predictors: list) -> pd.DataFrame: - """Compute PIG tables for all predictors in preprocessed_predictors. The - output is a DataFrame with columns ``variable``, ``label``, ``pop_size``, - ``avg_incidence`` and ``incidence`` + """Compute PIG tables for all predictors in preprocessed_predictors. + + The output is a DataFrame with columns ``variable``, ``label``, + ``pop_size``, ``global_avg_target`` and ``avg_target``. Parameters ---------- - data : pd.DataFrame - basetable to compute PIG tables of + basetable : pd.DataFrame + Basetable to compute PIG tables from. 
id_column_name : str - column name of the id (e.g. customernumber) + Name of the basetable column containing the IDs of the basetable rows + (e.g. customernumber). target_column_name : str - column name of the target - predictors: list - list of preprocessed predictor names + Name of the basetable column containing the target values to predict. + preprocessed_predictors: list + List of basetable column names containing preprocessed predictors. Returns ------- pd.DataFrame - DataFrame containing a PIG table for all predictors + DataFrame containing a PIG table for all predictors. """ - - pigs = [compute_pig_table(data, column_name, target_column_name, - id_column_name) - for column_name in sorted(preprocessed_predictors) - if column_name not in [id_column_name, target_column_name]] - + pigs = [ + compute_pig_table(basetable, + column_name, + target_column_name, + id_column_name) + for column_name in sorted(preprocessed_predictors) + if column_name not in [id_column_name, target_column_name] + ] output = pd.concat(pigs) - return output -def compute_pig_table(data: pd.DataFrame, - column_name: str, +def compute_pig_table(basetable: pd.DataFrame, + predictor_column_name: str, target_column_name: str, id_column_name: str) -> pd.DataFrame: - """Compute the pig table of a given predictor for a given target + """Compute the PIG table of a given predictor for a given target. 
Parameters ---------- - data : pd.DataFrame + basetable : pd.DataFrame input data from which to compute the pig table - column_name : str + predictor_column_name : str predictor name of which to compute the pig table target_column_name : str name of the target variable @@ -62,57 +65,74 @@ def compute_pig_table(data: pd.DataFrame, Returns ------- pd.DataFrame - pig table as a DataFrame + PIG table as a DataFrame """ - avg_incidence = data[target_column_name].mean() + global_avg_target = basetable[target_column_name].mean() # group by the binned variable, compute the incidence # (=mean of the target for the given bin) and compute the bin size # (e.g. COUNT(id_column_name)). After that, rename the columns - res = (data.groupby(column_name) - .agg({target_column_name: "mean", id_column_name: "size"}) - .reset_index() - .rename(columns={column_name: "label", - target_column_name: "incidence", - id_column_name: "pop_size"})) + res = (basetable.groupby(predictor_column_name) + .agg({target_column_name: "mean", id_column_name: "size"}) + .reset_index() + .rename(columns={predictor_column_name: "label", + target_column_name: "avg_target", + id_column_name: "pop_size"})) # add the column name to a variable column # add the average incidence # replace population size by a percentage of total population - res["variable"] = utils.clean_predictor_name(column_name) - res["avg_incidence"] = avg_incidence - res["pop_size"] = res["pop_size"]/len(data.index) + res["variable"] = utils.clean_predictor_name(predictor_column_name) + res["global_avg_target"] = global_avg_target + res["pop_size"] = res["pop_size"]/len(basetable.index) # make sure to always return the data with the proper column order column_order = ["variable", "label", "pop_size", - "avg_incidence", "incidence"] + "global_avg_target", "avg_target"] return res[column_order] -def plot_incidence(df: pd.DataFrame, variable: str, - column_order: list = None, dim: tuple = (12, 8)): - """Function plots Predictor Incidence Graphs 
(PIGs). - Bins are ordered in descening order of bin incidence +def plot_incidence(pig_tables: pd.DataFrame, + variable: str, + model_type: str, + column_order: list = None, + dim: tuple = (12, 8)): + """Plots a Predictor Insights Graph (PIG), a graph in which the mean + target value is plotted for a number of bins constructed from a predictor + variable. When the target actually is a binary classification target, + the plotted mean target value is actually the target incidence rate. + + Bins are ordered in descending order of mean target value unless specified otherwise with `column_order` list. + Parameters ---------- - df: pd.DataFrame - dataframe with cleaned, binned, partitioned and prepared data + pig_tables: pd.DataFrame + dataframe with cleaned, binned, partitioned and prepared data, + as created by generate_pig_tables() from this module. variable: str - variable for which the incidence plot will be shown + name of the predictor variable for which the PIG will be plotted. + model_type: str + type of model (either "classification" or "regression"). column_order: list, default=None - explicit order of variable + explicit order of the value bins of the predictor variable to be used + on the PIG. dim: tuple, default=(12, 8) - tuple with width and lentgh of the plot + optional tuple to configure the width and length of the plot. """ - df_plot = df[df['variable'] == variable].copy() + if model_type not in ["classification", "regression"]: + raise ValueError("An unexpected value was set for the model_type " + "parameter. 
Expected 'classification' or " + "'regression'.") - if column_order is not None: + df_plot = pig_tables[pig_tables['variable'] == variable].copy() + if column_order is not None: if not set(df_plot['label']) == set(column_order): raise ValueError( - 'Variables in column_order and dataframe are not equal') + 'The column_order and pig_tables parameters do not contain ' + 'the same set of variables.') df_plot['label'] = df_plot['label'].astype('category') df_plot['label'].cat.reorder_categories(column_order, @@ -121,37 +141,42 @@ def plot_incidence(df: pd.DataFrame, variable: str, df_plot.sort_values(by=['label'], ascending=True, inplace=True) df_plot.reset_index(inplace=True) else: - df_plot.sort_values(by=['incidence'], ascending=False, inplace=True) + df_plot.sort_values(by=['avg_target'], ascending=False, inplace=True) df_plot.reset_index(inplace=True) with plt.style.context("seaborn-ticks"): fig, ax = plt.subplots(figsize=dim) # ----------------- - # Left axis - incidence + # Left axis - average target # ----------------- - ax.plot(df_plot['label'], df_plot['incidence'], + ax.plot(df_plot['label'], df_plot['avg_target'], color="#00ccff", marker=".", - markersize=20, linewidth=3, label='incidence rate per bin', + markersize=20, linewidth=3, + label='incidence rate per bin' if model_type == "classification" else "mean target value per bin", zorder=10) - ax.plot(df_plot['label'], df_plot['avg_incidence'], + ax.plot(df_plot['label'], df_plot['global_avg_target'], color="#022252", linestyle='--', linewidth=4, - label='average incidence rate', + label='average incidence rate' if model_type == "classification" else "global mean target value", zorder=10) # dummy line to have label on second axis from first ax.plot(np.nan, "#939598", linewidth=6, label='bin size') # set labels & ticks - ax.set_ylabel('incidence', fontsize=16) + ax.set_ylabel('incidence' if model_type == "classification" else "mean target value", + fontsize=16) ax.set_xlabel('{} bins' ''.format(variable), 
fontsize=16) ax.xaxis.set_tick_params(rotation=45, labelsize=14) ax.yaxis.set_tick_params(labelsize=14) - ax.set_yticks(np.arange(0, max(df_plot['incidence'])+0.05, 0.05)) - ax.yaxis.set_major_formatter( - FuncFormatter(lambda y, _: '{:.1%}'.format(y))) + if model_type == "classification": + # Mean target values are between 0 and 1 (target incidence rate), + # so format them as percentages: + ax.set_yticks(np.arange(0, max(df_plot['avg_target'])+0.05, 0.05)) + ax.yaxis.set_major_formatter( + FuncFormatter(lambda y, _: '{:.1%}'.format(y))) # removes ticks but keeps the labels ax.tick_params(axis='both', which='both', length=0) @@ -185,7 +210,11 @@ def plot_incidence(df: pd.DataFrame, variable: str, ax2.grid(False) # title & legend - fig.suptitle('Incidence Plot - ' + variable, fontsize=22, y=1.02) + if model_type == "classification": + title = "Incidence plot - " + variable + else: + title = "Mean target plot - " + variable + fig.suptitle(title, fontsize=22, y=1.02) ax.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102), loc=3, ncol=1, mode="expand", borderaxespad=0., prop={"size": 14}) diff --git a/cobra/utils.py b/cobra/utils.py index c129270..f138fd7 100644 --- a/cobra/utils.py +++ b/cobra/utils.py @@ -1,12 +1,17 @@ -def clean_predictor_name(predictor: str) -> str: - """Strip-off redundant suffix (e.g. "_enc" or "_bin") from the predictor - name to return a clean version of the predictor +def clean_predictor_name(predictor_name: str) -> str: + """Strip the redundant suffix (e.g. "_enc" or "_bin") off from the end + of the predictor name to return a clean version of the predictor. 
- Args: - predictor (str): Description + Parameters + ---------- + predictor_name : str + Name of the predictor, possibly containing a "_enc", "_bin" or "_processed" suffix. - Returns: - str: Description + Returns + ------- + str + The predictor name with every "_enc", "_bin" and "_processed" substring removed. """ - return (predictor.replace("_enc", "").replace("_bin", "") - .replace("_processed", "")) + return (predictor_name.replace("_enc", "") + .replace("_bin", "") + .replace("_processed", "")) From 027d1c71a55b6860c83d48261a32e07ea95fb467 Mon Sep 17 00:00:00 2001 From: Sander Vanden Hautte Date: Thu, 12 Aug 2021 12:33:08 +0200 Subject: [PATCH 3/3] Revised unit tests for issue #75: PIGs for regression. --- tests/evaluation/test_evaluation.py | 55 ++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/tests/evaluation/test_evaluation.py b/tests/evaluation/test_evaluation.py index 16273f2..b3b1a3f 100644 --- a/tests/evaluation/test_evaluation.py +++ b/tests/evaluation/test_evaluation.py @@ -23,11 +23,58 @@ def mock_preds(n, seed = 505): class TestEvaluation: - def test_plot_incidence(self): + def test_plot_incidence_with_unsupported_model_type(self): + with pytest.raises(ValueError): + plot_incidence(pig_tables=None, + variable="", + model_type="anomaly_detection") + + def test_plot_incidence_with_different_column_orders(self): + data = mock_data() + with pytest.raises(ValueError): + plot_incidence(pig_tables=data, + variable='education', + model_type="classification", + # different bins than in the data variable: + column_order=['1st-4th', '5th-6th', '7th-8th']) + + # Stubs for later: requires exposing df_plot and testing matplotlib's + # plot object internals: + """ + def test_plot_incidence_without_column_order(self): + data = mock_data() + plot_incidence(pig_tables=data, + variable='education', + model_type="classification", + column_order=None) + # Can't assert: df_plot is not exposed by the function + + def test_plot_incidence_with_column_order(self): + data = mock_data() + plot_incidence(pig_tables=data, + variable='education', +
model_type="classification", + column_order=['1st-4th', '5th-6th', '7th-8th', '9th']) + # Can't assert: df_plot is not exposed by the function + + def test_plot_incidence_visual_result_for_classification(self): data = mock_data() - column_order = ['1st-4th', '5th-6th', '7th-8th'] - with pytest.raises(Exception): - plot_incidence(data, 'education', column_order) + plot_incidence(pig_tables=data, + variable='education', + model_type="classification", + column_order=['1st-4th', '5th-6th', '7th-8th', '9th']) + # Can't assert: would need to check matplotlib's fig and ax + # internals. + + def test_plot_incidence_visual_result_for_regression(self): + data = mock_data() # change into regression target though. + plot_incidence(pig_tables=data, + variable='education', + model_type="regression", + column_order=['1st-4th', '5th-6th', '7th-8th', '9th']) + # Can't assert: would need to check matplotlib's fig and ax + # internals. + """ def test_lift_curve_n_bins(self): n_bins_test = [5, 10, 15, 35]