From 7f339fde8590b0c9b7af6d282450b49a0914a1dd Mon Sep 17 00:00:00 2001 From: sborms Date: Thu, 5 Aug 2021 18:17:35 +0200 Subject: [PATCH 1/3] added model_type parameter, different statistical test for regression models, included a simple test --- .gitignore | 5 +- .../categorical_data_processor.py | 106 ++++++++++++------ .../test_categorical_data_processor.py | 21 +++- 3 files changed, 91 insertions(+), 41 deletions(-) diff --git a/.gitignore b/.gitignore index 4bb934a..4ec86e3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ -#Ignored directories in root folder - +# Ignored directories in root folder # Byte-compiled / optimized / DLL files __pycache__/ @@ -107,7 +106,7 @@ ENV/ # vscode settings .vscode/ -# Other ignore files +# Other ignored files *.pptx *.ppt .idea/ diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index c24a550..1ebde44 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -14,6 +14,7 @@ - Jan Benisek (implementation) - Matthias Roels (implementation) """ + # standard lib imports import re from typing import Optional @@ -38,33 +39,41 @@ class CategoricalDataProcessor(BaseEstimator): Attributes ---------- category_size_threshold : int - minimal size of a category to keep it as a separate category + Minimal size of a category to keep it as a separate category. forced_categories : dict Map to prevent certain categories from being group into ``Other`` - for each colum - dict of the form ``{col:[forced vars]}``. + for each column - dict of the form ``{col:[forced vars]}``. keep_missing : bool - Whether or not to keep missing as a separate category + Whether or not to keep missing as a separate category. + model_type : str + Model type ("classification" or "regression"). p_value_threshold : float Significance threshold for regrouping. 
regroup : bool - Whether or not to regroup categories + Whether or not to regroup categories. regroup_name : str New name of the non-significant regrouped variables scale_contingency_table : bool - Whether contingency table should be scaled before chi^2.' + Whether contingency table should be scaled before chi^2. """ - valid_keys = ["regroup", "regroup_name", "keep_missing", + valid_keys = ["model_type", "regroup", "regroup_name", "keep_missing", "category_size_threshold", "p_value_threshold", "scale_contingency_table", "forced_categories"] - def __init__(self, regroup: bool = True, regroup_name: str = "Other", + def __init__(self, + model_type: str = "classification", + regroup: bool = True, + regroup_name: str = "Other", keep_missing: bool = True, category_size_threshold: int = 5, p_value_threshold: float = 0.001, scale_contingency_table: bool = True, forced_categories: dict = {}): + # assert model_type in ["classification", "regression"], "define an appropriate model type" + + self.model_type = model_type self.regroup = regroup self.regroup_name = regroup_name self.keep_missing = keep_missing @@ -136,12 +145,12 @@ def fit(self, data: pd.DataFrame, column_names: list, Parameters ---------- data : pd.DataFrame - data used to compute the mapping to encode the categorical + Data used to compute the mapping to encode the categorical variables with. column_names : list - Columns of data to be processed + Columns of data to be processed. target_column : str - Column name of the target + Column name of the target. 
""" if not self.regroup: @@ -168,8 +177,8 @@ def fit(self, data: pd.DataFrame, column_names: list, def _fit_column(self, data: pd.DataFrame, column_name: str, target_column) -> set: - """Compute which categories to regroup into "Other" for a particular - column + """Compute which categories to regroup into "Other" + for a particular column Parameters ---------- @@ -183,13 +192,18 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, list list of categories to combine into a category "Other" """ + model_type = self.model_type + if len(data[column_name].unique()) == 1: log.warning(f"Predictor {column_name} is constant" " and will be ignored in computation.") return set(data[column_name].unique()) y = data[target_column] - incidence = y.mean() + if model_type == "classification": + incidence = y.mean() + else: + incidence = None combined_categories = set() @@ -201,13 +215,14 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, unique_categories = list(X.unique()) # do not merge categories in case of dummies, i.e. 0 and 1 - # (and possibly "Missings") + # (and possibly "Missing") if (len(unique_categories) == 2 or (len(unique_categories) == 3 and "Missing" in unique_categories)): return set(unique_categories) # get small categories and add them to the merged category list + # does not apply incidence factor when model_type = "regression" small_categories = (CategoricalDataProcessor ._get_small_categories( X, @@ -221,6 +236,7 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, pval = (CategoricalDataProcessor ._compute_p_value(X, y, category, + model_type, self.scale_contingency_table)) # if not significant, add it to the list @@ -348,24 +364,27 @@ def _get_small_categories(predictor_series: pd.Series, incidence: float, category_size_threshold: int) -> set: """Fetch categories with a size below a certain threshold. 
- Note that we use an additional weighting with the overall incidence + Note that we use an additional weighting with the overall incidence. Parameters ---------- predictor_series : pd.Series - Description + Variables data. incidence : float - global train incidence + Global train incidence. category_size_threshold : int - minimal size of a category to keep it as a separate category + Minimal size of a category to keep it as a separate category. Returns ------- set - List a categories with a count below a certain threshold + Set of categories with a count below a certain threshold. """ category_counts = predictor_series.groupby(predictor_series).size() - factor = max(incidence, 1 - incidence) + if incidence is not None: + factor = max(incidence, 1 - incidence) + else: + factor = 1 # Get all categories with a count below a threshold bool_mask = (category_counts*factor) <= category_size_threshold @@ -404,10 +423,14 @@ def _replace_missings(data: pd.DataFrame, @staticmethod def _compute_p_value(X: pd.Series, y: pd.Series, category: str, + model_type: str, scale_contingency_table: bool) -> float: - """Calculates p-value in contingency table (chi-square test) in - order to evaluate whether category of interest is significantly - different from the rest of the categories, given the target variable. + """Calculates p-value in order to evaluate whether category of + interest is significantly different from the rest of the + categories, given the target variable. + + In case model_type is "classification", chi-squared test based on a contingency table. + In case model_type is "regression", Kruskal-Wallis test. Parameters ---------- @@ -416,31 +439,42 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str, y : pd.Series Target data. category : str - Category for which we carry out the test + Category for which we carry out the test. + model_type : str + Model type ("classification" or "regression").
scale_contingency_table : bool - Whether we scale contingency table with incidence rate + Whether we scale contingency table with incidence rate. + Only used when model_type = "classification". Returns ------- float - p-value of chi-square test + p-value of applied statistical test """ df = pd.concat([X, y], axis=1) + df.columns = ["X", "y"] df["other_categories"] = np.where(X == category, 0, 1) - contigency_table = pd.crosstab(index=df['other_categories'], columns=y, - margins=False) + if model_type == "classification": + contingency_table = pd.crosstab(index=df["other_categories"], columns=df["y"], + margins=False) + + # if true, we scale the "other" categories + if scale_contingency_table: + size_other_cats = contingency_table.iloc[1].sum() + incidence_mean = y.mean() + + contingency_table.iloc[1, 0] = (1-incidence_mean) * size_other_cats + contingency_table.iloc[1, 1] = incidence_mean * size_other_cats + contingency_table = contingency_table.values.astype(np.int64) - # if true, we scale the "other" categories - if scale_contingency_table: - size_other_cats = contigency_table.iloc[1].sum() - incidence_mean = y.mean() + pval = stats.chi2_contingency(contingency_table, correction=False)[1] - contigency_table.iloc[1, 0] = (1-incidence_mean) * size_other_cats - contigency_table.iloc[1, 1] = incidence_mean * size_other_cats - contigency_table = contigency_table.values.astype(np.int64) + elif model_type == "regression": + pval = stats.kruskal(df.y[df.other_categories == 0], + df.y[df.other_categories == 1])[1] - return stats.chi2_contingency(contigency_table, correction=False)[1] + return pval @staticmethod def _replace_categories(data: pd.Series, categories: set, diff --git a/tests/preprocessing/test_categorical_data_processor.py b/tests/preprocessing/test_categorical_data_processor.py index 5b4ec3f..97191b7 100644 --- a/tests/preprocessing/test_categorical_data_processor.py +++ b/tests/preprocessing/test_categorical_data_processor.py @@ -20,6 +20,7 @@ def 
test_attributes_to_dict(self): actual = processor.attributes_to_dict() expected = { + "model_type": "classification", "regroup": True, "regroup_name": "Other", "keep_missing": True, @@ -79,7 +80,23 @@ def test_compute_p_value(self, scale_contingency_table, expected): category = "c1" actual = (CategoricalDataProcessor - ._compute_p_value(X, y, category, scale_contingency_table)) + ._compute_p_value(X, y, category, "classification", scale_contingency_table)) + + assert pytest.approx(actual, abs=1e-5) == expected + + @pytest.mark.parametrize("seed, expected", + [(505, 0.02222), + (603, 0.89230)]) + def test_compute_p_value_regression(self, seed, expected): + + np.random.seed(seed) + + X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10)) + y = pd.Series(data=np.random.uniform(0, 1, 100)*5) + category = "c1" + + actual = (CategoricalDataProcessor + ._compute_p_value(X, y, category, "regression", None)) assert pytest.approx(actual, abs=1e-5) == expected @@ -87,7 +104,7 @@ def test_get_small_categories(self): data = pd.Series(data=(["c1"]*50 + ["c2"]*25 + ["c3"]*15 + ["c4"]*5)) incidence = 0.35 - threshold = 10 # to make it easy to manualy compute + threshold = 10 # to make it easy to manually compute expected = {"c3", "c4"} actual = (CategoricalDataProcessor From 79871abbb665106c213266a5f7075827e77cacbf Mon Sep 17 00:00:00 2001 From: sborms Date: Fri, 6 Aug 2021 10:00:28 +0200 Subject: [PATCH 2/3] unit test renaming --- tests/preprocessing/test_categorical_data_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/preprocessing/test_categorical_data_processor.py b/tests/preprocessing/test_categorical_data_processor.py index 97191b7..6e026d3 100644 --- a/tests/preprocessing/test_categorical_data_processor.py +++ b/tests/preprocessing/test_categorical_data_processor.py @@ -73,7 +73,7 @@ def test_set_attributes_from_dict(self, attribute): @pytest.mark.parametrize("scale_contingency_table, expected", [(False, 0.01329), (True, 0.43437)]) -
def test_compute_p_value(self, scale_contingency_table, expected): + def test_compute_p_value_classification(self, scale_contingency_table, expected): X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10)) y = pd.Series(data=([0]*35 + [1]*35 + [0]*15 + [1]*5 + [0]*8 + [1]*2)) From d4e0d713496ddb580bee506c1f8f75c74f630440 Mon Sep 17 00:00:00 2001 From: Sander Vanden Hautte <4201920+sandervh14@users.noreply.github.com> Date: Fri, 6 Aug 2021 09:20:40 +0000 Subject: [PATCH 3/3] Issue #65: model_type check in categorical data processor constructor. --- cobra/preprocessing/categorical_data_processor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 1ebde44..8a07331 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -71,7 +71,8 @@ def __init__(self, scale_contingency_table: bool = True, forced_categories: dict = {}): - # assert model_type in ["classification", "regression"], "define an appropriate model type" + if model_type not in ["classification", "regression"]: + raise ValueError("An unexpected model_type was provided. Valid model_types are either 'classification' or 'regression'.") self.model_type = model_type self.regroup = regroup