Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#Ignored directories in root folder

# Ignored directories in root folder

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down Expand Up @@ -107,7 +106,7 @@ ENV/
# vscode settings
.vscode/

# Other ignore files
# Other ignored files
*.pptx
*.ppt
.idea/
107 changes: 71 additions & 36 deletions cobra/preprocessing/categorical_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- Jan Benisek (implementation)
- Matthias Roels (implementation)
"""

# standard lib imports
import re
from typing import Optional
Expand All @@ -38,33 +39,42 @@ class CategoricalDataProcessor(BaseEstimator):
Attributes
----------
category_size_threshold : int
minimal size of a category to keep it as a separate category
Minimal size of a category to keep it as a separate category.
forced_categories : dict
Map to prevent certain categories from being grouped into ``Other``
for each colum - dict of the form ``{col:[forced vars]}``.
for each column - dict of the form ``{col:[forced vars]}``.
keep_missing : bool
Whether or not to keep missing as a separate category
Whether or not to keep missing as a separate category.
model_type : str
Model type ("classification" or "regression").
p_value_threshold : float
Significance threshold for regrouping.
regroup : bool
Whether or not to regroup categories
Whether or not to regroup categories.
regroup_name : str
New name of the non-significant regrouped variables
scale_contingency_table : bool
Whether contingency table should be scaled before chi^2.'
Whether contingency table should be scaled before chi^2.
"""

valid_keys = ["regroup", "regroup_name", "keep_missing",
valid_keys = ["model_type", "regroup", "regroup_name", "keep_missing",
"category_size_threshold", "p_value_threshold",
"scale_contingency_table", "forced_categories"]

def __init__(self, regroup: bool = True, regroup_name: str = "Other",
def __init__(self,
model_type: str = "classification",
regroup: bool = True,
regroup_name: str = "Other",
keep_missing: bool = True,
category_size_threshold: int = 5,
p_value_threshold: float = 0.001,
scale_contingency_table: bool = True,
forced_categories: dict = {}):

if model_type not in ["classification", "regression"]:
raise ValueError("An unexpected model_type was provided. Valid model_types are either 'classification' or 'regression'.")

self.model_type = model_type
self.regroup = regroup
self.regroup_name = regroup_name
self.keep_missing = keep_missing
Expand Down Expand Up @@ -136,12 +146,12 @@ def fit(self, data: pd.DataFrame, column_names: list,
Parameters
----------
data : pd.DataFrame
data used to compute the mapping to encode the categorical
Data used to compute the mapping to encode the categorical
variables with.
column_names : list
Columns of data to be processed
Columns of data to be processed.
target_column : str
Column name of the target
Column name of the target.
"""

if not self.regroup:
Expand All @@ -168,8 +178,8 @@ def fit(self, data: pd.DataFrame, column_names: list,

def _fit_column(self, data: pd.DataFrame, column_name: str,
target_column) -> set:
"""Compute which categories to regroup into "Other" for a particular
column
"""Compute which categories to regroup into "Other"
for a particular column

Parameters
----------
Expand All @@ -183,13 +193,18 @@ def _fit_column(self, data: pd.DataFrame, column_name: str,
list
list of categories to combine into a category "Other"
"""
model_type = self.model_type

if len(data[column_name].unique()) == 1:
log.warning(f"Predictor {column_name} is constant"
" and will be ignored in computation.")
return set(data[column_name].unique())

y = data[target_column]
incidence = y.mean()
if model_type == "classification":
incidence = y.mean()
else:
incidence = None

combined_categories = set()

Expand All @@ -201,13 +216,14 @@ def _fit_column(self, data: pd.DataFrame, column_name: str,
unique_categories = list(X.unique())

# do not merge categories in case of dummies, i.e. 0 and 1
# (and possibly "Missings")
# (and possibly "Missing")
if (len(unique_categories) == 2
or (len(unique_categories) == 3
and "Missing" in unique_categories)):
return set(unique_categories)

# get small categories and add them to the merged category list
# does not apply incidence factor when model_type = "regression"
small_categories = (CategoricalDataProcessor
._get_small_categories(
X,
Expand All @@ -221,6 +237,7 @@ def _fit_column(self, data: pd.DataFrame, column_name: str,

pval = (CategoricalDataProcessor
._compute_p_value(X, y, category,
model_type,
self.scale_contingency_table))

# if not significant, add it to the list
Expand Down Expand Up @@ -348,24 +365,27 @@ def _get_small_categories(predictor_series: pd.Series,
incidence: float,
category_size_threshold: int) -> set:
"""Fetch categories with a size below a certain threshold.
Note that we use an additional weighting with the overall incidence
Note that we use an additional weighting with the overall incidence.

Parameters
----------
predictor_series : pd.Series
Description
Variables data.
incidence : float
global train incidence
Global train incidence.
category_size_threshold : int
minimal size of a category to keep it as a separate category
Minimal size of a category to keep it as a separate category.

Returns
-------
set
List a categories with a count below a certain threshold
Set of categories with a count below a certain threshold.
"""
category_counts = predictor_series.groupby(predictor_series).size()
factor = max(incidence, 1 - incidence)
if incidence is not None:
factor = max(incidence, 1 - incidence)
else:
factor = 1

# Get all categories with a count below a threshold
bool_mask = (category_counts*factor) <= category_size_threshold
Expand Down Expand Up @@ -404,10 +424,14 @@ def _replace_missings(data: pd.DataFrame,

@staticmethod
def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
model_type: str,
scale_contingency_table: bool) -> float:
"""Calculates p-value in contingency table (chi-square test) in
order to evaluate whether category of interest is significantly
different from the rest of the categories, given the target variable.
"""Calculates p-value in order to evaluate whether category of
interest is significantly different from the rest of the
categories, given the target variable.

If model_type is "classification", a chi-squared test based on a contingency table is used.
If model_type is "regression", a Kruskal-Wallis test is used.

Parameters
----------
Expand All @@ -416,31 +440,42 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
y : pd.Series
Target data.
category : str
Category for which we carry out the test
Category for which we carry out the test.
model_type : str
Model type ("classification" or "regression").
scale_contingency_table : bool
Whether we scale contingency table with incidence rate
Whether we scale contingency table with incidence rate.
Only used when model_type = "classification".

Returns
-------
float
p-value of chi-square test
p-value of applied statistical test
"""
df = pd.concat([X, y], axis=1)
df.columns = ["X", "y"]
df["other_categories"] = np.where(X == category, 0, 1)

contigency_table = pd.crosstab(index=df['other_categories'], columns=y,
margins=False)
if model_type == "classification":
contingency_table = pd.crosstab(index=df["other_categories"], columns=df["y"],
margins=False)

# if true, we scale the "other" categories
if scale_contingency_table:
size_other_cats = contingency_table.iloc[1].sum()
incidence_mean = y.mean()

contingency_table.iloc[1, 0] = (1-incidence_mean) * size_other_cats
contingency_table.iloc[1, 1] = incidence_mean * size_other_cats
contingency_table = contingency_table.values.astype(np.int64)

# if true, we scale the "other" categories
if scale_contingency_table:
size_other_cats = contigency_table.iloc[1].sum()
incidence_mean = y.mean()
pval = stats.chi2_contingency(contingency_table, correction=False)[1]

contigency_table.iloc[1, 0] = (1-incidence_mean) * size_other_cats
contigency_table.iloc[1, 1] = incidence_mean * size_other_cats
contigency_table = contigency_table.values.astype(np.int64)
elif model_type == "regression":
pval = stats.kruskal(df.y[df.other_categories == 0],
df.y[df.other_categories == 1])[1]

return stats.chi2_contingency(contigency_table, correction=False)[1]
return pval

@staticmethod
def _replace_categories(data: pd.Series, categories: set,
Expand Down
23 changes: 20 additions & 3 deletions tests/preprocessing/test_categorical_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def test_attributes_to_dict(self):
actual = processor.attributes_to_dict()

expected = {
"model_type": "classification",
"regroup": True,
"regroup_name": "Other",
"keep_missing": True,
Expand Down Expand Up @@ -72,22 +73,38 @@ def test_set_attributes_from_dict(self, attribute):
@pytest.mark.parametrize("scale_contingency_table, expected",
[(False, 0.01329),
(True, 0.43437)])
def test_compute_p_value(self, scale_contingency_table, expected):
def test_compute_p_value_classification(self, scale_contingency_table, expected):

X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10))
y = pd.Series(data=([0]*35 + [1]*35 + [0]*15 + [1]*5 + [0]*8 + [1]*2))
category = "c1"

actual = (CategoricalDataProcessor
._compute_p_value(X, y, category, scale_contingency_table))
._compute_p_value(X, y, category, "classification", scale_contingency_table))

assert pytest.approx(actual, abs=1e-5) == expected

# Regression case of _compute_p_value: with model_type "regression" the
# p-value comes from the Kruskal-Wallis branch (stats.kruskal). Each
# parametrized seed pins the random target so that the expected p-value
# is reproducible.
@pytest.mark.parametrize("seed, expected",
[(505, 0.02222),
(603, 0.89230)])
def test_compute_p_value_regression(self, seed, expected):

# seed the global NumPy RNG so the sampled target below is deterministic
np.random.seed(seed)

# 100 observations: 70 in "c1", 20 in "c2" and 10 in "c3"
X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10))
# continuous target: 100 uniform draws from [0, 1), multiplied by 5
y = pd.Series(data=np.random.uniform(0, 1, 100)*5)
# category that is tested against all other categories combined
category = "c1"

# None is passed for scale_contingency_table in this regression call
actual = (CategoricalDataProcessor
._compute_p_value(X, y, category, "regression", None))

# compare with the reference p-value using an absolute tolerance of 1e-5
assert pytest.approx(actual, abs=1e-5) == expected

def test_get_small_categories(self):

data = pd.Series(data=(["c1"]*50 + ["c2"]*25 + ["c3"]*15 + ["c4"]*5))
incidence = 0.35
threshold = 10 # to make it easy to manualy compute
threshold = 10 # to make it easy to manually compute
expected = {"c3", "c4"}

actual = (CategoricalDataProcessor
Expand Down