Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cobra/preprocessing/categorical_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class CategoricalDataProcessor(BaseEstimator):
keep_missing : bool
Whether or not to keep missing as a separate category.
model_type : str
Model type ("classification" or "regression").
Model type (``classification`` or ``regression``).
p_value_threshold : float
Significance threshold for regrouping.
regroup : bool
Expand Down Expand Up @@ -442,7 +442,7 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
category : str
Category for which we carry out the test.
model_type : str
Model type ("classification" or "regression").
Model type (``classification`` or ``regression``).
scale_contingency_table : bool
Whether we scale contingency table with incidence rate.
Only used when model_type = "classification".
Expand Down
57 changes: 34 additions & 23 deletions cobra/preprocessing/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,23 +44,27 @@ class PreProcessor(BaseEstimator):
----------
categorical_data_processor : CategoricalDataProcessor
Instance of CategoricalDataProcessor to do the preprocessing of
categorical variables
categorical variables. The model_type variable is specified
here (``classification`` or ``regression``).
discretizer : KBinsDiscretizer
Instance of KBinsDiscretizer to do the preprocessing of continuous
variables by means of discretization
variables by means of discretization.
serialization_path : str
path to save the pipeline to
Path to save the pipeline to.
stratify_split : bool
Whether or not to stratify the train-test split
Whether or not to stratify the train-test split.
target_encoder : TargetEncoder
Instance of TargetEncoder to do the incidence replacement
Instance of TargetEncoder to do the incidence replacement.
"""

def __init__(self, categorical_data_processor: CategoricalDataProcessor,
def __init__(self,
categorical_data_processor: CategoricalDataProcessor,
discretizer: KBinsDiscretizer,
target_encoder: TargetEncoder,
is_fitted: bool = False):

self.model_type = categorical_data_processor.model_type

self._categorical_data_processor = categorical_data_processor
self._discretizer = discretizer
self._target_encoder = target_encoder
Expand All @@ -69,6 +73,7 @@ def __init__(self, categorical_data_processor: CategoricalDataProcessor,

@classmethod
def from_params(cls,
model_type: str = "classification",
n_bins: int = 10,
strategy: str = "quantile",
closed: str = "right",
Expand All @@ -91,16 +96,18 @@ def from_params(cls,

Parameters
----------
model_type : str
Model type (``classification`` or ``regression``).
n_bins : int, optional
Number of bins to produce. Raises ValueError if ``n_bins < 2``.
strategy : str, optional
Binning strategy. Currently only ``uniform`` and ``quantile``
e.g. equifrequency is supported
(i.e. equifrequency) are supported.
closed : str, optional
Whether to close the bins (intervals) from the left or right
Whether to close the bins (intervals) from the left or right.
auto_adapt_bins : bool, optional
reduces the number of bins (starting from n_bins) as a function of
the number of missings
Reduces the number of bins (starting from n_bins) as a function of
the number of missings.
starting_precision : int, optional
Initial precision for the bin edges to start from,
can also be negative. Given a list of bin edges, the class will
Expand All @@ -110,33 +117,32 @@ def from_params(cls,
will be made to round up the numbers of the bin edges
e.g. ``5.55 -> 10``, ``146 -> 100``, ...
label_format : str, optional
format string to display the bin labels
Format string to display the bin labels
e.g. ``min - max``, ``(min, max]``, ...
change_endpoint_format : bool, optional
Whether or not to change the format of the lower and upper bins
into ``< x`` and ``> y`` resp.
regroup : bool
Whether or not to regroup categories
Whether or not to regroup categories.
regroup_name : str
New name of the non-significant regrouped variables
New name of the non-significant regrouped variables.
keep_missing : bool
Whether or not to keep missing as a separate category
Whether or not to keep missing as a separate category.
category_size_threshold : int
minimal size of a category to keep it as a separate category
Minimal size of a category to keep it as a separate category.
p_value_threshold : float
Significance threshold for regrouping.
forced_categories : dict
Map to prevent certain categories from being grouped into ``Other``
for each column - dict of the form ``{col:[forced vars]}``.
scale_contingency_table : bool
Whether contingency table should be scaled before chi^2.'
Whether contingency table should be scaled before chi^2.
weight : float, optional
Smoothing parameter (non-negative). The higher the value of the
parameter, the bigger the contribution of the overall mean.
When set to zero, there is no smoothing
(e.g. the pure target incidence is used).
When set to zero, there is no smoothing (i.e. the pure target incidence is used).
imputation_strategy : str, optional
in case there is a particular column which contains new categories,
In case there is a particular column which contains new categories,
the encoding will lead to NULL values which should be imputed.
Valid strategies are to replace with the global mean of the train
set or the min (resp. max) incidence of the categories of that
Expand All @@ -145,23 +151,26 @@ def from_params(cls,
Returns
-------
PreProcessor
Description
Class encapsulating CategoricalDataProcessor,
KBinsDiscretizer, and TargetEncoder instances.
"""
categorical_data_processor = CategoricalDataProcessor(
model_type,
regroup,
regroup_name,
keep_missing,
category_size_threshold,
p_value_threshold,
scale_contingency_table,
forced_categories)

discretizer = KBinsDiscretizer(n_bins, strategy, closed,
auto_adapt_bins,
starting_precision,
label_format,
change_endpoint_format)

target_encoder = TargetEncoder(weight)
target_encoder = TargetEncoder(weight, imputation_strategy)

return cls(categorical_data_processor, discretizer, target_encoder)

Expand All @@ -187,20 +196,22 @@ def from_pipeline(cls, pipeline: dict):
"""

if not PreProcessor._is_valid_pipeline(pipeline):
raise ValueError("Invalid pipeline") # To do: specify error
raise ValueError("Invalid pipeline") ## TODO: specify error

categorical_data_processor = CategoricalDataProcessor()
categorical_data_processor.set_attributes_from_dict(
pipeline["categorical_data_processor"]
)
model_type = categorical_data_processor.model_type

discretizer = KBinsDiscretizer()
discretizer.set_attributes_from_dict(pipeline["discretizer"])

target_encoder = TargetEncoder()
target_encoder.set_attributes_from_dict(pipeline["target_encoder"])

return cls(categorical_data_processor, discretizer, target_encoder,
return cls(model_type,
categorical_data_processor, discretizer, target_encoder,
is_fitted=pipeline["_is_fitted"])

def fit(self, train_data: pd.DataFrame, continuous_vars: list,
Expand Down
1 change: 1 addition & 0 deletions tests/preprocessing/test_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def test_is_valid_pipeline(self, injection_location: str,
# is_valid_pipeline only checks for relevant keys atm
pipeline_dict = {
"categorical_data_processor": {
"model_type": None,
"regroup": None,
"regroup_name": None,
"keep_missing": None,
Expand Down