Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 87 additions & 58 deletions cobra/evaluation/pigs_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,52 +7,55 @@
import cobra.utils as utils


def generate_pig_tables(basetable: pd.DataFrame,
                        id_column_name: str,
                        target_column_name: str,
                        preprocessed_predictors: list) -> pd.DataFrame:
    """Compute PIG tables for all predictors in preprocessed_predictors.

    The output is a DataFrame with columns ``variable``, ``label``,
    ``pop_size``, ``global_avg_target`` and ``avg_target``.

    Parameters
    ----------
    basetable : pd.DataFrame
        Basetable to compute PIG tables from.
    id_column_name : str
        Name of the basetable column containing the IDs of the basetable rows
        (e.g. customernumber).
    target_column_name : str
        Name of the basetable column containing the target values to predict.
    preprocessed_predictors : list
        List of basetable column names containing preprocessed predictors.

    Returns
    -------
    pd.DataFrame
        DataFrame containing a PIG table for all predictors.
    """
    # The id and target columns are never predictors themselves, so they are
    # excluded even if they appear in the preprocessed_predictors list.
    # Sorting keeps the output order deterministic across runs.
    pigs = [
        compute_pig_table(basetable,
                          column_name,
                          target_column_name,
                          id_column_name)
        for column_name in sorted(preprocessed_predictors)
        if column_name not in [id_column_name, target_column_name]
    ]
    output = pd.concat(pigs)

    return output


def compute_pig_table(basetable: pd.DataFrame,
                      predictor_column_name: str,
                      target_column_name: str,
                      id_column_name: str) -> pd.DataFrame:
    """Compute the PIG table of a given predictor for a given target.

    Parameters
    ----------
    basetable : pd.DataFrame
        Input data from which to compute the PIG table.
    predictor_column_name : str
        Predictor name of which to compute the PIG table.
    target_column_name : str
        Name of the target variable.
    id_column_name : str
        Name of the id column (used to count the bin sizes).

    Returns
    -------
    pd.DataFrame
        PIG table as a DataFrame with columns ``variable``, ``label``,
        ``pop_size``, ``global_avg_target`` and ``avg_target``.
    """
    global_avg_target = basetable[target_column_name].mean()

    # Group by the binned variable, compute the mean of the target per bin
    # and the bin size (COUNT of id_column_name), then rename the columns.
    res = (basetable.groupby(predictor_column_name)
           .agg({target_column_name: "mean", id_column_name: "size"})
           .reset_index()
           .rename(columns={predictor_column_name: "label",
                            target_column_name: "avg_target",
                            id_column_name: "pop_size"}))

    # Add the cleaned predictor name as a variable column, add the global
    # average target, and replace the absolute population size by the
    # fraction of the total population.
    res["variable"] = utils.clean_predictor_name(predictor_column_name)
    res["global_avg_target"] = global_avg_target
    res["pop_size"] = res["pop_size"]/len(basetable.index)

    # Make sure to always return the data with the proper column order.
    column_order = ["variable", "label", "pop_size",
                    "global_avg_target", "avg_target"]

    return res[column_order]


def plot_incidence(df: pd.DataFrame, variable: str,
column_order: list = None, dim: tuple = (12, 8)):
"""Function plots Predictor Incidence Graphs (PIGs).
Bins are ordered in descening order of bin incidence
def plot_incidence(pig_tables: pd.DataFrame,
variable: str,
model_type: str,
column_order: list = None,
dim: tuple = (12, 8)):
"""Plots a Predictor Insights Graph (PIG), a graph in which the mean
target value is plotted for a number of bins constructed from a predictor
variable. When the target actually is a binary classification target,
the plotted mean target value is actually the target incidence rate.

Bins are ordered in descending order of mean target value
unless specified otherwise with `column_order` list.

Parameters
----------
df: pd.DataFrame
dataframe with cleaned, binned, partitioned and prepared data
pig_tables: pd.DataFrame
dataframe with cleaned, binned, partitioned and prepared data,
as created by generate_pig_tables() from this module.
variable: str
variable for which the incidence plot will be shown
name of the predictor variable for which the PIG will be plotted.
model_type: str
type of model (either "classification" or "regression").
column_order: list, default=None
explicit order of variable
explicit order of the value bins of the predictor variable to be used
on the PIG.
dim: tuple, default=(12, 8)
tuple with width and lentgh of the plot
optional tuple to configure the width and length of the plot.
"""
df_plot = df[df['variable'] == variable].copy()
if model_type not in ["classification", "regression"]:
raise ValueError("An unexpected value was set for the model_type "
"parameter. Expected 'classification' or "
"'regression'.")

if column_order is not None:
df_plot = pig_tables[pig_tables['variable'] == variable].copy()

if column_order is not None:
if not set(df_plot['label']) == set(column_order):
raise ValueError(
'Variables in column_order and dataframe are not equal')
'The column_order and pig_tables parameters do not contain '
'the same set of variables.')

df_plot['label'] = df_plot['label'].astype('category')
df_plot['label'].cat.reorder_categories(column_order,
Expand All @@ -121,37 +141,42 @@ def plot_incidence(df: pd.DataFrame, variable: str,
df_plot.sort_values(by=['label'], ascending=True, inplace=True)
df_plot.reset_index(inplace=True)
else:
df_plot.sort_values(by=['incidence'], ascending=False, inplace=True)
df_plot.sort_values(by=['avg_target'], ascending=False, inplace=True)
df_plot.reset_index(inplace=True)

with plt.style.context("seaborn-ticks"):
fig, ax = plt.subplots(figsize=dim)

# -----------------
# Left axis - incidence
# Left axis - average target
# -----------------
ax.plot(df_plot['label'], df_plot['incidence'],
ax.plot(df_plot['label'], df_plot['avg_target'],
color="#00ccff", marker=".",
markersize=20, linewidth=3, label='incidence rate per bin',
markersize=20, linewidth=3,
label='incidence rate per bin' if model_type == "classification" else "mean target value per bin",
zorder=10)

ax.plot(df_plot['label'], df_plot['avg_incidence'],
ax.plot(df_plot['label'], df_plot['global_avg_target'],
color="#022252", linestyle='--', linewidth=4,
label='average incidence rate',
label='average incidence rate' if model_type == "classification" else "global mean target value",
zorder=10)

# dummy line to have label on second axis from first
ax.plot(np.nan, "#939598", linewidth=6, label='bin size')

# set labels & ticks
ax.set_ylabel('incidence', fontsize=16)
ax.set_ylabel('incidence' if model_type == "classification" else "mean target value",
fontsize=16)
ax.set_xlabel('{} bins' ''.format(variable), fontsize=16)
ax.xaxis.set_tick_params(rotation=45, labelsize=14)
ax.yaxis.set_tick_params(labelsize=14)

ax.set_yticks(np.arange(0, max(df_plot['incidence'])+0.05, 0.05))
ax.yaxis.set_major_formatter(
FuncFormatter(lambda y, _: '{:.1%}'.format(y)))
if model_type == "classification":
# Mean target values are between 0 and 1 (target incidence rate),
# so format them as percentages:
ax.set_yticks(np.arange(0, max(df_plot['avg_target'])+0.05, 0.05))
ax.yaxis.set_major_formatter(
FuncFormatter(lambda y, _: '{:.1%}'.format(y)))

# removes ticks but keeps the labels
ax.tick_params(axis='both', which='both', length=0)
Expand Down Expand Up @@ -185,7 +210,11 @@ def plot_incidence(df: pd.DataFrame, variable: str,
ax2.grid(False)

# title & legend
fig.suptitle('Incidence Plot - ' + variable, fontsize=22, y=1.02)
if model_type == "classification":
title = "Incidence plot - " + variable
else:
title = "Mean target plot - " + variable
fig.suptitle(title, fontsize=22, y=1.02)
ax.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102),
loc=3, ncol=1, mode="expand", borderaxespad=0.,
prop={"size": 14})
Expand Down
4 changes: 2 additions & 2 deletions cobra/preprocessing/categorical_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class CategoricalDataProcessor(BaseEstimator):
keep_missing : bool
Whether or not to keep missing as a separate category.
model_type : str
Model type ("classification" or "regression").
Model type (``classification`` or ``regression``).
p_value_threshold : float
Significance threshold for regrouping.
regroup : bool
Expand Down Expand Up @@ -442,7 +442,7 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
category : str
Category for which we carry out the test.
model_type : str
Model type ("classification" or "regression").
Model type (``classification`` or ``regression``).
scale_contingency_table : bool
Whether we scale contingency table with incidence rate.
Only used when model_type = "classification".
Expand Down
Loading