diff --git a/.gitignore b/.gitignore index 4ec86e3..94f578c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ -# Ignored directories in root folder +# Ignored directories in root folder + # Byte-compiled / optimized / DLL files __pycache__/ @@ -47,7 +48,6 @@ nosetests.xml coverage.xml *.cover .hypothesis/ -junit/ # Translations *.mo @@ -106,7 +106,12 @@ ENV/ # vscode settings .vscode/ -# Other ignored files +# JetBrains IDEs (PyCharm etc.) +.idea/ + +# Downloaded datasets (see cobra.datasets module): +datasets/ + +# Other ignored files *.pptx *.ppt -.idea/ diff --git a/README.rst b/README.rst index 22bf451..52282d4 100644 --- a/README.rst +++ b/README.rst @@ -9,13 +9,10 @@ ------------------------------------------------------------------------------------------------------------------------------------ -===== -cobra -===== -.. image:: material\logo.png +.. image:: material/logo.png :width: 300 -**cobra** is a Python package to build predictive models using logistic regression with a focus on performance and interpretation. It consists of several modules for data preprocessing, feature selection and model evaluation. The underlying methodology was developed at Python Predictions in the course of hundreds of business-related prediction challenges. It has been tweaked, tested and optimized over the years based on feedback from clients, our team, and academic researchers. +**Cobra** is a Python package to build predictive models using linear or logistic regression with a focus on performance and interpretation. It consists of several modules for data preprocessing, feature selection and model evaluation. The underlying methodology was developed at Python Predictions in the course of hundreds of business-related prediction challenges. It has been tweaked, tested and optimized over the years based on feedback from clients, our team, and academic researchers. 
Main Features ============= @@ -25,10 +22,10 @@ Main Features - partition into train/selection/validation sets - create bins from continuous variables - regroup categorical variables based on statistical significance - - replace missing values and - - add columns with incidence rate per category/bin + - replace missing values + - add columns with average of target values (linear regression) or incidence rate per category/bin (logistic regression) -- Perform univariate feature selection based on AUC +- Perform univariate feature selection based on RMSE (linear regression) or AUC (logistic regression) - Compute correlation matrix of predictors - Find the suitable variables using forward feature selection - Evaluate model performance and visualize the results @@ -80,8 +77,15 @@ Help and Support Documentation ------------- -- HTML documentation of the `individual modules `_ -- A step-by-step `tutorial `_ +HTML documentation of the `individual modules `_ + +**Logistic Regression** + +- A step-by-step tutorial ``_ + +**Linear Regression** + +- A step-by-step tutorial ``_ Outreach ------------- diff --git a/cobra/datasets/__init__.py b/cobra/datasets/__init__.py new file mode 100644 index 0000000..36e51ff --- /dev/null +++ b/cobra/datasets/__init__.py @@ -0,0 +1,219 @@ +""" +Dataset creation methods for specific test cases, e.g. memory consumption. + +Many machine learning libraries provide methods already to create datasets to +quickly run some experiments, +e.g. sklearn.datasets.make_classification() +(https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html#sklearn.datasets.make_classification). + +This package provides additional dataset creation methods for specific test +cases, e.g. very large datasets to try out Cobra's memory consumption. 
"""
Dataset creation methods for specific test cases, e.g. memory consumption.

Many machine learning libraries provide methods already to create datasets to
quickly run some experiments,
e.g. sklearn.datasets.make_classification()
(https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html#sklearn.datasets.make_classification).

This package provides additional dataset creation methods for specific test
cases, e.g. very large datasets to try out Cobra's memory consumption.
"""
import os

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# Snapshot EUR conversion rates per currency code, used to put the house
# prices of all 5 countries onto a single scale.
_EUR_RATES = {
    "ARS": 0.0092,   # 1 Argentine Peso equals 0.0092 Euro
    "USD": 0.84,     # 1 United States Dollar equals 0.84 Euro
    "COP": 0.00024,  # 1 Colombian Peso equals 0.00024 Euro
    "PEN": 0.23,     # 1 Sol equals 0.23 Euro
    "UYU": 0.019,    # 1 Uruguayan Peso equals 0.019 Euro
}


def make_large_house_prices_dataset(
        data_folder: str = './data/argentina-venta-de-propiedades',
        ask_download_confirmation: bool = True) -> pd.DataFrame:
    """
    Create a very large house prices dataset (shape: (2126816, 340))
    for classification and regression purposes, based on the Kaggle dataset at
    https://www.kaggle.com/msorondo/argentina-venta-de-propiedades.

    This method downloads the dataset from Kaggle,
    since including it directly in this repository would make the repository
    very large, while only a few users of this repository will use this
    dataset.

    To make the download work, you need to have a Kaggle API Token saved
    in a file on your computer.
    If this is not the case, a warning will be thrown to help you set this up.

    To execute this, we advise that your PC should have >= 16 GB RAM.

    Parameters
    ----------
    data_folder : str
        path of the folder in which the CSV files can be written
    ask_download_confirmation : bool
        whether a confirmation must be asked before the 2.47 GB of CSV files
        are downloaded. Set to False to run this method from a pytest unit
        test, because those don't support input() calls.

    Returns
    -------
    pd.DataFrame
        a 2Mx340 basetable, ready for classification or regression
        experiments.

    Raises
    ------
    ModuleNotFoundError
        In case the kaggle package is not installed.
        This is a dependency solely for this method, so it is not included
        in cobra's requirements.txt.
    IOError
        In case a Kaggle API token file is not available on your machine.
        See our help message printed in this case for how to solve this.
    """
    # Imported here instead of at the top of the file, since these modules
    # are only required for THIS specific dataset creation method.
    # Other dataset creation methods should not crash on the unavailability
    # of these modules if they don't use them.
    from kaggle.api.kaggle_api_extended import KaggleApi
    from zipfile import ZipFile

    setup_help_msg = r"""
    This method downloads the dataset from Kaggle,
    since including it directly in this repository would make the repository
    very large, while only a few users of this repository will use this dataset.

    To make the download work, you need to have a Kaggle API Token saved
    in a file on your computer.
    If this is not yet the case:
    1. Create a Kaggle account, if you don't have one yet.
    2. Log in on Kaggle's website.
    3. On your Kaggle account, under "API", select "Create New API Token" and
    a file "kaggle.json" will be downloaded on your computer.
    4. Move that "kaggle.json" file to the following path:
    "C:\Users\<username>\.kaggle".
    5. Run this method."""
    # Authenticate to Kaggle; print setup instructions when no token exists:
    try:
        api = KaggleApi()
        api.authenticate()
    except IOError:
        print(setup_help_msg)
        raise

    # Download and unzip the CSV files from Kaggle:
    if ask_download_confirmation:
        download_consent = input("Warning: 2.47 GB of CSV files will be "
                                 "downloaded from Kaggle. "
                                 "Is this OK? Type 'y' to continue:")
        if download_consent != 'y':
            raise RuntimeError("Stopped creating the houses dataset, "
                               "you did not consent to download it.")
    dataset = 'msorondo/argentina-venta-de-propiedades'
    # api.dataset_list_files(dataset) is buggy + we're discarding one file
    # (ar_properties.csv), so the files are listed manually:
    csv_files = [
        'uy_properties_crude.csv',  # smallest first for debugging
        'ar_properties_crude.csv',
        'co_properties_crude.csv',
        'ec_properties_crude.csv',
        'pe_properties_crude.csv',
    ]
    os.makedirs(data_folder, exist_ok=True)
    for csv_file in tqdm(csv_files,
                         desc="Downloading CSV files of the Kaggle dataset..."):
        api.dataset_download_file(dataset, csv_file, data_folder)
        zip_file = os.path.join(data_folder, csv_file + ".zip")
        with ZipFile(zip_file) as zf:
            zf.extract(csv_file, data_folder)
        os.remove(zip_file)  # keep only the extracted CSV on disk

    # Combine all CSVs into 1 big dataframe
    # & add the country of each loaded CSV file:
    print("Combining the CSVs into one basetable...")
    dfs = []
    for csv_file in csv_files:
        df = pd.read_csv(os.path.join(data_folder, csv_file))
        df["country"] = csv_file.split("_")[0]  # e.g. "uy_..." -> "uy"
        dfs.append(df)
    basetable = pd.concat(dfs, axis=0, ignore_index=True)
    del dfs  # free memory early, the loaded CSVs total several GB

    # Keep only houses for sale. (Some are paid for per month, we assume
    # those are houses for rent and less applicable in that case, for this
    # toy dataset).
    print("Filtering only houses for sale...")
    basetable = basetable[basetable.price_period.isna()]

    # The houses are from 5 different countries, with different currencies,
    # so create one target column (house price) in a single currency (EUR):
    print("Converting house prices in different currencies to EUR...")

    def price_to_eur(price, currency):
        # A missing currency means the price cannot be interpreted -> NaN.
        if pd.isna(currency):
            return np.nan
        try:
            return price * _EUR_RATES[currency]
        except KeyError:
            raise ValueError(f"Unexpected currency: {currency!r}.") from None

    # Use column names (not positional row[0]/row[1]) for robustness:
    basetable["price_EUR"] = basetable[["price", "currency"]].apply(
        lambda row: price_to_eur(row["price"], row["currency"]), axis=1)

    # Create a target column for a classification problem:
    # which houses are more expensive than 300K?
    # A Cobra model will then tell which features explain WHY this is
    # the case.
    basetable["price_EUR_>300K"] = basetable.price_EUR > 300_000

    # Derived features for the datetime columns:
    print("Creating derived features for the datetime columns...")
    datetime_features = ["start_date", "end_date", "created_on"]
    for datetime_feature in datetime_features:
        # errors='coerce' avoids OutOfBoundsDatetime on corrupt date values.
        basetable[datetime_feature] = pd.to_datetime(
            basetable[datetime_feature], format="%Y-%m-%d", errors='coerce')
        for part in ("year", "month", "day", "quarter"):
            basetable[f"{datetime_feature}_{part}"] = getattr(
                basetable[datetime_feature].dt, part)

    # To reproduce the cobra performance issues under the same circumstances,
    # we need the dataframe to have 300 columns.
    print("Adding extra columns with random values, "
          "to reproduce bigger dataframes...")
    # => Add some irrelevant features (features with random values):
    num_random_features = 150
    random_feature_cols = [f"random_feature_{feat_idx}"
                           for feat_idx in range(num_random_features)]
    df_randoms = pd.DataFrame(
        np.random.rand(basetable.shape[0], num_random_features) * 100,
        columns=random_feature_cols)
    basetable = pd.concat([basetable, df_randoms], axis=1)
    print("Dataframe shape after adding irrelevant features:",
          basetable.shape)
    # => ... and add some correlated (in this case identical) features.
    # Note: rename() already returns a new dataframe, no .copy() needed.
    df_corr = df_randoms.rename(columns={col: col + "_corr"
                                         for col in random_feature_cols})
    basetable = pd.concat([basetable, df_corr], axis=1)
    del df_randoms, df_corr  # free memory before returning
    print("Dataframe shape after adding correlated features:",
          basetable.shape)

    return basetable
-help: - @echo "Please use \`make ' where is one of" - @echo " html to make standalone HTML files" - @echo " man to make manual pages" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - -clean: - -rm -rf $(BUILDDIR)/* - -html: - $(SPHINXBUILD) -b html $(SOURCEDIR) $(BUILDDIR)/html $(SPHINXOPTS) - -man: - $(SPHINXBUILD) -b man $(SPHINXOPTS) $(BUILDDIR)/man - -changes: - $(SPHINXBUILD) -b changes $(SPHINXOPTS) $(BUILDDIR)/changes - -linkcheck: - $(SPHINXBUILD) -b linkcheck $(SPHINXOPTS) $(BUILDDIR)/linkcheck - -doctest: - $(SPHINXBUILD) -b doctest $(SPHINXOPTS) $(BUILDDIR)/doctest diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 6247f7e..0000000 --- a/docs/make.bat +++ /dev/null @@ -1,35 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=source -set BUILDDIR=build - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd diff --git a/docs/source/conf.py b/docs/source/conf.py deleted file mode 100644 index 449fd5a..0000000 --- a/docs/source/conf.py +++ /dev/null @@ -1,83 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. 
For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. - -import os -import sys -sys.path.insert(0, os.path.abspath('../../')) - - -# -- Project information ----------------------------------------------------- - -project = 'cobra' -copyright = '2020, Python Predictions' -author = 'Python Predictions' - -# The full version, including alpha/beta/rc tags -release = '1.0.0' - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'sphinx.ext.todo', - 'sphinx.ext.viewcode', - 'sphinx.ext.autodoc', - 'sphinx.ext.githubpages', - 'sphinx.ext.napoleon' -] - -autodoc_member_order = 'bysource' - -# Napoleon settings -napoleon_google_docstring = True -napoleon_numpy_docstring = True -napoleon_include_init_with_doc = True -napoleon_include_private_with_doc = False -napoleon_include_special_with_doc = True -napoleon_use_admonition_for_examples = False -napoleon_use_admonition_for_notes = False -napoleon_use_admonition_for_references = False -napoleon_use_ivar = False -napoleon_use_param = True -napoleon_use_rtype = True -napoleon_type_aliases = None - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix(es) of source filenames. -source_suffix = ['.rst', '.md'] - -# The master toctree document. -master_doc = 'index' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. 
-# This pattern also affects html_static_path and html_extra_path . -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = 'sphinx_rtd_theme' - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -html_favicon = 'images/cobra_icon.png' diff --git a/docs/source/index.rst b/docs/source/index.rst deleted file mode 100644 index 92055b9..0000000 --- a/docs/source/index.rst +++ /dev/null @@ -1,37 +0,0 @@ -.. cobra documentation master file, created by - sphinx-quickstart on Thu Dec 3 11:55:07 2020. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -********************************* -Welcome to cobra's documentation! -********************************* - -.. include:: ../../README.rst - -.. toctree:: - :maxdepth: 2 - :hidden: - :caption: Contents: - -.. toctree:: - :maxdepth: 4 - :hidden: - :caption: Tutorial - - tutorial - -.. toctree:: - :maxdepth: 4 - :hidden: - :caption: API Reference - - docstring/modules - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst deleted file mode 100644 index d8d69b1..0000000 --- a/docs/source/tutorial.rst +++ /dev/null @@ -1,224 +0,0 @@ -Tutorial -======== - -This section we will walk you through all the required steps to build a predictive model using cobra. All classes and functions used here are well-documented. 
In case you want more information on a class or function, simply read the corresponding parts in the documentation or run the following python snippet from e.g. a notebook: - -.. code-block:: python - - help(function_or_class_you_want_info_from) - -Building a good model involves three steps - - - preprocessing: properly prepare the predictors (a synonym for "feature" or variable that we use throughout this tutorial) for modelling. - - feature selection: automatically select a subset of predictors which contribute most to the target variable or output in which you are interested. - - model evaluation: once a model has been build, a detailed evaluation can be performed by computing all sorts of evaluation metrics. - -In the examples below, we assume the data for model building is available in a pandas DataFrame called ``basetable``. This DataFrame should at least contain an ID column (e.g. "customernumber"), a target column (e.g. "TARGET") and a number of candidate predictors (features) to build a model with. - -Preprocessing -------------- - -The first part focusses on preparing the predictors for modelling by: - -- Splitting the dataset into training, selection and validation datasets. -- binning continuous variables into discrete intervals -- Replacing missing values of both categorical and continuous variables (which are now binned) with an additional "Missing" bin/category -- Regrouping categories in new category "other" -- Replacing bins/categories with their corresponding incidence rate per category/bin. - -This will be taken care of by the ``PreProcessor`` class, which has a scikit-learn like interface (i.e. ``fit`` & ``transform``) - -.. 
code-block:: python - - import json - from cobra.preprocessing import PreProcessor - - # Prepare data - # create instance of PreProcessor from parameters - # There are many options possible, see API reference, but here - # we will use all the defaults - preprocessor = PreProcessor.from_params() - - # split data into train-selection-validation set - # in the result, an additional column "split" will be created - # containing each of those values - basetable = preprocessor.train_selection_validation_split( - basetable, - train_prop=0.6, selection_prop=0.2, - validation_prop=0.2) - - # create list containing the column names of the discrete resp. - # continuous variables - continuous_vars = [] - discrete_vars = [] - - # fit the pipeline - preprocessor.fit(basetable[basetable["split"]=="train"], - continuous_vars=continuous_vars, - discrete_vars=discrete_vars, - target_column_name=target_column_name) - - # store fitted preprocessing pipeline as a JSON file - pipeline = preprocessor.serialize_pipeline() - - # I/O outside of PreProcessor to allow flexibility (e.g. upload to S3, ...) - path = "path/to/store/preprocessing/pipeline/as/json/file/for/later/re-use.json" - with open(path, "w") as file: - json.dump(pipeline, file) - - # transform the data (e.g. perform discretisation, incidence replacement, ...) - basetable = preprocessor.transform(basetable, - continuous_vars=continuous_vars, - discrete_vars=discrete_vars) - - # When you want to reuse the pipeline the next time, simply load it back in again - # using the following snippet: - # with open(path, "r") as file: - # pipeline = json.load(file) - # preprocessor = PreProcessor.from_pipeline(pipeline) and you're good to go! - -Feature selection ------------------ - -Once the predictors are properly prepared, we can start building a predictive model, which boils down to selecting the right predictors from the dataset to train a model on. 
As a dataset typically contains many predictors, we can first perform a univariate preselection to rule out any predictor with little to no predictive power. - -This preselection is based on an AUC threshold of a univariate model on the train and selection datasets. As the AUC just calculates the quality of a ranking, all monotonous transformations of a given ranking (i.e. transformations that do not alter the ranking itself) will lead to the same AUC. Hence, pushing a categorical variable (incl. a binned continuous variable) through a logistic regression will produce exactly the same ranking as using target encoding, as it will produce the exact same output: a ranking of the categories on the training/selection set. Therefore, no univariate model is trained here as the target encoded train and selection data is used as predicted scores to compute the AUC with against the target. - -.. code-block:: python - - from cobra.model_building import univariate_selection - from cobra.evaluation import plot_univariate_predictor_quality - from cobra.evaluation import plot_correlation_matrix - - # Get list of predictor names to use for univariate_selection - preprocessed_predictors = [col for col in basetable.columns if col.endswith("_enc")] - - # perform univariate selection on preprocessed predictors: - df_auc = univariate_selection.compute_univariate_preselection( - target_enc_train_data=basetable[basetable["split"] == "train"], - target_enc_selection_data=basetable[basetable["split"] == "selection"], - predictors=preprocessed_predictors, - target_column=target_column_name, - preselect_auc_threshold=0.53, # if auc_selection <= 0.53 exclude predictor - preselect_overtrain_threshold=0.05 # if (auc_train - auc_selection) >= 0.05 --> overfitting! 
- ) - - # Plot df_auc to get a horizontal barplot: - plot_univariate_predictor_quality(df_auc) - - # compute correlations between preprocessed predictors: - df_corr = (univariate_selection - .compute_correlations(basetable[basetable["split"] == "train"], - preprocessed_predictors)) - - # plot correlation matrix - plot_correlation_matrix(df_corr) - - # get a list of predictors selection by the univariate selection - preselected_predictors = (univariate_selection - .get_preselected_predictors(df_auc)) - -After an initial preselection on the predictors, we can start building the model itself using forward feature selection to choose the right set of predictors. Since we use target encoding on all our predictors, we will only consider models with positive coefficients (no sign flip should occur) as this makes the model more interpretable. - -.. code-block:: python - - from cobra.model_building import ForwardFeatureSelection - from cobra.evaluation import plot_performance_curves - from cobra.evaluation import plot_variable_importance - - forward_selection = ForwardFeatureSelection(max_predictors=30, - pos_only=True) - - # fit the forward feature selection on the train data - # has optional parameters to force and/or exclude certain predictors (see docs) - forward_selection.fit(basetable[basetable["split"] == "train"], - target_column_name, - preselected_predictors) - - # compute model performance (e.g. AUC for train-selection-validation) - performances = (forward_selection - .compute_model_performances(basetable, target_column_name)) - - # plot performance curves - plot_performance_curves(performances) - -Based on the performance curves (AUC per model with a particular number of predictors in case of logistic regression), a final model can then be chosen and the variables importance can be plotted: - -.. 
code-block:: python - - # After plotting the performances and selecting the model, - # we can extract this model from the forward_selection class: - model = forward_selection.get_model_from_step(5) - - # Note that chosen model has 6 variables (python lists start with index 0), - # which can be obtained as follows: - final_predictors = model.predictors - # We can also compute and plot the importance of each predictor in the model: - variable_importance = model.compute_variable_importance( - basetable[basetable["split"] == "selection"] - ) - plot_variable_importance(variable_importance) - -**Note**: variable importance is based on correlation of the predictor with the *model scores* (and not the true labels!). - -Finally, we can again export the model to a dictionary to store it as JSON - -.. code-block:: python - - model_dict = model.serialize() - - with open(path, "w") as file: - json.dump(model_dict, file) - - # To reload the model again from a JSON file, run the following snippet: - # from cobra.model_building import LogisticRegressionModel - # with open(path, "r") as file: - # model_dict = json.load(file) - # model = LogisticRegressionModel() - # model.deserialize(model_dict) - -Evaluation ----------- - -Now that we have build and selected a final model, it is time to evaluate it against various evaluation metrics: - -.. code-block:: python - - from cobra.evaluation import Evaluator - - # get numpy array of True target labels and predicted scores: - y_true = basetable[basetable["split"] == "selection"][target_column_name].values - y_pred = model.score_model(basetable[basetable["split"] == "selection"]) - - evaluator = Evaluator() - evaluator.fit(y_true, y_pred) # Automatically find the best cut-off probability - - # Get various scalar metrics such as accuracy, AUC, precision, recall, ... 
- evaluator.scalar_metrics - - # Plot non-scalar evaluation metrics: - evaluator.plot_roc_curve() - - evaluator.plot_confusion_matrix() - - evaluator.plot_cumulative_gains() - - evaluator.plot_lift_curve() - - evaluator.plot_cumulative_response_curve() - -Additionally, we can also compute the output needed to plot the so-called Predictor Insights Graphs (PIGs in short). These are graphs that represents the insights of the relationship between a single predictor (e.g. age) and the target (e.g. burnouts). This is a graph where the predictor is binned into groups, and where we represent group size in bars and group (target) incidence in a colored line. We have the option to force order of predictor values. - -.. code-block:: python - - from cobra.evaluation import generate_pig_tables - from cobra.evaluation import plot_incidence - - predictor_list = [col for col in basetable.columns - if col.endswith("_bin") or col.endswith("_processed")] - pig_tables = generate_pig_tables(basetable[basetable["split"] == "selection"], - id_column_name=id_column_name, - target_column_name=target_column_name, - preprocessed_predictors=predictor_list) - # Plot PIGs - plot_incidence(pig_tables, 'predictor_name', predictor_order) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 9670f11..799d87a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ numpy>=1.19.4 pandas>=1.1.5 scipy>=1.5.4 -scikit-learn>=0.24 +scikit-learn>=0.23.1 matplotlib>=3.3.3 seaborn>=0.11.0 -tqdm>=4.59.0 \ No newline at end of file +tqdm>=4.59.0 diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py index 2e42759..864405b 100644 --- a/tests/preprocessing/test_preprocessor.py +++ b/tests/preprocessing/test_preprocessor.py @@ -7,6 +7,7 @@ import pandas as pd from cobra.preprocessing.preprocessor import PreProcessor +from cobra.datasets import make_large_house_prices_dataset @contextmanager @@ -148,3 +149,71 @@ def 
test_get_variable_list(self, continuous_vars: list, discrete_vars) assert actual == expected + + @pytest.mark.skip() # Only meant to be run manually. + def test_preprocessor_performance_on_large_dataset(self, + data_folder='../../datasets/argentina-venta-de-propiedades'): + """ + Download a large housing dataset and run the PreProcessor on it, + to check and debug its performance. + + This test is meant to be run manually only, + since it would slow down the automatic tests if included + *and* there are no real assertions to be made here - it's meant more + as a thing to run manually and go over (or debug) the code. + + To run: + - disable the skip marking of this test and debug it + - call this test from a python console (this will print progress + messages, the first option won't): + from tests.preprocessing.test_preprocessor import TestPreProcessor + tpp = TestPreProcessor() + tpp.test_preprocessor_performance_on_large_dataset(data_folder='./datasets/argentina-venta-de-propiedades') + """ + print("Creating basetable...") + basetable = make_large_house_prices_dataset( + data_folder, + ask_download_confirmation=False) # input() call doesn't work in pytest. 
+ + # Preparing the preprocessor to do the performance testing: + preprocessor = PreProcessor.from_params() + basetable = preprocessor.train_selection_validation_split(basetable, + train_prop=0.7, + selection_prop=0.15, + validation_prop=0.15) + + # Setting which vars are discrete and which are continuous: + derived_datetime_features = [col for col in basetable.columns + if col.startswith("start_date") + or col.startswith("end_date") + or col.startswith("created_on")] + hierarchical_location_features = ["l1", "l2", "l3", "l4", "l5", "l6"] + rooms_features = ["rooms", "bedrooms", "bathrooms"] + discrete_vars = ["ad_type"] + \ + derived_datetime_features + \ + hierarchical_location_features + \ + rooms_features + \ + ["property_type", "operation_type", "country"] + # Note: "title" and "description" are not included here, they would help + # create a better model, but our primary interest here is testing + # just the preprocessing performance instead... + + random_feature_cols = [col for col in basetable.columns + if col.startswith("random_feature")] + continuous_vars = ["lat", "lon", "surface_total", "surface_covered"] + \ + random_feature_cols + + # Setting the target column: + target_clf = "price_EUR_>300K" + target_regr = "price_EUR" + + print("Fitting the preprocessor...") + preprocessor.fit(basetable[basetable["split"] == "train"], + continuous_vars=continuous_vars, + discrete_vars=discrete_vars, + target_column_name=target_clf) + + print("Transforming the preprocessor...") + basetable = preprocessor.transform(basetable, + continuous_vars=continuous_vars, + discrete_vars=discrete_vars)