Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
combined feature_pipeline + estimator Pipeline
- Can pass a feature pipeline to `Dataset.plot` methods, to apply preprocessing
before visualization
- New config implementation. If you need to reset the configuration, you should use `Model.config.reset_config()`

# v0.10.3
- Fixed typehints in Dataset
Expand Down
47 changes: 16 additions & 31 deletions docs/config.inc.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,56 +4,41 @@ Config
------
All configuration options available

:class: ml_tooling.config.DefaultConfig
.. autoclass:: ml_tooling.config.DefaultConfig


:attr:`VERBOSITY` = 0
The level of verbosity from output


:attr:`CLASSIFIER_METRIC` = 'accuracy'

Default metric for classifiers
Default metric for classifiers

:attr:`REGRESSION_METRIC` = 'r2'

Default metric for regressions
Default metric for regressions

:attr:`CROSS_VALIDATION` = 10

Default Number of cross validation folds to use

:attr:`STYLE_SHEET` = 'almbrand.mplstyle'

Default style sheet to use for plots
Default Number of cross validation folds to use

:attr:`N_JOBS` = -1

Default number of cores to use when doing multiprocessing.
-1 means use all available
Default number of cores to use when doing multiprocessing.
-1 means use all available

:attr:`RANDOM_STATE` = 42

Default random state seed for all functions involving randomness
Default random state seed for all functions involving randomness

:attr:`RUN_DIR` = './runs'
Default folder to store run logging files

Default folder to store run logging files

:attr:`MODEL_DIR` = './models'

Default folder to store pickled models in
:attr:`ESTIMATOR_DIR` = './models'
Default folder to store pickled models in

:attr:`LOG` = False
Toggles whether or not to log runs to a file. Set to True if you
want every run to be logged, else use the :meth:`~ml_tooling.baseclass.ModelData.log`
context manager

Toggles whether or not to log runs to a file. Set to True if you
want every run to be logged, else use the :meth:`~ml_tooling.baseclass.ModelData.log`
context manager

:attr:`SHUFFLE` = True

Default whether or not to shuffle data for test set
:attr:`TRAIN_TEST_SHUFFLE` = True
Default whether or not to shuffle data for test set

:attr:`TEST_SIZE` = 0.25

Default percentage of data that will be part of the test set
Default percentage of data that will be part of the test set
22 changes: 6 additions & 16 deletions src/ml_tooling/baseclass.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
import datetime
import pathlib
import joblib
import pandas as pd
from contextlib import contextmanager
from importlib.resources import path as import_path
from typing import Tuple, Optional, Sequence, Union, List, Iterable, Any

import joblib
import pandas as pd
from sklearn.base import is_classifier, is_regressor
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import check_cv
from sklearn.pipeline import Pipeline

from ml_tooling.config import DefaultConfig, ConfigGetter
from ml_tooling.config import config
from ml_tooling.data.base_data import Dataset
from ml_tooling.logging.logger import create_logger
from ml_tooling.metrics.metric import Metrics
Expand Down Expand Up @@ -40,9 +41,6 @@ class Model:
Wrapper class for Estimators
"""

_config = None
config = ConfigGetter()

def __init__(self, estimator: Estimator, feature_pipeline: Pipeline = None):
"""
Parameters
Expand All @@ -57,6 +55,7 @@ def __init__(self, estimator: Estimator, feature_pipeline: Pipeline = None):
self._estimator: Estimator = _validate_estimator(estimator)
self.feature_pipeline = feature_pipeline
self.result: Optional[ResultType] = None
self.config = config

@property
def estimator(self):
Expand Down Expand Up @@ -437,7 +436,7 @@ def score_estimator(
if not data.has_validation_set:
data.create_train_test(
stratify=self.is_classifier,
shuffle=self.config.SHUFFLE,
shuffle=self.config.TRAIN_TEST_SHUFFLE,
test_size=self.config.TEST_SIZE,
seed=self.config.RANDOM_STATE,
)
Expand Down Expand Up @@ -665,15 +664,6 @@ def load_production_estimator(cls, module_name: str):
estimator = joblib.load(path)
return cls(estimator)

@classmethod
def reset_config(cls):
    """
    Reset configuration to default.

    Replaces the cached class-level config with a fresh
    :class:`DefaultConfig` and returns the class so the call can be chained.
    """
    # Dropping the cache is enough: the ConfigGetter descriptor rebuilds
    # from this attribute on next access.
    cls._config = DefaultConfig()

    return cls

def __repr__(self):
return f"<Model: {self.estimator_name}>"

Expand Down
49 changes: 28 additions & 21 deletions src/ml_tooling/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,27 +12,43 @@

class DefaultConfig:
"""
Configuration for a given BaseClass. Configs propagate through each instance
Configuration for Models
"""

# Canonical default values for every configuration attribute;
# _set_config copies these onto the instance on init and on reset.
default_config = {
    "VERBOSITY": 0,
    "CLASSIFIER_METRIC": "accuracy",
    "REGRESSION_METRIC": "r2",
    "CROSS_VALIDATION": 10,
    "N_JOBS": -1,
    "RANDOM_STATE": 42,
    "TRAIN_TEST_SHUFFLE": True,
    "TEST_SIZE": 0.25,
}

def __init__(self):
self.VERBOSITY = 0
self.CLASSIFIER_METRIC = "accuracy"
self.REGRESSION_METRIC = "r2"
self.CROSS_VALIDATION = 10
self.STYLE_SHEET = MPL_STYLESHEET
self.N_JOBS = -1
self.RANDOM_STATE = 42
self._set_config()
self.LOG = False
self.RUN_DIR = RUN_DIR
self.ESTIMATOR_DIR = ESTIMATOR_DIR
self.LOG = False
self.SHUFFLE = True
self.TEST_SIZE = 0.25

def _set_config(self):
    """Copy every entry of ``default_config`` onto the instance.

    Called from ``__init__`` and ``reset_config`` so both share one
    source of truth for default values.
    """
    for option, default in self.default_config.items():
        setattr(self, option, default)

@property
def default_storage(self):
    """FileStorage rooted at :attr:`ESTIMATOR_DIR`.

    Presumably the default location for persisting estimators —
    confirm against FileStorage's definition, which is not visible here.
    """
    return FileStorage(self.ESTIMATOR_DIR)

def reset_config(self):
    """Reset every configuration attribute to its default value."""
    self._set_config()

def __repr__(self):
attrs = "\n".join(
[
Expand All @@ -44,13 +60,4 @@ def __repr__(self):
return f"<Config: \n{attrs}\n>"


class ConfigGetter:
    """
    Descriptor giving each class that inherits from Model an individual
    config attribute, without relying on the user to override the config
    when they define their class.
    """

    def __get__(self, obj, cls):
        # Lazily create one DefaultConfig per class on first access.
        if cls._config is None:
            cls._config = DefaultConfig()
        return cls._config
# Shared module-level config instance, imported by baseclass and the viz modules.
config = DefaultConfig()
35 changes: 17 additions & 18 deletions src/ml_tooling/plots/viz/baseviz.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,17 @@
)
from ml_tooling.utils import _get_estimator_name
from sklearn.base import is_classifier
from ml_tooling.config import MPL_STYLESHEET, config


class BaseVisualize:
"""
Base class for visualizers
"""

def __init__(self, estimator, data, config):
def __init__(self, estimator, data):
self._estimator = estimator
self._estimator_name = _get_estimator_name(estimator)
self._config = config
self._data = data

@property
Expand All @@ -39,9 +39,9 @@ def default_metric(self):
"""

return (
self._config.CLASSIFIER_METRIC
config.CLASSIFIER_METRIC
if is_classifier(self._estimator)
else self._config.REGRESSION_METRIC
else config.REGRESSION_METRIC
)

def feature_importance(
Expand All @@ -52,7 +52,6 @@ def feature_importance(
bottom_n: Union[int, float] = None,
add_label: bool = True,
n_jobs: int = None,
random_state: int = None,
ax: Axes = None,
**kwargs,
) -> Axes:
Expand Down Expand Up @@ -83,9 +82,6 @@ def feature_importance(
Overwrites N_JOBS from settings. Useful if data is to big to fit
in memory multiple times.

random_state: int
Random state to be used when permuting features,

ax: Axes
Draws graph on passed ax - otherwise creates new ax

Expand All @@ -97,19 +93,19 @@ def feature_importance(
matplotlib.Axes
"""

n_jobs = self._config.N_JOBS if n_jobs is None else n_jobs
n_jobs = config.N_JOBS if n_jobs is None else n_jobs
scoring = self.default_metric if scoring == "default" else scoring
title = f"Feature Importances ({scoring.title()}) - {self._estimator_name}"

with plt.style.context(self._config.STYLE_SHEET):
with plt.style.context(MPL_STYLESHEET):
return plot_feature_importance(
estimator=self._estimator,
x=self._data.x,
y=self._data.y,
scoring=scoring,
n_repeats=n_repeats,
n_jobs=n_jobs,
random_state=random_state,
random_state=config.RANDOM_STATE,
ax=ax,
top_n=top_n,
bottom_n=bottom_n,
Expand All @@ -120,7 +116,7 @@ def feature_importance(

def learning_curve(
self,
cv: int = 5,
cv: int = None,
scoring: str = "default",
n_jobs: int = None,
train_sizes: Sequence[float] = np.linspace(0.1, 1.0, 5),
Expand Down Expand Up @@ -157,9 +153,10 @@ def learning_curve(
"""

title = f"Learning Curve - {self._estimator_name}"
n_jobs = self._config.N_JOBS if n_jobs is None else n_jobs
n_jobs = config.N_JOBS if n_jobs is None else n_jobs
cv = config.CROSS_VALIDATION if cv is None else cv

with plt.style.context(self._config.STYLE_SHEET):
with plt.style.context(MPL_STYLESHEET):
ax = plot_learning_curve(
estimator=self._estimator,
x=self._data.train_x,
Expand All @@ -180,7 +177,7 @@ def validation_curve(
param_name: str,
param_range: Sequence,
n_jobs: int = None,
cv: int = 5,
cv: int = None,
scoring: str = "default",
ax: Axes = None,
**kwargs,
Expand All @@ -205,7 +202,8 @@ def validation_curve(
Number of jobs to use in parallelizing the estimator fitting and scoring

cv: int
Number of CV iterations to run. Uses a :class:`~sklearn.model_selection.StratifiedKFold`
Number of CV iterations to run. Defaults to value in `Model.config`.
Uses a :class:`~sklearn.model_selection.StratifiedKFold`
if `estimator` is a classifier - otherwise a :class:`~sklearn.model_selection.KFold`
is used.

Expand All @@ -224,10 +222,11 @@ def validation_curve(
plt.Axes

"""
n_jobs = self._config.N_JOBS if n_jobs is None else n_jobs
n_jobs = config.N_JOBS if n_jobs is None else n_jobs
cv = config.CROSS_VALIDATION if cv is None else cv
title = f"Validation Curve - {self._estimator_name}"

with plt.style.context(self._config.STYLE_SHEET):
with plt.style.context(MPL_STYLESHEET):
ax = plot_validation_curve(
self._estimator,
x=self._data.train_x,
Expand Down
9 changes: 5 additions & 4 deletions src/ml_tooling/plots/viz/classification_viz.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
)
from ml_tooling.utils import VizError, _classify
from ml_tooling.plots.viz.baseviz import BaseVisualize
from ml_tooling.config import MPL_STYLESHEET


class ClassificationVisualize(BaseVisualize):
Expand Down Expand Up @@ -35,7 +36,7 @@ def confusion_matrix(
matplotlib.Axes
"""

with plt.style.context(self._config.STYLE_SHEET):
with plt.style.context(MPL_STYLESHEET):
title = f"Confusion Matrix - {self._estimator_name}"
y_pred = _classify(self._data.test_x, self._estimator, threshold=threshold)
return plot_confusion_matrix(
Expand All @@ -55,7 +56,7 @@ def roc_curve(self, **kwargs) -> plt.Axes:
if not hasattr(self._estimator, "predict_proba"):
raise VizError("Model must provide a 'predict_proba' method")

with plt.style.context(self._config.STYLE_SHEET):
with plt.style.context(MPL_STYLESHEET):
title = f"ROC AUC - {self._estimator_name}"
y_proba = self._estimator.predict_proba(self._data.test_x)[:, 1]
return plot_roc_auc(self._data.test_y, y_proba, title=title, **kwargs)
Expand All @@ -70,7 +71,7 @@ def lift_curve(self, **kwargs) -> plt.Axes:
-------
matplotlib.Axes
"""
with plt.style.context(self._config.STYLE_SHEET):
with plt.style.context(MPL_STYLESHEET):
title = f"Lift Curve - {self._estimator_name}"
y_proba = self._estimator.predict_proba(self._data.test_x)[:, 1]
return plot_lift_curve(self._data.test_y, y_proba, title=title, **kwargs)
Expand All @@ -95,7 +96,7 @@ def pr_curve(self, **kwargs) -> plt.Axes:
if not hasattr(self._estimator, "predict_proba"):
raise VizError("Estimator must provide a 'predict_proba' method")

with plt.style.context(self._config.STYLE_SHEET):
with plt.style.context(MPL_STYLESHEET):
title = f"Precision-Recall - {self._estimator_name}"
y_proba = self._estimator.predict_proba(self._data.test_x)[:, 1]
return plot_pr_curve(self._data.test_y, y_proba, title=title, **kwargs)
Loading