From bbc63bcdac31e629129011feb685aa50a9b7abff Mon Sep 17 00:00:00 2001 From: Anders Bogsnes Date: Tue, 21 Jul 2020 14:32:30 +0200 Subject: [PATCH 1/3] Passsing all tests --- src/ml_tooling/baseclass.py | 22 ++----- src/ml_tooling/config.py | 47 ++++++++------ src/ml_tooling/plots/viz/baseviz.py | 7 +- .../plots/viz/classification_viz.py | 9 +-- src/ml_tooling/plots/viz/regression_viz.py | 5 +- src/ml_tooling/storage/file.py | 2 +- tests/test_baseclass.py | 18 ++--- tests/test_config.py | 65 +------------------ tests/test_storage.py | 4 +- 9 files changed, 59 insertions(+), 120 deletions(-) diff --git a/src/ml_tooling/baseclass.py b/src/ml_tooling/baseclass.py index 4071793f..81f6257f 100644 --- a/src/ml_tooling/baseclass.py +++ b/src/ml_tooling/baseclass.py @@ -1,16 +1,17 @@ import datetime import pathlib -import joblib -import pandas as pd from contextlib import contextmanager from importlib.resources import path as import_path from typing import Tuple, Optional, Sequence, Union, List, Iterable, Any + +import joblib +import pandas as pd from sklearn.base import is_classifier, is_regressor from sklearn.exceptions import NotFittedError from sklearn.model_selection import check_cv from sklearn.pipeline import Pipeline -from ml_tooling.config import DefaultConfig, ConfigGetter +from ml_tooling.config import config from ml_tooling.data.base_data import Dataset from ml_tooling.logging.logger import create_logger from ml_tooling.metrics.metric import Metrics @@ -40,9 +41,6 @@ class Model: Wrapper class for Estimators """ - _config = None - config = ConfigGetter() - def __init__(self, estimator: Estimator, feature_pipeline: Pipeline = None): """ Parameters @@ -57,6 +55,7 @@ def __init__(self, estimator: Estimator, feature_pipeline: Pipeline = None): self._estimator: Estimator = _validate_estimator(estimator) self.feature_pipeline = feature_pipeline self.result: Optional[ResultType] = None + self.config = config @property def estimator(self): @@ -437,7 +436,7 @@ def score_estimator( if not data.has_validation_set: data.create_train_test( stratify=self.is_classifier, - shuffle=self.config.SHUFFLE, + shuffle=self.config.TRAIN_TEST_SHUFFLE, test_size=self.config.TEST_SIZE, seed=self.config.RANDOM_STATE, ) @@ -665,15 +664,6 @@ def load_production_estimator(cls, module_name: str): estimator = joblib.load(path) return cls(estimator) - @classmethod - def reset_config(cls): - """ - Reset configuration to default - """ - cls._config = DefaultConfig() - - return cls - def __repr__(self): return f"" diff --git a/src/ml_tooling/config.py b/src/ml_tooling/config.py index 99108931..191efbca 100644 --- a/src/ml_tooling/config.py +++ b/src/ml_tooling/config.py @@ -15,24 +15,40 @@ class DefaultConfig: Configuration for a given BaseClass. Configs propagate through each instance """ + default_config = { + "VERBOSITY": 0, + "CLASSIFIER_METRIC": "accuracy", + "REGRESSION_METRIC": "r2", + "CROSS_VALIDATION": 10, + "N_JOBS": -1, + "RANDOM_STATE": 42, + "TRAIN_TEST_SHUFFLE": True, + "TEST_SIZE": 0.25, + } + def __init__(self): - self.VERBOSITY = 0 - self.CLASSIFIER_METRIC = "accuracy" - self.REGRESSION_METRIC = "r2" - self.CROSS_VALIDATION = 10 - self.STYLE_SHEET = MPL_STYLESHEET - self.N_JOBS = -1 - self.RANDOM_STATE = 42 + self._set_config() + self.LOG = False self.RUN_DIR = RUN_DIR self.ESTIMATOR_DIR = ESTIMATOR_DIR - self.LOG = False - self.SHUFFLE = True - self.TEST_SIZE = 0.25 + + def _set_config(self): + self.VERBOSITY = self.default_config["VERBOSITY"] + self.CLASSIFIER_METRIC = self.default_config["CLASSIFIER_METRIC"] + self.REGRESSION_METRIC = self.default_config["REGRESSION_METRIC"] + self.CROSS_VALIDATION = self.default_config["CROSS_VALIDATION"] + self.N_JOBS = self.default_config["N_JOBS"] + self.RANDOM_STATE = self.default_config["RANDOM_STATE"] + self.TRAIN_TEST_SHUFFLE = self.default_config["TRAIN_TEST_SHUFFLE"] + self.TEST_SIZE = self.default_config["TEST_SIZE"] @property def default_storage(self): return FileStorage(self.ESTIMATOR_DIR) + def reset_config(self): + self._set_config() + def __repr__(self): attrs = "\n".join( [ @@ -44,13 +60,4 @@ def __repr__(self): return f"" -class ConfigGetter: - """ - Give each class that inherits from Model an individual config attribute - without relying on the user to overriding the config when they define their class. - """ - - def __get__(self, obj, cls): - if cls._config is None: - cls._config = DefaultConfig() - return cls._config +config = DefaultConfig() diff --git a/src/ml_tooling/plots/viz/baseviz.py b/src/ml_tooling/plots/viz/baseviz.py index 6c774c70..d25f591d 100644 --- a/src/ml_tooling/plots/viz/baseviz.py +++ b/src/ml_tooling/plots/viz/baseviz.py @@ -11,6 +11,7 @@ ) from ml_tooling.utils import _get_estimator_name from sklearn.base import is_classifier +from ml_tooling.config import MPL_STYLESHEET class BaseVisualize: @@ -101,7 +102,7 @@ def feature_importance( scoring = self.default_metric if scoring == "default" else scoring title = f"Feature Importances ({scoring.title()}) - {self._estimator_name}" - with plt.style.context(self._config.STYLE_SHEET): + with plt.style.context(MPL_STYLESHEET): return plot_feature_importance( estimator=self._estimator, x=self._data.x, @@ -159,7 +160,7 @@ def learning_curve( title = f"Learning Curve - {self._estimator_name}" n_jobs = self._config.N_JOBS if n_jobs is None else n_jobs - with plt.style.context(self._config.STYLE_SHEET): + with plt.style.context(MPL_STYLESHEET): ax = plot_learning_curve( estimator=self._estimator, x=self._data.train_x, @@ -227,7 +228,7 @@ def validation_curve( n_jobs = self._config.N_JOBS if n_jobs is None else n_jobs title = f"Validation Curve - {self._estimator_name}" - with plt.style.context(self._config.STYLE_SHEET): + with plt.style.context(MPL_STYLESHEET): ax = plot_validation_curve( self._estimator, x=self._data.train_x, diff --git a/src/ml_tooling/plots/viz/classification_viz.py b/src/ml_tooling/plots/viz/classification_viz.py index e189030b..5094a9c6 100644 --- a/src/ml_tooling/plots/viz/classification_viz.py +++ b/src/ml_tooling/plots/viz/classification_viz.py @@ -8,6 +8,7 @@ ) from ml_tooling.utils import VizError, _classify from ml_tooling.plots.viz.baseviz import BaseVisualize +from ml_tooling.config import MPL_STYLESHEET class ClassificationVisualize(BaseVisualize): @@ -35,7 +36,7 @@ def confusion_matrix( matplotlib.Axes """ - with plt.style.context(self._config.STYLE_SHEET): + with plt.style.context(MPL_STYLESHEET): title = f"Confusion Matrix - {self._estimator_name}" y_pred = _classify(self._data.test_x, self._estimator, threshold=threshold) return plot_confusion_matrix( @@ -55,7 +56,7 @@ def roc_curve(self, **kwargs) -> plt.Axes: if not hasattr(self._estimator, "predict_proba"): raise VizError("Model must provide a 'predict_proba' method") - with plt.style.context(self._config.STYLE_SHEET): + with plt.style.context(MPL_STYLESHEET): title = f"ROC AUC - {self._estimator_name}" y_proba = self._estimator.predict_proba(self._data.test_x)[:, 1] return plot_roc_auc(self._data.test_y, y_proba, title=title, **kwargs) @@ -70,7 +71,7 @@ def lift_curve(self, **kwargs) -> plt.Axes: ------- matplotlib.Axes """ - with plt.style.context(self._config.STYLE_SHEET): + with plt.style.context(MPL_STYLESHEET): title = f"Lift Curve - {self._estimator_name}" y_proba = self._estimator.predict_proba(self._data.test_x)[:, 1] return plot_lift_curve(self._data.test_y, y_proba, title=title, **kwargs) @@ -95,7 +96,7 @@ def pr_curve(self, **kwargs) -> plt.Axes: if not hasattr(self._estimator, "predict_proba"): raise VizError("Estimator must provide a 'predict_proba' method") - with plt.style.context(self._config.STYLE_SHEET): + with plt.style.context(MPL_STYLESHEET): title = f"Precision-Recall - {self._estimator_name}" y_proba = self._estimator.predict_proba(self._data.test_x)[:, 1] return plot_pr_curve(self._data.test_y, y_proba, title=title, **kwargs) diff --git a/src/ml_tooling/plots/viz/regression_viz.py b/src/ml_tooling/plots/viz/regression_viz.py index 1b50010f..657b8488 100644 --- a/src/ml_tooling/plots/viz/regression_viz.py +++ b/src/ml_tooling/plots/viz/regression_viz.py @@ -2,6 +2,7 @@ from ml_tooling.plots import plot_residuals, plot_prediction_error from ml_tooling.plots.viz.baseviz import BaseVisualize +from ml_tooling.config import MPL_STYLESHEET class RegressionVisualize(BaseVisualize): @@ -19,7 +20,7 @@ def residuals(self, **kwargs) -> plt.Axes: matplotlib.Axes Plot of the estimator's residuals """ - with plt.style.context(self._config.STYLE_SHEET): + with plt.style.context(MPL_STYLESHEET): title = f"Residual Plot - {self._estimator_name}" y_pred = self._estimator.predict(self._data.test_x) return plot_residuals(self._data.test_y, y_pred, title, **kwargs) @@ -35,7 +36,7 @@ def prediction_error(self, **kwargs) -> plt.Axes: Plot of the estimator's prediction error """ - with plt.style.context(self._config.STYLE_SHEET): + with plt.style.context(MPL_STYLESHEET): title = f"Prediction Error - {self._estimator_name}" y_pred = self._estimator.predict(self._data.test_x) return plot_prediction_error( diff --git a/src/ml_tooling/storage/file.py b/src/ml_tooling/storage/file.py index f799e4e3..1bb9ef6f 100644 --- a/src/ml_tooling/storage/file.py +++ b/src/ml_tooling/storage/file.py @@ -15,7 +15,7 @@ class FileStorage(Storage): def __init__(self, dir_path: Pathlike = None): self.dir_path = Path.cwd() if dir_path is None else Path(dir_path) - if not self.dir_path.is_dir(): + if self.dir_path.is_file(): raise MLToolingError( f"dir_path is {self.dir_path} which is not a directory" ) diff --git a/tests/test_baseclass.py b/tests/test_baseclass.py index 6d0fbcea..c5363c87 100644 --- a/tests/test_baseclass.py +++ b/tests/test_baseclass.py @@ -75,7 +75,7 @@ def test_default_metric_getter_works_as_expected_classifier(self): assert rf.config.CLASSIFIER_METRIC == "fowlkes_mallows_score" assert rf.config.REGRESSION_METRIC == "r2" assert rf.default_metric == "fowlkes_mallows_score" - rf.reset_config() + rf.config.reset_config() def test_default_metric_getter_works_as_expected_regressor(self): linreg = Model(LinearRegression()) @@ -86,7 +86,7 @@ def test_default_metric_getter_works_as_expected_regressor(self): assert linreg.config.CLASSIFIER_METRIC == "accuracy" assert linreg.config.REGRESSION_METRIC == "neg_mean_squared_error" assert linreg.default_metric == "neg_mean_squared_error" - linreg.reset_config() + linreg.config.reset_config() def test_default_metric_works_as_expected_without_pipeline(self): rf = Model(RandomForestClassifier(n_estimators=10)) @@ -97,8 +97,8 @@ def test_default_metric_works_as_expected_without_pipeline(self): linreg.config.REGRESSION_METRIC = "neg_mean_squared_error" assert "fowlkes_mallows_score" == rf.default_metric assert "neg_mean_squared_error" == linreg.default_metric - rf.reset_config() - linreg.reset_config() + rf.config.reset_config() + linreg.config.reset_config() def test_default_metric_works_as_expected_with_pipeline( self, pipeline_logistic: Pipeline, pipeline_linear: Pipeline @@ -111,8 +111,8 @@ def test_default_metric_works_as_expected_with_pipeline( linreg.config.REGRESSION_METRIC = "neg_mean_squared_error" assert "fowlkes_mallows_score" == logreg.default_metric assert "neg_mean_squared_error" == linreg.default_metric - logreg.reset_config() - linreg.reset_config() + logreg.config.reset_config() + linreg.config.reset_config() def test_regression_model_can_be_saved( self, classifier: Model, tmp_path: pathlib.Path, train_iris_dataset @@ -475,7 +475,7 @@ def test_score_estimator_creates_train_test_data_with_changed_config( model = Model(LinearRegression()) model.config.RANDOM_STATE = 1 model.config.TEST_SIZE = 0.5 - model.config.SHUFFLE = False + model.config.TRAIN_TEST_SHUFFLE = False data = boston_dataset() model.score_estimator(data) @@ -486,7 +486,7 @@ def test_score_estimator_creates_train_test_data_with_changed_config( assert np.array_equal(data.test_y, test.test_y) pd.testing.assert_frame_equal(data.train_x, test.train_x) assert np.array_equal(data.train_y, test.train_y) - model.reset_config() + model.config.reset_config() def test_score_estimator_creates_train_test_data_with_changed_config_and_classification_data( self, iris_dataset @@ -504,7 +504,7 @@ def test_score_estimator_creates_train_test_data_with_changed_config_and_classif assert np.array_equal(data.test_y, test.test_y) pd.testing.assert_frame_equal(data.train_x, test.train_x) assert np.array_equal(data.train_y, test.train_y) - model.reset_config() + model.config.reset_config() def test_can_score_estimator_with_specified_metric(self, train_iris_dataset): model = Model(LogisticRegression(solver="liblinear")) diff --git a/tests/test_config.py b/tests/test_config.py index 00a5b402..0b0ee480 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,53 +1,7 @@ -from ml_tooling import Model from ml_tooling.config import DefaultConfig -from sklearn.linear_model import LinearRegression class TestConfig: - def test_config_is_set_globally(self, pipeline_dummy_classifier, pipeline_linear): - class TestModel(Model): - @classmethod - def setup_estimator(cls): - pass - - def get_prediction_data(self, *args): - pass - - def get_training_data(self): - pass - - TestModel.reset_config() - - assert TestModel.config.N_JOBS == -1 - - model = TestModel(pipeline_dummy_classifier) - assert model.config.N_JOBS == -1 - - TestModel.config.N_JOBS = 1 - assert TestModel.config.N_JOBS == 1 - assert model.config.N_JOBS == 1 - - new_model = TestModel(pipeline_dummy_classifier) - assert new_model.config.N_JOBS == 1 - - def test_can_change_config(self): - class SomeModel(Model): - @classmethod - def setup_estimator(cls): - pass - - def get_training_data(self): - pass - - def get_prediction_data(self, *args): - pass - - SomeModel.reset_config() - test_model = SomeModel(LinearRegression()) - assert 10 == test_model.config.CROSS_VALIDATION - test_model.config.CROSS_VALIDATION = 2 - assert test_model.config.CROSS_VALIDATION == 2 - def test_config_repr_works(self): config = DefaultConfig() for key in [ @@ -55,7 +9,6 @@ def test_config_repr_works(self): "CLASSIFIER_METRIC", "REGRESSION_METRIC", "CROSS_VALIDATION", - "STYLE_SHEET", "N_JOBS", "RANDOM_STATE", ]: @@ -69,23 +22,7 @@ def test_from_same_class_share_config( assert log.config.CLASSIFIER_METRIC == "accuracy" log.config.CLASSIFIER_METRIC = "fowlkes_mallows_score" assert rf.config.CLASSIFIER_METRIC == "fowlkes_mallows_score" - - def test_from_different_classes_do_not_share_config( - self, base, pipeline_logistic, pipeline_forest_classifier - ): - class NoModel(Model): - def get_prediction_data(self, idx): - pass - - def get_training_data(self): - pass - - log = base(pipeline_logistic) - rf = NoModel(pipeline_forest_classifier) - assert log.config.CLASSIFIER_METRIC == "accuracy" - log.config.CLASSIFIER_METRIC = "fowlkes_mallows_score" - assert rf.config.CLASSIFIER_METRIC == "accuracy" - assert log.config.CLASSIFIER_METRIC == "fowlkes_mallows_score" + log.config.reset_config() def test_config_default_storage_points_to_cwd(self, base, tmp_path): base.config.ESTIMATOR_DIR = tmp_path diff --git a/tests/test_storage.py b/tests/test_storage.py index e9c799eb..f219e370 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -90,8 +90,10 @@ def test_can_get_list_of_paths_and_load_from_output( assert estimators[-1] == paths[3] def test_raise_when_non_dir(self, classifier: Model, tmp_path: pathlib.Path): + path = tmp_path / "file.txt" + path.write_text("test") with pytest.raises(MLToolingError, match="which is not a directory"): - _ = FileStorage("/not/a/dir.file") + FileStorage(path) def test_cannot_instantiate_an_abstract_baseclass(self): with pytest.raises(TypeError): From 440fe0ce8dcb43fe6647109a4a1bdcee4c0dca73 Mon Sep 17 00:00:00 2001 From: Anders Bogsnes Date: Tue, 21 Jul 2020 14:50:34 +0200 Subject: [PATCH 2/3] Updated config implementation Now uses an instance global with a default_config. Allows users to modify globally, while still allowing access from modules, so it doesn't need to be passed from the Model object --- src/ml_tooling/plots/viz/baseviz.py | 30 ++++++++++++++--------------- src/ml_tooling/result/result.py | 6 ++---- src/ml_tooling/utils.py | 4 ++-- tests/test_visualize.py | 28 ++++++++++----------------- 4 files changed, 28 insertions(+), 40 deletions(-) diff --git a/src/ml_tooling/plots/viz/baseviz.py b/src/ml_tooling/plots/viz/baseviz.py index d25f591d..befb228e 100644 --- a/src/ml_tooling/plots/viz/baseviz.py +++ b/src/ml_tooling/plots/viz/baseviz.py @@ -11,7 +11,7 @@ ) from ml_tooling.utils import _get_estimator_name from sklearn.base import is_classifier -from ml_tooling.config import MPL_STYLESHEET +from ml_tooling.config import MPL_STYLESHEET, config class BaseVisualize: @@ -19,10 +19,9 @@ class BaseVisualize: Base class for visualizers """ - def __init__(self, estimator, data, config): + def __init__(self, estimator, data): self._estimator = estimator self._estimator_name = _get_estimator_name(estimator) - self._config = config self._data = data @property @@ -40,9 +39,9 @@ def default_metric(self): """ return ( - self._config.CLASSIFIER_METRIC + config.CLASSIFIER_METRIC if is_classifier(self._estimator) - else self._config.REGRESSION_METRIC + else config.REGRESSION_METRIC ) def feature_importance( @@ -53,7 +52,6 @@ def feature_importance( bottom_n: Union[int, float] = None, add_label: bool = True, n_jobs: int = None, - random_state: int = None, ax: Axes = None, **kwargs, ) -> Axes: @@ -84,9 +82,6 @@ def feature_importance( Overwrites N_JOBS from settings. Useful if data is to big to fit in memory multiple times. - random_state: int - Random state to be used when permuting features, - ax: Axes Draws graph on passed ax - otherwise creates new ax @@ -98,7 +93,7 @@ def feature_importance( matplotlib.Axes """ - n_jobs = self._config.N_JOBS if n_jobs is None else n_jobs + n_jobs = config.N_JOBS if n_jobs is None else n_jobs scoring = self.default_metric if scoring == "default" else scoring title = f"Feature Importances ({scoring.title()}) - {self._estimator_name}" @@ -110,7 +105,7 @@ def feature_importance( scoring=scoring, n_repeats=n_repeats, n_jobs=n_jobs, - random_state=random_state, + random_state=config.RANDOM_STATE, ax=ax, top_n=top_n, bottom_n=bottom_n, @@ -121,7 +116,7 @@ def feature_importance( def learning_curve( self, - cv: int = 5, + cv: int = None, scoring: str = "default", n_jobs: int = None, train_sizes: Sequence[float] = np.linspace(0.1, 1.0, 5), @@ -158,7 +153,8 @@ def learning_curve( """ title = f"Learning Curve - {self._estimator_name}" - n_jobs = self._config.N_JOBS if n_jobs is None else n_jobs + n_jobs = config.N_JOBS if n_jobs is None else n_jobs + cv = config.CROSS_VALIDATION if cv is None else cv with plt.style.context(MPL_STYLESHEET): ax = plot_learning_curve( @@ -181,7 +177,7 @@ def validation_curve( param_name: str, param_range: Sequence, n_jobs: int = None, - cv: int = 5, + cv: int = None, scoring: str = "default", ax: Axes = None, **kwargs, @@ -206,7 +202,8 @@ def validation_curve( Number of jobs to use in parallelizing the estimator fitting and scoring cv: int - Number of CV iterations to run. Uses a :class:`~sklearn.model_selection.StratifiedKFold` + Number of CV iterations to run. Defaults to value in `Model.config`. + Uses a :class:`~sklearn.model_selection.StratifiedKFold` if`estimator` is a classifier - otherwise a :class:`~sklearn.model_selection.KFold` is used. @@ -225,7 +222,8 @@ def validation_curve( plt.Axes """ - n_jobs = self._config.N_JOBS if n_jobs is None else n_jobs + n_jobs = config.N_JOBS if n_jobs is None else n_jobs + cv = config.CROSS_VALIDATION if cv is None else cv title = f"Validation Curve - {self._estimator_name}" with plt.style.context(MPL_STYLESHEET): diff --git a/src/ml_tooling/result/result.py b/src/ml_tooling/result/result.py index 173a7d0c..4e3c1cfb 100644 --- a/src/ml_tooling/result/result.py +++ b/src/ml_tooling/result/result.py @@ -32,10 +32,8 @@ class Result: @property def plot(self): if self.model.is_classifier: - return ClassificationVisualize( - self.model.estimator, self.data, self.model.config - ) - return RegressionVisualize(self.model.estimator, self.data, self.model.config) + return ClassificationVisualize(self.model.estimator, self.data) + return RegressionVisualize(self.model.estimator, self.data) @classmethod def from_model( diff --git a/src/ml_tooling/utils.py b/src/ml_tooling/utils.py index d283f9dc..659eff5f 100644 --- a/src/ml_tooling/utils.py +++ b/src/ml_tooling/utils.py @@ -118,7 +118,7 @@ def _get_estimator_name(clf: Estimator) -> str: return class_name -def listify(collection) -> list: +def listify(collection: Any) -> List: """ Takes a given collection and returns a list of the elements, handling strings correctly @@ -140,7 +140,7 @@ def listify(collection) -> list: return collection -def _validate_estimator(estimator: Estimator): +def _validate_estimator(estimator: Estimator) -> Estimator: """ Ensures that estimator is a valid estimator - either a :class:`~sklearn.base.BaseEstimator` or a :class:`~sklearn.pipeline.Pipeline` with a :class:`~sklearn.base.BaseEstimator` diff --git a/tests/test_visualize.py b/tests/test_visualize.py index ad906895..9c1bbfe5 100644 --- a/tests/test_visualize.py +++ b/tests/test_visualize.py @@ -129,7 +129,7 @@ def test_feature_importance_plots_can_be_given_an_ax(self, classifier: Model): plt.close() def test_feature_importance_plots_have_correct_data(self, classifier: Model): - ax = classifier.result.plot.feature_importance(random_state=42) + ax = classifier.result.plot.feature_importance() expected = {"0.04", "0.06", "0.10", "-0.03"} assert {text.get_text() for text in ax.texts} == expected @@ -161,7 +161,7 @@ def test_feature_importance_plots_have_no_labels_if_value_is_false( def test_feature_importance_plots_have_correct_labels_when_top_n_is_set( self, classifier: Model ): - ax = classifier.result.plot.feature_importance(top_n=2, random_state=42) + ax = classifier.result.plot.feature_importance(top_n=2) assert 2 == len(ax.texts) assert {text.get_text() for text in ax.texts} == {"0.10", "0.06"} @@ -179,7 +179,7 @@ def test_feature_importance_plots_have_correct_labels_when_top_n_is_set( def test_feature_importance_plots_have_correct_labels_when_top_n_is_percent( self, classifier: Model ): - ax = classifier.result.plot.feature_importance(top_n=0.2, random_state=42) + ax = classifier.result.plot.feature_importance(top_n=0.2) assert len(ax.texts) == 1 assert {text.get_text() for text in ax.texts} == {"0.10"} @@ -197,7 +197,7 @@ def test_feature_importance_plots_have_correct_labels_when_top_n_is_percent( def test_feature_importance_plots_have_correct_labels_when_bottom_n_is_int( self, classifier: Model ): - ax = classifier.result.plot.feature_importance(bottom_n=2, random_state=42) + ax = classifier.result.plot.feature_importance(bottom_n=2) assert len(ax.texts) == 2 assert {text.get_text() for text in ax.texts} == {"0.04", "-0.03"} @@ -215,7 +215,7 @@ def test_feature_importance_plots_have_correct_labels_when_bottom_n_is_int( def test_feature_importance_plots_have_correct_labels_when_bottom_n_is_percent( self, classifier: Model ): - ax = classifier.result.plot.feature_importance(bottom_n=0.2, random_state=42) + ax = classifier.result.plot.feature_importance(bottom_n=0.2) assert len(ax.texts) == 1 assert {text.get_text() for text in ax.texts} == {"-0.03"} @@ -233,9 +233,7 @@ def test_feature_importance_plots_have_correct_labels_when_bottom_n_is_percent( def test_feature_importance_plots_correct_if_top_n_is_int_and_bottom_n_is_int( self, classifier: Model ): - ax = classifier.result.plot.feature_importance( - top_n=1, bottom_n=1, random_state=42 - ) + ax = classifier.result.plot.feature_importance(top_n=1, bottom_n=1) assert len(ax.texts) == 2 assert {text.get_text() for text in ax.texts} == {"0.10", "-0.03"} assert ax.get_ylabel() == "Feature Labels" @@ -252,9 +250,7 @@ def test_feature_importance_plots_correct_if_top_n_is_int_and_bottom_n_is_int( def test_feature_importance_plots_correct_when_top_n_is_int_and_bottom_n_is_percent( self, classifier: Model ): - ax = classifier.result.plot.feature_importance( - top_n=1, bottom_n=0.2, random_state=42 - ) + ax = classifier.result.plot.feature_importance(top_n=1, bottom_n=0.2) assert 2 == len(ax.texts) assert {text.get_text() for text in ax.texts} == {"0.10", "-0.03"} assert ax.get_ylabel() == "Feature Labels" @@ -271,9 +267,7 @@ def test_feature_importance_plots_correct_when_top_n_is_int_and_bottom_n_is_perc def test_feature_importance_plots_correct_when_top_n_is_percent_and_bottom_n_is_int( self, classifier: Model ): - ax = classifier.result.plot.feature_importance( - top_n=0.2, bottom_n=1, random_state=42 - ) + ax = classifier.result.plot.feature_importance(top_n=0.2, bottom_n=1) assert len(ax.texts) == 2 assert {text.get_text() for text in ax.texts} == {"-0.03", "0.10"} assert ax.get_ylabel() == "Feature Labels" @@ -340,9 +334,7 @@ def load_prediction_data(self): plt.close() def test_can_use_different_scoring_metrics(self, classifier: Model): - ax = classifier.result.plot.feature_importance( - scoring="roc_auc", random_state=42 - ) + ax = classifier.result.plot.feature_importance(scoring="roc_auc") assert ( ax.title.get_text() == "Feature Importances (Roc_Auc) - LogisticRegression" ) @@ -516,7 +508,7 @@ def test_learning_curve_plots_can_be_given_an_ax(self, classifier: Model): plt.close() def test_learning_curve_plots_have_correct_elements(self, classifier: Model): - test_ax = classifier.result.plot.learning_curve() + test_ax = classifier.result.plot.learning_curve(cv=5) assert test_ax.title.get_text() == "Learning Curve - LogisticRegression" assert test_ax.get_ylabel() == "Accuracy Score" assert test_ax.get_xlabel() == "Number of Examples Used" From ce92db11b5a3430847d5d775d04054dd2bfe68a6 Mon Sep 17 00:00:00 2001 From: Anders Bogsnes Date: Tue, 21 Jul 2020 15:00:30 +0200 Subject: [PATCH 3/3] Updated CHANGELOG and docs --- CHANGELOG.md | 1 + docs/config.inc.rst | 47 ++++++++++++++-------------------------- src/ml_tooling/config.py | 2 +- 3 files changed, 18 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5240bfda..b826eee8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ combined feature_pipeline + estimator Pipeline - Can pass a feature pipeline to `Dataset.plot` methods, to apply preprocessing before visualization +- New config implementation. If you need to reset the configuration, you should use `Model.config.reset_config()` # v0.10.3 - Fixed typehints in Dataset diff --git a/docs/config.inc.rst b/docs/config.inc.rst index 9e740717..e17e735a 100644 --- a/docs/config.inc.rst +++ b/docs/config.inc.rst @@ -4,56 +4,41 @@ Config ------ All configuration options available -:class: ml_tooling.config.DefaultConfig +.. autoclass:: ml_tooling.config.DefaultConfig :attr:`VERBOSITY` = 0 The level of verbosity from output - :attr:`CLASSIFIER_METRIC` = 'accuracy' - - Default metric for classifiers + Default metric for classifiers :attr:`REGRESSION_METRIC` = 'r2' - - Default metric for regressions + Default metric for regressions :attr:`CROSS_VALIDATION` = 10 - - Default Number of cross validation folds to use - - :attr:`STYLE_SHEET` = 'almbrand.mplstyle' - - Default style sheet to use for plots + Default Number of cross validation folds to use :attr:`N_JOBS` = -1 - - Default number of cores to use when doing multiprocessing. - -1 means use all available + Default number of cores to use when doing multiprocessing. + -1 means use all available :attr:`RANDOM_STATE` = 42 - - Default random state seed for all functions involving randomness + Default random state seed for all functions involving randomness :attr:`RUN_DIR` = './runs' + Default folder to store run logging files - Default folder to store run logging files - - :attr:`MODEL_DIR` = './models' - - Default folder to store pickled models in + :attr:`ESTIMATOR_DIR` = './models' + Default folder to store pickled models in :attr:`LOG` = False + Toggles whether or not to log runs to a file. Set to True if you + want every run to be logged, else use the :meth:`~ml_tooling.baseclass.ModelData.log` + context manager - Toggles whether or not to log runs to a file. Set to True if you - want every run to be logged, else use the :meth:`~ml_tooling.baseclass.ModelData.log` - context manager - - :attr:`SHUFFLE` = True - - Default whether or not to shuffle data for test set + :attr:`TRAIN_TEST_SHUFFLE` = True + Default whether or not to shuffle data for test set :attr:`TEST_SIZE` = 0.25 - - Default percentage of data that will be part of the test set + Default percentage of data that will be part of the test set diff --git a/src/ml_tooling/config.py b/src/ml_tooling/config.py index 191efbca..3559cdf7 100644 --- a/src/ml_tooling/config.py +++ b/src/ml_tooling/config.py @@ -12,7 +12,7 @@ class DefaultConfig: """ - Configuration for a given BaseClass. Configs propagate through each instance + Configuration for Models """ default_config = {