From 71cc926516f30791cba77522518e8e36193fa505 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Sat, 3 Dec 2022 17:33:08 +0100 Subject: [PATCH 1/8] added all_estimators in ObjectNode trusted --- skops/io/_general.py | 4 ++-- skops/io/_trusted_types.py | 8 ++++++++ skops/io/tests/test_audit.py | 22 +++++++++++++--------- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/skops/io/_general.py b/skops/io/_general.py index 7f27fbcb..126bcfaf 100644 --- a/skops/io/_general.py +++ b/skops/io/_general.py @@ -8,7 +8,7 @@ import numpy as np from ._audit import Node, get_tree -from ._trusted_types import PRIMITIVE_TYPE_NAMES +from ._trusted_types import PRIMITIVE_TYPE_NAMES, SKLEARN_ESTIMATOR_TYPE_NAMES from ._utils import ( LoadContext, SaveContext, @@ -383,7 +383,7 @@ def __init__( self.children = {"attrs": attrs} # TODO: what do we trust? - self.trusted = self._get_trusted(trusted, []) + self.trusted = self._get_trusted(trusted, default=SKLEARN_ESTIMATOR_TYPE_NAMES) def _construct(self): cls = gettype(self.module_name, self.class_name) diff --git a/skops/io/_trusted_types.py b/skops/io/_trusted_types.py index e3c38ffd..abc780bd 100644 --- a/skops/io/_trusted_types.py +++ b/skops/io/_trusted_types.py @@ -1,3 +1,11 @@ +from sklearn.utils import all_estimators + +from ._utils import get_type_name + PRIMITIVES_TYPES = [int, float, str, bool] PRIMITIVE_TYPE_NAMES = ["builtins." 
+ t.__name__ for t in PRIMITIVES_TYPES] + +SKLEARN_ESTIMATOR_TYPE_NAMES = [ + get_type_name(estimator_class) for _, estimator_class in all_estimators() +] diff --git a/skops/io/tests/test_audit.py b/skops/io/tests/test_audit.py index a1ae0188..4b35ca85 100644 --- a/skops/io/tests/test_audit.py +++ b/skops/io/tests/test_audit.py @@ -146,6 +146,18 @@ def __init__(self): assert not hasattr(temp, "b") +def test_sklearn_trusted_set(): + clf = Pipeline( + [ + ("scaler", StandardScaler()), + ("clf", LogisticRegression(random_state=0, solver="liblinear")), + ] + ) + + untrusted = get_untrusted_types(data=dumps(clf)) + assert len(untrusted) == 0 + + def test_complex_pipeline_untrusted_set(): # fmt: off clf = Pipeline([ @@ -162,12 +174,4 @@ def test_complex_pipeline_untrusted_set(): untrusted = get_untrusted_types(data=dumps(clf)) type_names = [x.split(".")[-1] for x in untrusted] - assert type_names == [ - "sqrt", - "square", - "LogisticRegression", - "FeatureUnion", - "Pipeline", - "StandardScaler", - "FunctionTransformer", - ] + assert type_names == ["sqrt", "square"] From 0e600b81c78e3ce9ec547695c28789c29536eb4a Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Sat, 3 Dec 2022 19:06:11 +0100 Subject: [PATCH 2/8] added simple sanity check --- skops/io/_trusted_types.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skops/io/_trusted_types.py b/skops/io/_trusted_types.py index abc780bd..1ef1b826 100644 --- a/skops/io/_trusted_types.py +++ b/skops/io/_trusted_types.py @@ -7,5 +7,7 @@ PRIMITIVE_TYPE_NAMES = ["builtins." 
+ t.__name__ for t in PRIMITIVES_TYPES] SKLEARN_ESTIMATOR_TYPE_NAMES = [ - get_type_name(estimator_class) for _, estimator_class in all_estimators() + get_type_name(estimator_class) + for _, estimator_class in all_estimators() + if get_type_name(estimator_class).startswith("sklearn.") ] From d12564f12fb0484e4e52ee805e9aa9cf82625a10 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Sun, 4 Dec 2022 10:55:13 +0100 Subject: [PATCH 3/8] testing all estimators in test_sklearn_trusted_types --- skops/io/tests/test_audit.py | 19 +++-- skops/io/tests/test_persist.py | 134 ++---------------------------- skops/io/tests/testing_utils.py | 139 ++++++++++++++++++++++++++++++++ 3 files changed, 154 insertions(+), 138 deletions(-) create mode 100644 skops/io/tests/testing_utils.py diff --git a/skops/io/tests/test_audit.py b/skops/io/tests/test_audit.py index 4b35ca85..be29e89e 100644 --- a/skops/io/tests/test_audit.py +++ b/skops/io/tests/test_audit.py @@ -9,11 +9,13 @@ from sklearn.linear_model import LogisticRegression from sklearn.pipeline import FeatureUnion, Pipeline from sklearn.preprocessing import FunctionTransformer, StandardScaler +from sklearn.utils.estimator_checks import _get_check_estimator_ids from skops.io import dumps, get_untrusted_types from skops.io._audit import Node, audit_tree, check_type, get_tree, temp_setattr from skops.io._general import DictNode, dict_get_state from skops.io._utils import LoadContext, SaveContext, gettype +from skops.io.tests.testing_utils import get_tested_estimators class CustomType: @@ -146,16 +148,13 @@ def __init__(self): assert not hasattr(temp, "b") -def test_sklearn_trusted_set(): - clf = Pipeline( - [ - ("scaler", StandardScaler()), - ("clf", LogisticRegression(random_state=0, solver="liblinear")), - ] - ) - - untrusted = get_untrusted_types(data=dumps(clf)) - assert len(untrusted) == 0 +@pytest.mark.parametrize( + "estimator", get_tested_estimators(), ids=_get_check_estimator_ids +) 
+def test_sklearn_trusted_types(estimator): + untrusted_types = get_untrusted_types(data=dumps(estimator)) + sklearn_untrusted_types = [t for t in untrusted_types if t.startswith("skelarn.")] + assert len(sklearn_untrusted_types) == 0 def test_complex_pipeline_untrusted_set(): diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py index f833620a..fbff2844 100644 --- a/skops/io/tests/test_persist.py +++ b/skops/io/tests/test_persist.py @@ -5,37 +5,24 @@ import sys import warnings from collections import Counter -from functools import partial, wraps +from functools import wraps from pathlib import Path from zipfile import ZipFile import joblib import numpy as np import pytest -from scipy import sparse, special +from scipy import sparse from sklearn.base import BaseEstimator, is_regressor -from sklearn.compose import ColumnTransformer from sklearn.datasets import load_sample_images, make_classification, make_regression -from sklearn.decomposition import SparseCoder from sklearn.exceptions import SkipTestWarning from sklearn.experimental import enable_halving_search_cv # noqa from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import ( - GridSearchCV, - GroupKFold, - HalvingGridSearchCV, - HalvingRandomSearchCV, - KFold, - RandomizedSearchCV, - ShuffleSplit, - check_cv, -) -from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor +from sklearn.model_selection import GroupKFold, KFold, ShuffleSplit, check_cv from sklearn.pipeline import FeatureUnion, Pipeline from sklearn.preprocessing import ( FunctionTransformer, MinMaxScaler, - Normalizer, PolynomialFeatures, StandardScaler, ) @@ -58,10 +45,7 @@ from skops.io._sklearn import UNSUPPORTED_TYPES from skops.io._utils import LoadContext, SaveContext, _get_state, get_state from skops.io.exceptions import UnsupportedTypeException - -# Default settings for X -N_SAMPLES = 50 -N_FEATURES = 20 +from skops.io.tests.testing_utils import N_FEATURES, N_SAMPLES, 
get_tested_estimators # TODO: Investigate why that seems to be an issue on MacOS (only observed with # Python 3.8) @@ -122,112 +106,6 @@ def wrapper(state, load_context, trusted): NODE_TYPE_MAPPING[key] = debug_get_tree(method) -def _tested_estimators(type_filter=None): - for name, Estimator in all_estimators(type_filter=type_filter): - if Estimator in UNSUPPORTED_TYPES: - continue - try: - # suppress warnings here for skipped estimators. - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - category=SkipTestWarning, - message="Can't instantiate estimator", - ) - estimator = _construct_instance(Estimator) - # with the kind of data we pass, it needs to be 1 for the few - # estimators which have this. - if "n_components" in estimator.get_params(): - estimator.set_params(n_components=1) - # Then n_best needs to be <= n_components - if "n_best" in estimator.get_params(): - estimator.set_params(n_best=1) - if "patch_size" in estimator.get_params(): - # set patch size to fix PatchExtractor test. 
- estimator.set_params(patch_size=(3, 3)) - except SkipTest: - continue - - yield estimator - - # nested Pipeline & FeatureUnion - # fmt: off - yield Pipeline([ - ("features", FeatureUnion([ - ("scaler", StandardScaler()), - ("scaled-poly", Pipeline([ - ("polys", FeatureUnion([ - ("poly1", PolynomialFeatures()), - ("poly2", PolynomialFeatures(degree=3, include_bias=False)) - ])), - ("scale", MinMaxScaler()), - ])), - ])), - ("clf", LogisticRegression(random_state=0, solver="liblinear")), - ]) - # fmt: on - - # FunctionTransformer with numpy functions - yield FunctionTransformer( - func=np.sqrt, - inverse_func=np.square, - ) - - # FunctionTransformer with scipy functions - problem is that they look like - # numpy ufuncs - yield FunctionTransformer( - func=special.erf, - inverse_func=special.erfinv, - ) - - # partial functions should be supported - yield FunctionTransformer( - func=partial(np.add, 10), - inverse_func=partial(np.add, -10), - ) - - yield KNeighborsClassifier(algorithm="kd_tree") - yield KNeighborsRegressor(algorithm="ball_tree") - - yield ColumnTransformer( - [ - ("norm1", Normalizer(norm="l1"), [0]), - ("norm2", Normalizer(norm="l1"), [1, 2]), - ("norm3", Normalizer(norm="l1"), [True] + (N_FEATURES - 1) * [False]), - ("norm4", Normalizer(norm="l1"), np.array([1, 2])), - ("norm5", Normalizer(norm="l1"), slice(3)), - ("norm6", Normalizer(norm="l1"), slice(-10, -3, 2)), - ], - ) - - yield GridSearchCV( - LogisticRegression(random_state=0, solver="liblinear"), - {"C": [1, 2, 3, 4, 5]}, - ) - - yield HalvingGridSearchCV( - LogisticRegression(random_state=0, solver="liblinear"), - {"C": [1, 2, 3, 4, 5]}, - ) - - yield HalvingRandomSearchCV( - LogisticRegression(random_state=0, solver="liblinear"), - {"C": [1, 2, 3, 4, 5]}, - ) - - yield RandomizedSearchCV( - LogisticRegression(random_state=0, solver="liblinear"), - {"C": [1, 2, 3, 4, 5]}, - n_iter=3, - ) - - dictionary = np.random.randint(-2, 3, size=(5, N_FEATURES)).astype(float) - yield SparseCoder( - 
dictionary=dictionary, - transform_algorithm="lasso_lars", - ) - - def _unsupported_estimators(type_filter=None): for name, Estimator in all_estimators(type_filter=type_filter): if Estimator not in UNSUPPORTED_TYPES: @@ -398,7 +276,7 @@ def assert_params_equal(params1, params2): @pytest.mark.parametrize( - "estimator", _tested_estimators(), ids=_get_check_estimator_ids + "estimator", get_tested_estimators(), ids=_get_check_estimator_ids ) def test_can_persist_non_fitted(estimator): """Check that non-fitted estimators can be persisted.""" @@ -466,7 +344,7 @@ def get_input(estimator): @pytest.mark.parametrize( - "estimator", _tested_estimators(), ids=_get_check_estimator_ids + "estimator", get_tested_estimators(), ids=_get_check_estimator_ids ) def test_can_persist_fitted(estimator, request): """Check that fitted estimators can be persisted and return the right results.""" diff --git a/skops/io/tests/testing_utils.py b/skops/io/tests/testing_utils.py new file mode 100644 index 00000000..48e66dd0 --- /dev/null +++ b/skops/io/tests/testing_utils.py @@ -0,0 +1,139 @@ +import warnings +from functools import partial + +import numpy as np +from scipy import special +from sklearn.compose import ColumnTransformer +from sklearn.decomposition import SparseCoder +from sklearn.exceptions import SkipTestWarning +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import ( + GridSearchCV, + HalvingGridSearchCV, + HalvingRandomSearchCV, + RandomizedSearchCV, +) +from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor +from sklearn.pipeline import FeatureUnion, Pipeline +from sklearn.preprocessing import ( + FunctionTransformer, + MinMaxScaler, + Normalizer, + PolynomialFeatures, + StandardScaler, +) +from sklearn.utils import all_estimators +from sklearn.utils._testing import SkipTest +from sklearn.utils.estimator_checks import _construct_instance + +from skops.io._sklearn import UNSUPPORTED_TYPES + +# Default settings for X +N_SAMPLES 
= 50 +N_FEATURES = 20 + + +def get_tested_estimators(type_filter=None): + for name, Estimator in all_estimators(type_filter=type_filter): + if Estimator in UNSUPPORTED_TYPES: + continue + try: + # suppress warnings here for skipped estimators. + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + category=SkipTestWarning, + message="Can't instantiate estimator", + ) + estimator = _construct_instance(Estimator) + # with the kind of data we pass, it needs to be 1 for the few + # estimators which have this. + if "n_components" in estimator.get_params(): + estimator.set_params(n_components=1) + # Then n_best needs to be <= n_components + if "n_best" in estimator.get_params(): + estimator.set_params(n_best=1) + if "patch_size" in estimator.get_params(): + # set patch size to fix PatchExtractor test. + estimator.set_params(patch_size=(3, 3)) + except SkipTest: + continue + + yield estimator + + # nested Pipeline & FeatureUnion + # fmt: off + yield Pipeline([ + ("features", FeatureUnion([ + ("scaler", StandardScaler()), + ("scaled-poly", Pipeline([ + ("polys", FeatureUnion([ + ("poly1", PolynomialFeatures()), + ("poly2", PolynomialFeatures(degree=3, include_bias=False)) + ])), + ("scale", MinMaxScaler()), + ])), + ])), + ("clf", LogisticRegression(random_state=0, solver="liblinear")), + ]) + # fmt: on + + # FunctionTransformer with numpy functions + yield FunctionTransformer( + func=np.sqrt, + inverse_func=np.square, + ) + + # FunctionTransformer with scipy functions - problem is that they look like + # numpy ufuncs + yield FunctionTransformer( + func=special.erf, + inverse_func=special.erfinv, + ) + + # partial functions should be supported + yield FunctionTransformer( + func=partial(np.add, 10), + inverse_func=partial(np.add, -10), + ) + + yield KNeighborsClassifier(algorithm="kd_tree") + yield KNeighborsRegressor(algorithm="ball_tree") + + yield ColumnTransformer( + [ + ("norm1", Normalizer(norm="l1"), [0]), + ("norm2", Normalizer(norm="l1"), [1, 
2]), + ("norm3", Normalizer(norm="l1"), [True] + (N_FEATURES - 1) * [False]), + ("norm4", Normalizer(norm="l1"), np.array([1, 2])), + ("norm5", Normalizer(norm="l1"), slice(3)), + ("norm6", Normalizer(norm="l1"), slice(-10, -3, 2)), + ], + ) + + yield GridSearchCV( + LogisticRegression(random_state=0, solver="liblinear"), + {"C": [1, 2, 3, 4, 5]}, + ) + + yield HalvingGridSearchCV( + LogisticRegression(random_state=0, solver="liblinear"), + {"C": [1, 2, 3, 4, 5]}, + ) + + yield HalvingRandomSearchCV( + LogisticRegression(random_state=0, solver="liblinear"), + {"C": [1, 2, 3, 4, 5]}, + ) + + yield RandomizedSearchCV( + LogisticRegression(random_state=0, solver="liblinear"), + {"C": [1, 2, 3, 4, 5]}, + n_iter=3, + ) + + dictionary = np.random.randint(-2, 3, size=(5, N_FEATURES)).astype(float) + yield SparseCoder( + dictionary=dictionary, + transform_algorithm="lasso_lars", + ) From 4fde78242a47c9ccf1036dad85d6f9137390ce2e Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Mon, 5 Dec 2022 12:19:47 +0100 Subject: [PATCH 4/8] fixed typo in test --- skops/io/tests/test_audit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skops/io/tests/test_audit.py b/skops/io/tests/test_audit.py index be29e89e..ba631137 100644 --- a/skops/io/tests/test_audit.py +++ b/skops/io/tests/test_audit.py @@ -153,7 +153,7 @@ def __init__(self): ) def test_sklearn_trusted_types(estimator): untrusted_types = get_untrusted_types(data=dumps(estimator)) - sklearn_untrusted_types = [t for t in untrusted_types if t.startswith("skelarn.")] + sklearn_untrusted_types = [t for t in untrusted_types if t.startswith("sklearn.")] assert len(sklearn_untrusted_types) == 0 From 2a96fc55dd169cf87984c746f4677f680203121c Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Tue, 6 Dec 2022 22:57:11 +0100 Subject: [PATCH 5/8] moved sklearn_untrusted_type inside test_can_persist_fitted --- 
skops/io/tests/test_audit.py | 11 --- skops/io/tests/test_persist.py | 142 ++++++++++++++++++++++++++++++-- skops/io/tests/testing_utils.py | 139 ------------------------------- 3 files changed, 135 insertions(+), 157 deletions(-) delete mode 100644 skops/io/tests/testing_utils.py diff --git a/skops/io/tests/test_audit.py b/skops/io/tests/test_audit.py index ba631137..71914b4d 100644 --- a/skops/io/tests/test_audit.py +++ b/skops/io/tests/test_audit.py @@ -9,13 +9,11 @@ from sklearn.linear_model import LogisticRegression from sklearn.pipeline import FeatureUnion, Pipeline from sklearn.preprocessing import FunctionTransformer, StandardScaler -from sklearn.utils.estimator_checks import _get_check_estimator_ids from skops.io import dumps, get_untrusted_types from skops.io._audit import Node, audit_tree, check_type, get_tree, temp_setattr from skops.io._general import DictNode, dict_get_state from skops.io._utils import LoadContext, SaveContext, gettype -from skops.io.tests.testing_utils import get_tested_estimators class CustomType: @@ -148,15 +146,6 @@ def __init__(self): assert not hasattr(temp, "b") -@pytest.mark.parametrize( - "estimator", get_tested_estimators(), ids=_get_check_estimator_ids -) -def test_sklearn_trusted_types(estimator): - untrusted_types = get_untrusted_types(data=dumps(estimator)) - sklearn_untrusted_types = [t for t in untrusted_types if t.startswith("sklearn.")] - assert len(sklearn_untrusted_types) == 0 - - def test_complex_pipeline_untrusted_set(): # fmt: off clf = Pipeline([ diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py index 29dcff36..ce1c6ec9 100644 --- a/skops/io/tests/test_persist.py +++ b/skops/io/tests/test_persist.py @@ -5,24 +5,37 @@ import sys import warnings from collections import Counter -from functools import wraps +from functools import partial, wraps from pathlib import Path from zipfile import ZipFile import joblib import numpy as np import pytest -from scipy import sparse +from scipy import 
sparse, special from sklearn.base import BaseEstimator, is_regressor +from sklearn.compose import ColumnTransformer from sklearn.datasets import load_sample_images, make_classification, make_regression +from sklearn.decomposition import SparseCoder from sklearn.exceptions import SkipTestWarning from sklearn.experimental import enable_halving_search_cv # noqa from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import GroupKFold, KFold, ShuffleSplit, check_cv +from sklearn.model_selection import ( + GridSearchCV, + GroupKFold, + HalvingGridSearchCV, + HalvingRandomSearchCV, + KFold, + RandomizedSearchCV, + ShuffleSplit, + check_cv, +) +from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor from sklearn.pipeline import FeatureUnion, Pipeline from sklearn.preprocessing import ( FunctionTransformer, MinMaxScaler, + Normalizer, PolynomialFeatures, StandardScaler, ) @@ -45,7 +58,10 @@ from skops.io._sklearn import UNSUPPORTED_TYPES from skops.io._utils import LoadContext, SaveContext, _get_state, get_state from skops.io.exceptions import UnsupportedTypeException -from skops.io.tests.testing_utils import N_FEATURES, N_SAMPLES, get_tested_estimators + +# Default settings for X +N_SAMPLES = 50 +N_FEATURES = 20 # TODO: Investigate why that seems to be an issue on MacOS (only observed with # Python 3.8) @@ -106,6 +122,112 @@ def wrapper(state, load_context, trusted): NODE_TYPE_MAPPING[key] = debug_get_tree(method) +def _tested_estimators(type_filter=None): + for name, Estimator in all_estimators(type_filter=type_filter): + if Estimator in UNSUPPORTED_TYPES: + continue + try: + # suppress warnings here for skipped estimators. + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + category=SkipTestWarning, + message="Can't instantiate estimator", + ) + estimator = _construct_instance(Estimator) + # with the kind of data we pass, it needs to be 1 for the few + # estimators which have this. 
+ if "n_components" in estimator.get_params(): + estimator.set_params(n_components=1) + # Then n_best needs to be <= n_components + if "n_best" in estimator.get_params(): + estimator.set_params(n_best=1) + if "patch_size" in estimator.get_params(): + # set patch size to fix PatchExtractor test. + estimator.set_params(patch_size=(3, 3)) + except SkipTest: + continue + + yield estimator + + # nested Pipeline & FeatureUnion + # fmt: off + yield Pipeline([ + ("features", FeatureUnion([ + ("scaler", StandardScaler()), + ("scaled-poly", Pipeline([ + ("polys", FeatureUnion([ + ("poly1", PolynomialFeatures()), + ("poly2", PolynomialFeatures(degree=3, include_bias=False)) + ])), + ("scale", MinMaxScaler()), + ])), + ])), + ("clf", LogisticRegression(random_state=0, solver="liblinear")), + ]) + # fmt: on + + # FunctionTransformer with numpy functions + yield FunctionTransformer( + func=np.sqrt, + inverse_func=np.square, + ) + + # FunctionTransformer with scipy functions - problem is that they look like + # numpy ufuncs + yield FunctionTransformer( + func=special.erf, + inverse_func=special.erfinv, + ) + + # partial functions should be supported + yield FunctionTransformer( + func=partial(np.add, 10), + inverse_func=partial(np.add, -10), + ) + + yield KNeighborsClassifier(algorithm="kd_tree") + yield KNeighborsRegressor(algorithm="ball_tree") + + yield ColumnTransformer( + [ + ("norm1", Normalizer(norm="l1"), [0]), + ("norm2", Normalizer(norm="l1"), [1, 2]), + ("norm3", Normalizer(norm="l1"), [True] + (N_FEATURES - 1) * [False]), + ("norm4", Normalizer(norm="l1"), np.array([1, 2])), + ("norm5", Normalizer(norm="l1"), slice(3)), + ("norm6", Normalizer(norm="l1"), slice(-10, -3, 2)), + ], + ) + + yield GridSearchCV( + LogisticRegression(random_state=0, solver="liblinear"), + {"C": [1, 2, 3, 4, 5]}, + ) + + yield HalvingGridSearchCV( + LogisticRegression(random_state=0, solver="liblinear"), + {"C": [1, 2, 3, 4, 5]}, + ) + + yield HalvingRandomSearchCV( + 
LogisticRegression(random_state=0, solver="liblinear"), + {"C": [1, 2, 3, 4, 5]}, + ) + + yield RandomizedSearchCV( + LogisticRegression(random_state=0, solver="liblinear"), + {"C": [1, 2, 3, 4, 5]}, + n_iter=3, + ) + + dictionary = np.random.randint(-2, 3, size=(5, N_FEATURES)).astype(float) + yield SparseCoder( + dictionary=dictionary, + transform_algorithm="lasso_lars", + ) + + def _unsupported_estimators(type_filter=None): for name, Estimator in all_estimators(type_filter=type_filter): if Estimator not in UNSUPPORTED_TYPES: @@ -276,7 +398,7 @@ def assert_params_equal(params1, params2): @pytest.mark.parametrize( - "estimator", get_tested_estimators(), ids=_get_check_estimator_ids + "estimator", _tested_estimators(), ids=_get_check_estimator_ids ) def test_can_persist_non_fitted(estimator): """Check that non-fitted estimators can be persisted.""" @@ -344,9 +466,9 @@ def get_input(estimator): @pytest.mark.parametrize( - "estimator", get_tested_estimators(), ids=_get_check_estimator_ids + "estimator", _tested_estimators(), ids=_get_check_estimator_ids ) -def test_can_persist_fitted(estimator, request): +def test_can_persist_fitted(estimator): """Check that fitted estimators can be persisted and return the right results.""" set_random_state(estimator, random_state=0) @@ -369,6 +491,12 @@ def test_can_persist_fitted(estimator, request): loaded = loads(dumped, trusted=untrusted_types) assert_params_equal(estimator.__dict__, loaded.__dict__) + # test that most sklearn estimators are not in untrusted_types + sklearn_untrusted_types = [ + type_ for type_ in untrusted_types if type_.startswith("sklearn.") + ] + assert len(sklearn_untrusted_types) == 0 + for method in [ "predict", "predict_proba", diff --git a/skops/io/tests/testing_utils.py b/skops/io/tests/testing_utils.py deleted file mode 100644 index 48e66dd0..00000000 --- a/skops/io/tests/testing_utils.py +++ /dev/null @@ -1,139 +0,0 @@ -import warnings -from functools import partial - -import numpy as np -from scipy 
import special -from sklearn.compose import ColumnTransformer -from sklearn.decomposition import SparseCoder -from sklearn.exceptions import SkipTestWarning -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import ( - GridSearchCV, - HalvingGridSearchCV, - HalvingRandomSearchCV, - RandomizedSearchCV, -) -from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor -from sklearn.pipeline import FeatureUnion, Pipeline -from sklearn.preprocessing import ( - FunctionTransformer, - MinMaxScaler, - Normalizer, - PolynomialFeatures, - StandardScaler, -) -from sklearn.utils import all_estimators -from sklearn.utils._testing import SkipTest -from sklearn.utils.estimator_checks import _construct_instance - -from skops.io._sklearn import UNSUPPORTED_TYPES - -# Default settings for X -N_SAMPLES = 50 -N_FEATURES = 20 - - -def get_tested_estimators(type_filter=None): - for name, Estimator in all_estimators(type_filter=type_filter): - if Estimator in UNSUPPORTED_TYPES: - continue - try: - # suppress warnings here for skipped estimators. - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - category=SkipTestWarning, - message="Can't instantiate estimator", - ) - estimator = _construct_instance(Estimator) - # with the kind of data we pass, it needs to be 1 for the few - # estimators which have this. - if "n_components" in estimator.get_params(): - estimator.set_params(n_components=1) - # Then n_best needs to be <= n_components - if "n_best" in estimator.get_params(): - estimator.set_params(n_best=1) - if "patch_size" in estimator.get_params(): - # set patch size to fix PatchExtractor test. 
- estimator.set_params(patch_size=(3, 3)) - except SkipTest: - continue - - yield estimator - - # nested Pipeline & FeatureUnion - # fmt: off - yield Pipeline([ - ("features", FeatureUnion([ - ("scaler", StandardScaler()), - ("scaled-poly", Pipeline([ - ("polys", FeatureUnion([ - ("poly1", PolynomialFeatures()), - ("poly2", PolynomialFeatures(degree=3, include_bias=False)) - ])), - ("scale", MinMaxScaler()), - ])), - ])), - ("clf", LogisticRegression(random_state=0, solver="liblinear")), - ]) - # fmt: on - - # FunctionTransformer with numpy functions - yield FunctionTransformer( - func=np.sqrt, - inverse_func=np.square, - ) - - # FunctionTransformer with scipy functions - problem is that they look like - # numpy ufuncs - yield FunctionTransformer( - func=special.erf, - inverse_func=special.erfinv, - ) - - # partial functions should be supported - yield FunctionTransformer( - func=partial(np.add, 10), - inverse_func=partial(np.add, -10), - ) - - yield KNeighborsClassifier(algorithm="kd_tree") - yield KNeighborsRegressor(algorithm="ball_tree") - - yield ColumnTransformer( - [ - ("norm1", Normalizer(norm="l1"), [0]), - ("norm2", Normalizer(norm="l1"), [1, 2]), - ("norm3", Normalizer(norm="l1"), [True] + (N_FEATURES - 1) * [False]), - ("norm4", Normalizer(norm="l1"), np.array([1, 2])), - ("norm5", Normalizer(norm="l1"), slice(3)), - ("norm6", Normalizer(norm="l1"), slice(-10, -3, 2)), - ], - ) - - yield GridSearchCV( - LogisticRegression(random_state=0, solver="liblinear"), - {"C": [1, 2, 3, 4, 5]}, - ) - - yield HalvingGridSearchCV( - LogisticRegression(random_state=0, solver="liblinear"), - {"C": [1, 2, 3, 4, 5]}, - ) - - yield HalvingRandomSearchCV( - LogisticRegression(random_state=0, solver="liblinear"), - {"C": [1, 2, 3, 4, 5]}, - ) - - yield RandomizedSearchCV( - LogisticRegression(random_state=0, solver="liblinear"), - {"C": [1, 2, 3, 4, 5]}, - n_iter=3, - ) - - dictionary = np.random.randint(-2, 3, size=(5, N_FEATURES)).astype(float) - yield SparseCoder( - 
dictionary=dictionary, - transform_algorithm="lasso_lars", - ) From 6e56ff63a179f4c3d393edc2ae5fa7a37f316da2 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Tue, 6 Dec 2022 23:22:52 +0100 Subject: [PATCH 6/8] excluded sklearn types in tests that are not yet trusted --- skops/io/tests/test_persist.py | 48 ++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py index ce1c6ec9..2831cbeb 100644 --- a/skops/io/tests/test_persist.py +++ b/skops/io/tests/test_persist.py @@ -67,6 +67,47 @@ # Python 3.8) ATOL = 1e-6 if sys.platform == "darwin" else 1e-7 +# TODO: remove when these are added to trusted types +SKLEARN_EXCLUDED_TYPES = ( + "sklearn._loss._loss.CyHalfBinomialLoss", + "sklearn._loss._loss.CyHalfGammaLoss", + "sklearn._loss._loss.CyHalfPoissonLoss", + "sklearn._loss._loss.CyHalfSquaredError", + "sklearn._loss._loss.CyHalfTweedieLossIdentity", + "sklearn._loss.link.IdentityLink", + "sklearn._loss.link.Interval", + "sklearn._loss.link.LogLink", + "sklearn._loss.link.LogitLink", + "sklearn._loss.loss.HalfBinomialLoss", + "sklearn._loss.loss.HalfGammaLoss", + "sklearn._loss.loss.HalfPoissonLoss", + "sklearn._loss.loss.HalfSquaredError", + "sklearn._loss.loss.HalfTweedieLossIdentity", + "sklearn.calibration._CalibratedClassifier", + "sklearn.calibration._SigmoidCalibration", + "sklearn.cluster._bisect_k_means._BisectingTree", + "sklearn.cluster._kmeans._kmeans_single_lloyd", + "sklearn.covariance._graph_lasso._DictWithDeprecatedKeys", + "sklearn.ensemble._gb_losses.BinomialDeviance", + "sklearn.ensemble._gb_losses.LeastSquaresError", + "sklearn.ensemble._hist_gradient_boosting.binning._BinMapper", + "sklearn.ensemble._hist_gradient_boosting.predictor.TreePredictor", + "sklearn.feature_selection._univariate_selection.f_classif", + "sklearn.gaussian_process._gpc._BinaryGaussianProcessClassifierLaplace", + 
"sklearn.gaussian_process.kernels.ConstantKernel", + "sklearn.gaussian_process.kernels.Product", + "sklearn.gaussian_process.kernels.RBF", + "sklearn.impute._iterative._ImputerTriplet", + "sklearn.metrics._dist_metrics.EuclideanDistance", + "sklearn.metrics._scorer._passthrough_scorer", + "sklearn.model_selection._split.StratifiedKFold", + "sklearn.multiclass._ConstantPredictor", + "sklearn.neighbors._ball_tree.BallTree", + "sklearn.neighbors._kd_tree.KDTree", + "sklearn.neural_network._stochastic_optimizers.AdamOptimizer", + "sklearn.utils._bunch.Bunch", +) + @pytest.fixture(autouse=True, scope="module") def debug_dispatch_functions(): @@ -491,9 +532,12 @@ def test_can_persist_fitted(estimator): loaded = loads(dumped, trusted=untrusted_types) assert_params_equal(estimator.__dict__, loaded.__dict__) - # test that most sklearn estimators are not in untrusted_types + # test that sklearn types are trusted. Some known types are excluded + # from testing because they are not in the trusted list yet. 
sklearn_untrusted_types = [ - type_ for type_ in untrusted_types if type_.startswith("sklearn.") + type_ + for type_ in untrusted_types + if type_.startswith("sklearn.") and type_ not in SKLEARN_EXCLUDED_TYPES ] assert len(sklearn_untrusted_types) == 0 From 1fedb2c259e48c7b453e3a4aa39a1a482457e74a Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Wed, 7 Dec 2022 22:30:59 +0100 Subject: [PATCH 7/8] adjusted test for SKLEARN_ESTIMATOR_TYPE_NAMES --- skops/io/tests/test_persist.py | 51 ++-------------------------------- 1 file changed, 2 insertions(+), 49 deletions(-) diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py index 2831cbeb..012ab7f7 100644 --- a/skops/io/tests/test_persist.py +++ b/skops/io/tests/test_persist.py @@ -56,6 +56,7 @@ from skops.io import dump, dumps, get_untrusted_types, load, loads from skops.io._audit import NODE_TYPE_MAPPING, get_tree from skops.io._sklearn import UNSUPPORTED_TYPES +from skops.io._trusted_types import SKLEARN_ESTIMATOR_TYPE_NAMES from skops.io._utils import LoadContext, SaveContext, _get_state, get_state from skops.io.exceptions import UnsupportedTypeException @@ -67,47 +68,6 @@ # Python 3.8) ATOL = 1e-6 if sys.platform == "darwin" else 1e-7 -# TODO: remove when these are added to trusted types -SKLEARN_EXCLUDED_TYPES = ( - "sklearn._loss._loss.CyHalfBinomialLoss", - "sklearn._loss._loss.CyHalfGammaLoss", - "sklearn._loss._loss.CyHalfPoissonLoss", - "sklearn._loss._loss.CyHalfSquaredError", - "sklearn._loss._loss.CyHalfTweedieLossIdentity", - "sklearn._loss.link.IdentityLink", - "sklearn._loss.link.Interval", - "sklearn._loss.link.LogLink", - "sklearn._loss.link.LogitLink", - "sklearn._loss.loss.HalfBinomialLoss", - "sklearn._loss.loss.HalfGammaLoss", - "sklearn._loss.loss.HalfPoissonLoss", - "sklearn._loss.loss.HalfSquaredError", - "sklearn._loss.loss.HalfTweedieLossIdentity", - "sklearn.calibration._CalibratedClassifier", - 
"sklearn.calibration._SigmoidCalibration", - "sklearn.cluster._bisect_k_means._BisectingTree", - "sklearn.cluster._kmeans._kmeans_single_lloyd", - "sklearn.covariance._graph_lasso._DictWithDeprecatedKeys", - "sklearn.ensemble._gb_losses.BinomialDeviance", - "sklearn.ensemble._gb_losses.LeastSquaresError", - "sklearn.ensemble._hist_gradient_boosting.binning._BinMapper", - "sklearn.ensemble._hist_gradient_boosting.predictor.TreePredictor", - "sklearn.feature_selection._univariate_selection.f_classif", - "sklearn.gaussian_process._gpc._BinaryGaussianProcessClassifierLaplace", - "sklearn.gaussian_process.kernels.ConstantKernel", - "sklearn.gaussian_process.kernels.Product", - "sklearn.gaussian_process.kernels.RBF", - "sklearn.impute._iterative._ImputerTriplet", - "sklearn.metrics._dist_metrics.EuclideanDistance", - "sklearn.metrics._scorer._passthrough_scorer", - "sklearn.model_selection._split.StratifiedKFold", - "sklearn.multiclass._ConstantPredictor", - "sklearn.neighbors._ball_tree.BallTree", - "sklearn.neighbors._kd_tree.KDTree", - "sklearn.neural_network._stochastic_optimizers.AdamOptimizer", - "sklearn.utils._bunch.Bunch", -) - @pytest.fixture(autouse=True, scope="module") def debug_dispatch_functions(): @@ -532,14 +492,7 @@ def test_can_persist_fitted(estimator): loaded = loads(dumped, trusted=untrusted_types) assert_params_equal(estimator.__dict__, loaded.__dict__) - # test that sklearn types are trusted. Some known types are excluded - # from testing because they are not in the trusted list yet. 
- sklearn_untrusted_types = [ - type_ - for type_ in untrusted_types - if type_.startswith("sklearn.") and type_ not in SKLEARN_EXCLUDED_TYPES - ] - assert len(sklearn_untrusted_types) == 0 + assert not any(type_ in SKLEARN_ESTIMATOR_TYPE_NAMES for type_ in untrusted_types) for method in [ "predict", From df2cc5a3bb2d3a92db9bcfc7023223a4ab3e4081 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Sun, 11 Dec 2022 18:43:59 +0100 Subject: [PATCH 8/8] updated changelog --- docs/changes.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/changes.rst b/docs/changes.rst index 034074a4..bea77aab 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -14,6 +14,8 @@ v0.4 - :func:`.io.dump` and :func:`.io.load` now work with file like objects, which means you can use them with the ``with open(...) as f: dump(obj, f)`` pattern, like you'd do with ``pickle``. :pr:`234` by `Benjamin Bossan`_. +- All `scikit-learn` estimators are trusted by default. + :pr:`237` by :user:`Edoardo Abati <EdAbati>`. v0.3 ----