diff --git a/docs/changes.rst b/docs/changes.rst index f70dcaaf..fb01aa3d 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -17,6 +17,8 @@ v0.7 `Adrin Jalali`_. - Fix: :func:`skops.io.visualize` is now capable of showing bytes. :pr:`352` by `Benjamin Bossan`_. +- All public ``numpy`` ufuncs (Universal Functions) and dtypes are trusted by default + by :func:`.io.load`. :pr:`336` by :user:`Omar Arab Oghli `. v0.6 ---- diff --git a/docs/persistence.rst b/docs/persistence.rst index 4d7e9e84..b2351e0f 100644 --- a/docs/persistence.rst +++ b/docs/persistence.rst @@ -82,7 +82,7 @@ using :func:`skops.io.get_untrusted_types`: from skops.io import get_untrusted_types unknown_types = get_untrusted_types(file="my-model.skops") print(unknown_types) - ['numpy.float64', 'numpy.int64', 'sklearn.metrics._scorer._passthrough_scorer', + ['sklearn.metrics._scorer._passthrough_scorer', 'xgboost.core.Booster', 'xgboost.sklearn.XGBClassifier'] Note that everything in the above list is safe to load. We already have many @@ -108,7 +108,8 @@ At the moment, ``skops`` cannot persist arbitrary Python code. This means if you have custom functions (say, a custom function to be used with :class:`sklearn.preprocessing.FunctionTransformer`), it will not work. However, most ``numpy`` and ``scipy`` functions should work. Therefore, you can save -objects having references to functions such as ``numpy.sqrt``. +objects having references to functions or universal functions (ufuncs) +such as ``numpy.sqrt``. Compression ~~~~~~~~~~~ diff --git a/skops/io/_general.py b/skops/io/_general.py index 0902857c..6d70b7ac 100644 --- a/skops/io/_general.py +++ b/skops/io/_general.py @@ -14,6 +14,8 @@ from ._audit import Node, get_tree from ._protocol import PROTOCOL from ._trusted_types import ( + NUMPY_DTYPE_TYPE_NAMES, + NUMPY_UFUNC_TYPE_NAMES, PRIMITIVE_TYPE_NAMES, SCIPY_UFUNC_TYPE_NAMES, SKLEARN_ESTIMATOR_TYPE_NAMES, @@ -210,7 +212,9 @@ def __init__( ) -> None: super().__init__(state, load_context, trusted) # TODO: what do we trust? - self.trusted = self._get_trusted(trusted, default=SCIPY_UFUNC_TYPE_NAMES) + self.trusted = self._get_trusted( + trusted, default=SCIPY_UFUNC_TYPE_NAMES + NUMPY_UFUNC_TYPE_NAMES + ) self.children = {} def _construct(self): @@ -293,7 +297,9 @@ def __init__( ) -> None: super().__init__(state, load_context, trusted) # TODO: what do we trust? - self.trusted = self._get_trusted(trusted, PRIMITIVE_TYPE_NAMES) + self.trusted = self._get_trusted( + trusted, PRIMITIVE_TYPE_NAMES + NUMPY_DTYPE_TYPE_NAMES + ) # We use a bare Node type here since a Node only checks the type in the # dict using __class__ and __module__ keys. self.children = {} diff --git a/skops/io/_numpy.py b/skops/io/_numpy.py index 1a0aa413..ec474ec4 100644 --- a/skops/io/_numpy.py +++ b/skops/io/_numpy.py @@ -8,6 +8,7 @@ from ._audit import Node, get_tree from ._general import function_get_state from ._protocol import PROTOCOL +from ._trusted_types import NUMPY_DTYPE_TYPE_NAMES from ._utils import LoadContext, SaveContext, get_module, get_state, gettype from .exceptions import UnsupportedTypeException @@ -52,6 +53,10 @@ def ndarray_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: class NdArrayNode(Node): + # TODO: NdArrayNode is not only responsible for np.arrays + # but also for np.generics, thus the confusion with DTypeNode. + # See PR-336 + def __init__( self, state: dict[str, Any], @@ -60,7 +65,9 @@ def __init__( ) -> None: super().__init__(state, load_context, trusted) self.type = state["type"] - self.trusted = self._get_trusted(trusted, [np.ndarray]) + self.trusted = self._get_trusted( + trusted, [np.ndarray] + NUMPY_DTYPE_TYPE_NAMES # type: ignore + ) if self.type == "numpy": self.children = { "content": io.BytesIO(load_context.src.read(state["file"])) diff --git a/skops/io/_trusted_types.py b/skops/io/_trusted_types.py index 39e55573..e48df817 100644 --- a/skops/io/_trusted_types.py +++ b/skops/io/_trusted_types.py @@ -2,7 +2,7 @@ import scipy from sklearn.utils import all_estimators -from ._utils import get_type_name +from ._utils import get_public_type_names, get_type_name PRIMITIVES_TYPES = [int, float, str, bool] @@ -14,13 +14,15 @@ if get_type_name(estimator_class).startswith("sklearn.") ] -SCIPY_UFUNC_TYPE_NAMES = sorted( - set( - [ - get_type_name(getattr(scipy.special, attr)) - for attr in dir(scipy.special) - if isinstance(getattr(scipy.special, attr), np.ufunc) - and get_type_name(getattr(scipy.special, attr)).startswith("scipy") - ] - ) +SCIPY_UFUNC_TYPE_NAMES = get_public_type_names(module=scipy.special, oftype=np.ufunc) + +NUMPY_UFUNC_TYPE_NAMES = get_public_type_names(module=np, oftype=np.ufunc) + +NUMPY_DTYPE_TYPE_NAMES = sorted( + { + type_name + for dtypes in np.sctypes.values() + for dtype in dtypes # type: ignore + if (type_name := get_type_name(dtype)).startswith("numpy") + } ) diff --git a/skops/io/_utils.py b/skops/io/_utils.py index f147d15a..d929380a 100644 --- a/skops/io/_utils.py +++ b/skops/io/_utils.py @@ -4,6 +4,7 @@ import sys from dataclasses import dataclass, field from functools import singledispatch +from types import ModuleType from typing import Any, Type from zipfile import ZipFile @@ -200,3 +201,36 @@ def get_type_paths(types: Any) -> list[str]: types = [types] return [get_type_name(t) if not isinstance(t, str) else t for t in types] + + +def get_public_type_names(module: ModuleType, oftype: Type) -> list[str]: + """ + Helper function that gets the type names of all + public objects of the given ``oftype`` from the given ``module``, + which start with the root module name. + + Public objects are those that can be read via ``dir(...)``. + + Parameters + ---------- + module: ModuleType + Module under which the public objects are defined. + oftype: Type + The type of the objects. + + Returns + ---------- + type_names_list: list of str + The sorted list of type names, all as strings, + e.g. ``["numpy.core._multiarray_umath.absolute"]``. + """ + module_name, _, _ = module.__name__.rpartition(".") + + return sorted( + { + type_name + for attr in dir(module) + if issubclass((obj := getattr(module, attr)).__class__, oftype) + and (type_name := get_type_name(obj)).startswith(module_name) + } + ) diff --git a/skops/io/tests/test_audit.py b/skops/io/tests/test_audit.py index 7cf4ffc6..756e9988 100644 --- a/skops/io/tests/test_audit.py +++ b/skops/io/tests/test_audit.py @@ -4,11 +4,8 @@ from contextlib import suppress from zipfile import ZipFile -import numpy as np import pytest from sklearn.linear_model import LogisticRegression -from sklearn.pipeline import FeatureUnion, Pipeline -from sklearn.preprocessing import FunctionTransformer, StandardScaler from skops.io import dumps, get_untrusted_types from skops.io._audit import Node, audit_tree, check_type, get_tree, temp_setattr @@ -152,25 +149,6 @@ def __init__(self): assert not hasattr(temp, "b") -def test_complex_pipeline_untrusted_set(): - # fmt: off - clf = Pipeline([ - ("features", FeatureUnion([ - ("scaler", StandardScaler()), - ("sqrt", FunctionTransformer( - func=np.sqrt, - inverse_func=np.square, - )), - ])), - ("clf", LogisticRegression(random_state=0, solver="liblinear")), - ]) - # fmt: on - - untrusted = get_untrusted_types(data=dumps(clf)) - type_names = [x.split(".")[-1] for x in untrusted] - assert type_names == ["sqrt", "square"] - - def test_format_object_node(): estimator = LogisticRegression(random_state=0, solver="liblinear") state = get_state(estimator, SaveContext(None)) diff --git a/skops/io/tests/test_external.py b/skops/io/tests/test_external.py index b41253c0..2c90c1cd 100644 --- a/skops/io/tests/test_external.py +++ b/skops/io/tests/test_external.py @@ -81,8 +81,6 @@ def trusted(self): "lightgbm.sklearn.LGBMClassifier", "lightgbm.sklearn.LGBMRegressor", "lightgbm.sklearn.LGBMRanker", - "numpy.int32", - "numpy.int64", "sklearn.preprocessing._label.LabelEncoder", ] @@ -329,8 +327,6 @@ def trusted(self): # TODO: adjust once more types are trusted by default return [ "builtins.bytes", - "numpy.float32", - "numpy.float64", "catboost.core.CatBoostClassifier", "catboost.core.CatBoostRegressor", "catboost.core.CatBoostRanker", diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py index e501abc1..e8354f3d 100644 --- a/skops/io/tests/test_persist.py +++ b/skops/io/tests/test_persist.py @@ -54,7 +54,13 @@ from skops.io import dump, dumps, get_untrusted_types, load, loads from skops.io._audit import NODE_TYPE_MAPPING, get_tree from skops.io._sklearn import UNSUPPORTED_TYPES -from skops.io._trusted_types import SCIPY_UFUNC_TYPE_NAMES, SKLEARN_ESTIMATOR_TYPE_NAMES +from skops.io._trusted_types import ( + NUMPY_DTYPE_TYPE_NAMES, + NUMPY_UFUNC_TYPE_NAMES, + PRIMITIVE_TYPE_NAMES, + SCIPY_UFUNC_TYPE_NAMES, + SKLEARN_ESTIMATOR_TYPE_NAMES, +) from skops.io._utils import LoadContext, SaveContext, _get_state, get_state, gettype from skops.io.exceptions import UnsupportedTypeException, UntrustedTypesFoundException from skops.io.tests._utils import assert_method_outputs_equal, assert_params_equal @@ -225,11 +231,17 @@ def _tested_estimators(type_filter=None): def _tested_ufuncs(): - for full_name in SCIPY_UFUNC_TYPE_NAMES: + for full_name in SCIPY_UFUNC_TYPE_NAMES + NUMPY_UFUNC_TYPE_NAMES: module_name, _, ufunc_name = full_name.rpartition(".") yield gettype(module_name=module_name, cls_or_func=ufunc_name) +def _tested_types(): + for full_name in PRIMITIVE_TYPE_NAMES + NUMPY_DTYPE_TYPE_NAMES: + module_name, _, type_name = full_name.rpartition(".") + yield gettype(module_name=module_name, cls_or_func=type_name) + + def _unsupported_estimators(type_filter=None): for name, Estimator in all_estimators(type_filter=type_filter): if Estimator not in UNSUPPORTED_TYPES: @@ -359,15 +371,27 @@ def test_can_persist_fitted(estimator): assert not any(type_ in SKLEARN_ESTIMATOR_TYPE_NAMES for type_ in untrusted_types) assert not any(type_ in SCIPY_UFUNC_TYPE_NAMES for type_ in untrusted_types) + assert not any(type_ in NUMPY_UFUNC_TYPE_NAMES for type_ in untrusted_types) + assert not any(type_ in NUMPY_DTYPE_TYPE_NAMES for type_ in untrusted_types) assert_method_outputs_equal(estimator, loaded, X) -@pytest.mark.parametrize("ufunc", _tested_ufuncs(), ids=SCIPY_UFUNC_TYPE_NAMES) +@pytest.mark.parametrize( + "ufunc", _tested_ufuncs(), ids=SCIPY_UFUNC_TYPE_NAMES + NUMPY_UFUNC_TYPE_NAMES +) def test_can_trust_ufuncs(ufunc): dumped = dumps(ufunc) untrusted_types = get_untrusted_types(data=dumped) assert len(untrusted_types) == 0 - # TODO: extend with numpy ufuncs + + +@pytest.mark.parametrize( + "type_", _tested_types(), ids=PRIMITIVE_TYPE_NAMES + NUMPY_DTYPE_TYPE_NAMES +) +def test_can_trust_types(type_): + dumped = dumps(type_) + untrusted_types = get_untrusted_types(data=dumped) + assert len(untrusted_types) == 0 @pytest.mark.parametrize( diff --git a/skops/io/tests/test_visualize.py b/skops/io/tests/test_visualize.py index 77d2fd7d..65625903 100644 --- a/skops/io/tests/test_visualize.py +++ b/skops/io/tests/test_visualize.py @@ -101,12 +101,8 @@ def sink(nodes_iter, *args, **kwargs): nodes_self_unsafe = [node for node in nodes if not node.is_self_safe] nodes_unsafe = [node for node in nodes if not node.is_safe] - # there are currently 2 unsafe nodes, a numpy int and the custom - # functions. The former might be considered safe in the future, in which - # case this test needs to be changed. - assert len(nodes_self_unsafe) == 2 - assert nodes_self_unsafe[0].val == "numpy.int64" - assert nodes_self_unsafe[1].val == "test_visualize.unsafe_function" + assert len(nodes_self_unsafe) == 1 + assert nodes_self_unsafe[0].val == "test_visualize.unsafe_function" # it's not easy to test the number of indirectly unsafe nodes, because # it will depend on the nesting structure; we can only be sure that it's @@ -114,13 +110,10 @@ def sink(nodes_iter, *args, **kwargs): assert len(nodes_unsafe) > 2 assert any("FunctionTransformer" in node.val for node in nodes_unsafe) - @pytest.mark.parametrize( - "trusted", [True, ["numpy.int64", "test_visualize.unsafe_function"]] - ) + @pytest.mark.parametrize("trusted", [True, ["test_visualize.unsafe_function"]]) def test_all_nodes_trusted(self, pipeline, trusted, capsys): # The pipeline contains untrusted type(s), but if we pass trusted=True, # it is not considered untrusted anymore - # TODO: remove numpy.int64 from trusted once it's trusted by default file = sio.dumps(pipeline) sio.visualize(file, show="untrusted", trusted=trusted) expected = "root: sklearn.pipeline.Pipeline"