Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ v0.7
`Adrin Jalali`_.
- Fix: :func:`skops.io.visualize` is now capable of showing bytes. :pr:`352` by
`Benjamin Bossan`_.
- All public ``numpy`` ufuncs (Universal Functions) and dtypes are trusted by default
by :func:`skops.io.load`. :pr:`336` by :user:`Omar Arab Oghli <omar-araboghli>`.

v0.6
----
Expand Down
5 changes: 3 additions & 2 deletions docs/persistence.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ using :func:`skops.io.get_untrusted_types`:
from skops.io import get_untrusted_types
unknown_types = get_untrusted_types(file="my-model.skops")
print(unknown_types)
['numpy.float64', 'numpy.int64', 'sklearn.metrics._scorer._passthrough_scorer',
['sklearn.metrics._scorer._passthrough_scorer',
'xgboost.core.Booster', 'xgboost.sklearn.XGBClassifier']

Note that everything in the above list is safe to load. We already have many
Expand All @@ -108,7 +108,8 @@ At the moment, ``skops`` cannot persist arbitrary Python code. This means if
you have custom functions (say, a custom function to be used with
:class:`sklearn.preprocessing.FunctionTransformer`), it will not work. However,
most ``numpy`` and ``scipy`` functions should work. Therefore, you can save
objects having references to functions such as ``numpy.sqrt``.
objects having references to functions or universal functions (ufuncs)
such as ``numpy.sqrt``.

Compression
~~~~~~~~~~~
Expand Down
10 changes: 8 additions & 2 deletions skops/io/_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from ._audit import Node, get_tree
from ._protocol import PROTOCOL
from ._trusted_types import (
NUMPY_DTYPE_TYPE_NAMES,
NUMPY_UFUNC_TYPE_NAMES,
PRIMITIVE_TYPE_NAMES,
SCIPY_UFUNC_TYPE_NAMES,
SKLEARN_ESTIMATOR_TYPE_NAMES,
Expand Down Expand Up @@ -210,7 +212,9 @@ def __init__(
) -> None:
super().__init__(state, load_context, trusted)
# TODO: what do we trust?
self.trusted = self._get_trusted(trusted, default=SCIPY_UFUNC_TYPE_NAMES)
self.trusted = self._get_trusted(
trusted, default=SCIPY_UFUNC_TYPE_NAMES + NUMPY_UFUNC_TYPE_NAMES
)
self.children = {}

def _construct(self):
Expand Down Expand Up @@ -293,7 +297,9 @@ def __init__(
) -> None:
super().__init__(state, load_context, trusted)
# TODO: what do we trust?
self.trusted = self._get_trusted(trusted, PRIMITIVE_TYPE_NAMES)
self.trusted = self._get_trusted(
trusted, PRIMITIVE_TYPE_NAMES + NUMPY_DTYPE_TYPE_NAMES
)
# We use a bare Node type here since a Node only checks the type in the
# dict using __class__ and __module__ keys.
self.children = {}
Expand Down
9 changes: 8 additions & 1 deletion skops/io/_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from ._audit import Node, get_tree
from ._general import function_get_state
from ._protocol import PROTOCOL
from ._trusted_types import NUMPY_DTYPE_TYPE_NAMES
from ._utils import LoadContext, SaveContext, get_module, get_state, gettype
from .exceptions import UnsupportedTypeException

Expand Down Expand Up @@ -52,6 +53,10 @@ def ndarray_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]:


class NdArrayNode(Node):
# TODO: NdArrayNode is not only responsible for np.arrays
# but also for np.generics, thus the confusion with DTypeNode.
# See PR-336

def __init__(
self,
state: dict[str, Any],
Expand All @@ -60,7 +65,9 @@ def __init__(
) -> None:
super().__init__(state, load_context, trusted)
self.type = state["type"]
self.trusted = self._get_trusted(trusted, [np.ndarray])
self.trusted = self._get_trusted(
trusted, [np.ndarray] + NUMPY_DTYPE_TYPE_NAMES # type: ignore
)
if self.type == "numpy":
self.children = {
"content": io.BytesIO(load_context.src.read(state["file"]))
Expand Down
22 changes: 12 additions & 10 deletions skops/io/_trusted_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import scipy
from sklearn.utils import all_estimators

from ._utils import get_type_name
from ._utils import get_public_type_names, get_type_name

PRIMITIVES_TYPES = [int, float, str, bool]

Expand All @@ -14,13 +14,15 @@
if get_type_name(estimator_class).startswith("sklearn.")
]

SCIPY_UFUNC_TYPE_NAMES = sorted(
set(
[
get_type_name(getattr(scipy.special, attr))
for attr in dir(scipy.special)
if isinstance(getattr(scipy.special, attr), np.ufunc)
and get_type_name(getattr(scipy.special, attr)).startswith("scipy")
]
)
# Fully qualified names of all public scipy.special ufuncs, e.g.
# "scipy.special._ufuncs.erf". Trusted by default.
SCIPY_UFUNC_TYPE_NAMES = get_public_type_names(module=scipy.special, oftype=np.ufunc)

# Fully qualified names of all public numpy ufuncs, e.g.
# "numpy.core._multiarray_umath.absolute". Trusted by default.
NUMPY_UFUNC_TYPE_NAMES = get_public_type_names(module=np, oftype=np.ufunc)

# ``np.sctypes`` was removed in numpy 2.0; fall back to ``np.sctypeDict``,
# which maps dtype aliases to the same scalar classes, so importing this
# module keeps working on both numpy 1.x and 2.x.
try:
    _NUMPY_DTYPE_CLASSES = [
        dtype
        for dtypes in np.sctypes.values()  # type: ignore
        for dtype in dtypes
    ]
except AttributeError:  # numpy >= 2.0
    _NUMPY_DTYPE_CLASSES = list(set(np.sctypeDict.values()))

# Fully qualified names of numpy scalar dtype classes, e.g. "numpy.int64".
NUMPY_DTYPE_TYPE_NAMES = sorted(
    {
        type_name
        for dtype in _NUMPY_DTYPE_CLASSES
        if (type_name := get_type_name(dtype)).startswith("numpy")
    }
)
34 changes: 34 additions & 0 deletions skops/io/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
from dataclasses import dataclass, field
from functools import singledispatch
from types import ModuleType
from typing import Any, Type
from zipfile import ZipFile

Expand Down Expand Up @@ -200,3 +201,36 @@ def get_type_paths(types: Any) -> list[str]:
types = [types]

return [get_type_name(t) if not isinstance(t, str) else t for t in types]


def get_public_type_names(module: ModuleType, oftype: Type) -> list[str]:
    """Get the sorted type names of public members of ``module``.

    Collects every attribute reachable via ``dir(module)`` whose class is
    (a subclass of) ``oftype`` and whose fully qualified type name starts
    with the root package name of ``module``.

    Parameters
    ----------
    module: ModuleType
        Module under which the public objects are defined.

    oftype: Type
        The type of the objects.

    Returns
    -------
    type_names_list: list of str
        The sorted list of type names, all as strings,
        e.g. ``["numpy.core._multiarray_umath.absolute"]``.
    """
    # Use the *first* component of the dotted module path as the prefix
    # filter. ``rpartition(".")`` returned "" for top-level modules such as
    # ``numpy`` (no dot in the name), which made the ``startswith`` check a
    # no-op there; ``partition`` yields the root package name in both cases
    # ("numpy" -> "numpy", "scipy.special" -> "scipy").
    module_name, _, _ = module.__name__.partition(".")

    return sorted(
        {
            type_name
            for attr in dir(module)
            if issubclass((obj := getattr(module, attr)).__class__, oftype)
            and (type_name := get_type_name(obj)).startswith(module_name)
        }
    )
22 changes: 0 additions & 22 deletions skops/io/tests/test_audit.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,8 @@
from contextlib import suppress
from zipfile import ZipFile

import numpy as np
import pytest
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

from skops.io import dumps, get_untrusted_types
from skops.io._audit import Node, audit_tree, check_type, get_tree, temp_setattr
Expand Down Expand Up @@ -152,25 +149,6 @@ def __init__(self):
assert not hasattr(temp, "b")


def test_complex_pipeline_untrusted_set():
    # A pipeline whose FunctionTransformer references plain numpy functions
    # (np.sqrt / np.square) should report exactly those two function
    # references as untrusted types when audited.
    # fmt: off
    clf = Pipeline([
        ("features", FeatureUnion([
            ("scaler", StandardScaler()),
            ("sqrt", FunctionTransformer(
                func=np.sqrt,
                inverse_func=np.square,
            )),
        ])),
        ("clf", LogisticRegression(random_state=0, solver="liblinear")),
    ])
    # fmt: on

    # Only the last component of each dotted type name is compared here.
    untrusted = get_untrusted_types(data=dumps(clf))
    type_names = [x.split(".")[-1] for x in untrusted]
    assert type_names == ["sqrt", "square"]


def test_format_object_node():
estimator = LogisticRegression(random_state=0, solver="liblinear")
state = get_state(estimator, SaveContext(None))
Expand Down
4 changes: 0 additions & 4 deletions skops/io/tests/test_external.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,6 @@ def trusted(self):
"lightgbm.sklearn.LGBMClassifier",
"lightgbm.sklearn.LGBMRegressor",
"lightgbm.sklearn.LGBMRanker",
"numpy.int32",
"numpy.int64",
"sklearn.preprocessing._label.LabelEncoder",
]

Expand Down Expand Up @@ -329,8 +327,6 @@ def trusted(self):
# TODO: adjust once more types are trusted by default
return [
"builtins.bytes",
"numpy.float32",
"numpy.float64",
"catboost.core.CatBoostClassifier",
"catboost.core.CatBoostRegressor",
"catboost.core.CatBoostRanker",
Expand Down
32 changes: 28 additions & 4 deletions skops/io/tests/test_persist.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,13 @@
from skops.io import dump, dumps, get_untrusted_types, load, loads
from skops.io._audit import NODE_TYPE_MAPPING, get_tree
from skops.io._sklearn import UNSUPPORTED_TYPES
from skops.io._trusted_types import SCIPY_UFUNC_TYPE_NAMES, SKLEARN_ESTIMATOR_TYPE_NAMES
from skops.io._trusted_types import (
NUMPY_DTYPE_TYPE_NAMES,
NUMPY_UFUNC_TYPE_NAMES,
PRIMITIVE_TYPE_NAMES,
SCIPY_UFUNC_TYPE_NAMES,
SKLEARN_ESTIMATOR_TYPE_NAMES,
)
from skops.io._utils import LoadContext, SaveContext, _get_state, get_state, gettype
from skops.io.exceptions import UnsupportedTypeException, UntrustedTypesFoundException
from skops.io.tests._utils import assert_method_outputs_equal, assert_params_equal
Expand Down Expand Up @@ -225,11 +231,17 @@ def _tested_estimators(type_filter=None):


def _tested_ufuncs():
    """Yield every scipy and numpy ufunc that is trusted by default."""
    all_names = SCIPY_UFUNC_TYPE_NAMES + NUMPY_UFUNC_TYPE_NAMES
    for qualified_name in all_names:
        # Split "package.module.func" into its module path and function name,
        # then resolve the name back into the actual ufunc object.
        mod_path, _, func_name = qualified_name.rpartition(".")
        yield gettype(module_name=mod_path, cls_or_func=func_name)


def _tested_types():
    """Yield every primitive and numpy dtype class that is trusted by default."""
    for qualified_name in PRIMITIVE_TYPE_NAMES + NUMPY_DTYPE_TYPE_NAMES:
        # Resolve "package.module.TypeName" back into the actual class object.
        mod_path, _, cls_name = qualified_name.rpartition(".")
        yield gettype(module_name=mod_path, cls_or_func=cls_name)


def _unsupported_estimators(type_filter=None):
for name, Estimator in all_estimators(type_filter=type_filter):
if Estimator not in UNSUPPORTED_TYPES:
Expand Down Expand Up @@ -359,15 +371,27 @@ def test_can_persist_fitted(estimator):

assert not any(type_ in SKLEARN_ESTIMATOR_TYPE_NAMES for type_ in untrusted_types)
assert not any(type_ in SCIPY_UFUNC_TYPE_NAMES for type_ in untrusted_types)
assert not any(type_ in NUMPY_UFUNC_TYPE_NAMES for type_ in untrusted_types)
assert not any(type_ in NUMPY_DTYPE_TYPE_NAMES for type_ in untrusted_types)
assert_method_outputs_equal(estimator, loaded, X)


@pytest.mark.parametrize("ufunc", _tested_ufuncs(), ids=SCIPY_UFUNC_TYPE_NAMES)
@pytest.mark.parametrize(
    "ufunc", _tested_ufuncs(), ids=SCIPY_UFUNC_TYPE_NAMES + NUMPY_UFUNC_TYPE_NAMES
)
def test_can_trust_ufuncs(ufunc):
    # Every public scipy and numpy ufunc is trusted by default, so dumping a
    # bare ufunc must yield no untrusted types. (The stale "extend with numpy
    # ufuncs" TODO was removed: the parametrization above already covers them
    # via NUMPY_UFUNC_TYPE_NAMES.)
    dumped = dumps(ufunc)
    untrusted_types = get_untrusted_types(data=dumped)
    assert len(untrusted_types) == 0


@pytest.mark.parametrize(
    "type_", _tested_types(), ids=PRIMITIVE_TYPE_NAMES + NUMPY_DTYPE_TYPE_NAMES
)
def test_can_trust_types(type_):
    # Primitive builtins and numpy dtype classes are trusted by default, so
    # auditing a dump of the bare class must report nothing untrusted.
    untrusted_types = get_untrusted_types(data=dumps(type_))
    assert len(untrusted_types) == 0


@pytest.mark.parametrize(
Expand Down
13 changes: 3 additions & 10 deletions skops/io/tests/test_visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,26 +101,19 @@ def sink(nodes_iter, *args, **kwargs):
nodes_self_unsafe = [node for node in nodes if not node.is_self_safe]
nodes_unsafe = [node for node in nodes if not node.is_safe]

# there are currently 2 unsafe nodes, a numpy int and the custom
# functions. The former might be considered safe in the future, in which
# case this test needs to be changed.
assert len(nodes_self_unsafe) == 2
assert nodes_self_unsafe[0].val == "numpy.int64"
assert nodes_self_unsafe[1].val == "test_visualize.unsafe_function"
assert len(nodes_self_unsafe) == 1
assert nodes_self_unsafe[0].val == "test_visualize.unsafe_function"

# it's not easy to test the number of indirectly unsafe nodes, because
# it will depend on the nesting structure; we can only be sure that it's
# more than 2, and one of them should be the FunctionTransformer
assert len(nodes_unsafe) > 2
assert any("FunctionTransformer" in node.val for node in nodes_unsafe)

@pytest.mark.parametrize(
"trusted", [True, ["numpy.int64", "test_visualize.unsafe_function"]]
)
@pytest.mark.parametrize("trusted", [True, ["test_visualize.unsafe_function"]])
def test_all_nodes_trusted(self, pipeline, trusted, capsys):
# The pipeline contains untrusted type(s), but if we pass trusted=True,
# it is not considered untrusted anymore
# TODO: remove numpy.int64 from trusted once it's trusted by default
file = sio.dumps(pipeline)
sio.visualize(file, show="untrusted", trusted=trusted)
expected = "root: sklearn.pipeline.Pipeline"
Expand Down