diff --git a/docs/changes.rst b/docs/changes.rst index 61bec37e..9e040ae8 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -14,6 +14,11 @@ v0.10 - Removes Pythn 3.8 support and adds Python 3.12 Support :pr:`418` by :user:`Thomas Lazarus `. - Removes a shortcut to add `sklearn-intelex` as a not dependency. :pr:`420` by :user:`Thomas Lazarus < lazarust > `. +- ``trusted=True`` is now removed from ``skops.io.load`` and ``skops.io.loads``. + This is to further encourage users to inspect the input data before loading + it. :func:`skops.io.get_untrusted_types` can be used to get the untrusted types + present in the input. + :pr:`422` by `Adrin Jalali`_. v0.9 ---- diff --git a/docs/persistence.rst b/docs/persistence.rst index bf74e38a..255d019b 100644 --- a/docs/persistence.rst +++ b/docs/persistence.rst @@ -54,7 +54,7 @@ The code snippet below illustrates how to use :func:`skops.io.dump` and from xgboost.sklearn import XGBClassifier from sklearn.model_selection import GridSearchCV, train_test_split from sklearn.datasets import load_iris - from skops.io import dump, load + from skops.io import dump, load, get_untrusted_types X, y = load_iris(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) @@ -64,26 +64,24 @@ The code snippet below illustrates how to use :func:`skops.io.dump` and 0.9666666666666667 dump(clf, "my-model.skops") # ... - loaded = load("my-model.skops", trusted=True) + unknown_types = get_untrusted_types(file="my-model.skops") + print(unknown_types) + ['sklearn.metrics._scorer._passthrough_scorer', + 'xgboost.core.Booster', 'xgboost.sklearn.XGBClassifier'] + loaded = load("my-model.skops", trusted=unknown_types) print(loaded.score(X_test, y_test)) 0.9666666666666667 # in memory from skops.io import dumps, loads serialized = dumps(clf) - loaded = loads(serialized, trusted=True) - -Note that you should only load files with ``trusted=True`` if you trust the -source. Otherwise you can get a list of untrusted types present in the dump -using :func:`skops.io.get_untrusted_types`: - -.. code:: python + loaded = loads(serialized, trusted=unknown_types) - from skops.io import get_untrusted_types - unknown_types = get_untrusted_types(file="my-model.skops") - print(unknown_types) - ['sklearn.metrics._scorer._passthrough_scorer', - 'xgboost.core.Booster', 'xgboost.sklearn.XGBClassifier'] +Note that the ``get_untrusted_types`` function is used to check which types are +not trusted by default. The user can then decide whether to trust them or not. +In previous before version 0.10, users could pass ``trusted=True`` to skip the +audit phase, which is now removed to encourage users to validate the input +before loading. Note that everything in the above list is safe to load. We already have many types included as trusted by default, and some of the above values might be @@ -92,10 +90,6 @@ added to that list in the future. Once you check the list and you validate that everything in the list is safe, you can load the file with ``trusted=unknown_types``: -.. code:: python - - loaded = load("my-model.skops", trusted=unknown_types) - At the moment, we support the vast majority of sklearn estimators. This includes complex use cases such as :class:`sklearn.pipeline.Pipeline`, :class:`sklearn.model_selection.GridSearchCV`, classes using objects defined in @@ -226,10 +220,11 @@ green to cyan. The ``rich`` docs list the `supported standard colors Note that the visualization feature is intended to help understand the structure of the object, e.g. what attributes are identified as untrusted. It is not a -replacement for a proper security check. In particular, just because an object's -visualization looks innocent does *not* mean you can just call `sio.load(, -trusted=True)` on this object -- only pass the types you really trust to the -``trusted`` argument. +replacement for a proper security check of the included types in the file. In +particular, just because an object's visualization looks innocent does *not* +mean you can just call `sio.load(, +trusted=get_untrusted_types(file=))` on this object -- only pass the +types you really trust to the ``trusted`` argument. Supported libraries ------------------- diff --git a/skops/card/_model_card.py b/skops/card/_model_card.py index 0f7ffaf4..31633985 100644 --- a/skops/card/_model_card.py +++ b/skops/card/_model_card.py @@ -12,7 +12,7 @@ from hashlib import sha256 from pathlib import Path from reprlib import Repr -from typing import Any, Iterator, Literal, Sequence, Union +from typing import Any, Iterator, List, Literal, Optional, Sequence, Union import joblib from huggingface_hub import ModelCardData @@ -488,7 +488,7 @@ def __init__( model_diagram: bool | Literal["auto"] | str = "auto", metadata: ModelCardData | None = None, template: Literal["skops"] | dict[str, str] | None = "skops", - trusted: bool = False, + trusted: Optional[List[str]] = None, ) -> None: self.model = model self.metadata = metadata or ModelCardData() diff --git a/skops/card/tests/test_card.py b/skops/card/tests/test_card.py index 97862d2a..9af02530 100644 --- a/skops/card/tests/test_card.py +++ b/skops/card/tests/test_card.py @@ -26,7 +26,7 @@ TableSection, _load_model, ) -from skops.io import dump, load +from skops.io import dump, get_untrusted_types, load from skops.utils.importutils import import_or_raise @@ -51,10 +51,14 @@ def save_model_to_file(model_instance, suffix): def test_load_model(suffix): model0 = LinearRegression(n_jobs=123) _, save_file = save_model_to_file(model0, suffix) - loaded_model_str = _load_model(save_file, trusted=True) + if suffix == ".skops": + untrusted_types = get_untrusted_types(file=save_file) + else: + untrusted_types = None + loaded_model_str = _load_model(save_file, trusted=untrusted_types) save_file_path = Path(save_file) - loaded_model_path = _load_model(save_file_path, trusted=True) - loaded_model_instance = _load_model(model0, trusted=True) + loaded_model_path = _load_model(save_file_path, trusted=untrusted_types) + loaded_model_instance = _load_model(model0, trusted=untrusted_types) assert loaded_model_str.n_jobs == 123 assert loaded_model_path.n_jobs == 123 @@ -1383,8 +1387,11 @@ def test_with_metadata(self, card: Card, meth, expected_lines): class TestCardModelAttributeIsPath: - def path_to_card(self, path): - card = Card(model=path, trusted=True) + def path_to_card(self, path, suffix): + if suffix == ".skops": + card = Card(model=path, trusted=get_untrusted_types(file=path)) + else: + card = Card(model=path) return card @pytest.mark.parametrize("meth", [repr, str]) @@ -1397,7 +1404,7 @@ def test_model_card_repr(self, meth, suffix): model = LinearRegression(fit_intercept=False) file_handle, file_name = save_model_to_file(model, suffix) os.close(file_handle) - card_from_path = self.path_to_card(file_name) + card_from_path = self.path_to_card(file_name, suffix=suffix) result0 = meth(card_from_path) expected = "Card(\n model=LinearRegression(fit_intercept=False)," diff --git a/skops/cli/_update.py b/skops/cli/_update.py index c648e37b..e3edecc0 100644 --- a/skops/cli/_update.py +++ b/skops/cli/_update.py @@ -9,7 +9,7 @@ from pathlib import Path from skops.cli._utils import get_log_level -from skops.io import dump, load +from skops.io import dump, get_untrusted_types, load from skops.io._protocol import PROTOCOL @@ -48,7 +48,7 @@ def _update_file( " file." ) - input_model = load(input_file, trusted=True) + input_model = load(input_file, trusted=get_untrusted_types(file=input_file)) with zipfile.ZipFile(input_file, "r") as zip_file: input_file_schema = json.loads(zip_file.read("schema.json")) diff --git a/skops/cli/tests/test_convert.py b/skops/cli/tests/test_convert.py index 6395dd3d..72a76b00 100644 --- a/skops/cli/tests/test_convert.py +++ b/skops/cli/tests/test_convert.py @@ -7,7 +7,7 @@ import pytest from skops.cli import _convert -from skops.io import load +from skops.io import get_untrusted_types, load class MockUnsafeType: @@ -61,7 +61,7 @@ def test_unsafe_case_works_as_expected( ): caplog.set_level(logging.WARNING) _convert._convert_file(pkl_path, skops_path) - persisted_obj = load(skops_path, trusted=True) + persisted_obj = load(skops_path, trusted=get_untrusted_types(file=skops_path)) assert isinstance(persisted_obj, MockUnsafeType) diff --git a/skops/io/_audit.py b/skops/io/_audit.py index 5ae9fc28..8d1352a1 100644 --- a/skops/io/_audit.py +++ b/skops/io/_audit.py @@ -2,7 +2,7 @@ import io from contextlib import contextmanager -from typing import Any, Dict, Generator, List, Literal, Optional, Sequence, Type, Union +from typing import Any, Dict, Generator, List, Optional, Sequence, Type, Union from ._protocol import PROTOCOL from ._utils import LoadContext, get_module, get_type_paths @@ -14,9 +14,7 @@ ] -def check_type( - module_name: str, type_name: str, trusted: Literal[True] | Sequence[str] -) -> bool: +def check_type(module_name: str, type_name: str, trusted: Sequence[str]) -> bool: """Check if a type is safe to load. A type is safe to load only if it's present in the trusted list. @@ -38,16 +36,13 @@ def check_type( is_safe : bool True if the type is safe, False otherwise. """ - if trusted is True: - return True return module_name + "." + type_name in trusted def audit_tree(tree: Node) -> None: """Audit a tree of nodes. - A tree is safe if it only contains trusted types. Audit is skipped if - trusted is ``True``. + A tree is safe if it only contains trusted types. Parameters ---------- @@ -59,9 +54,6 @@ def audit_tree(tree: Node) -> None: UntrustedTypesFoundException If the tree contains an untrusted type. """ - if tree.trusted is True: - return - unsafe = tree.get_unsafe_set() if unsafe: raise UntrustedTypesFoundException(unsafe) @@ -142,7 +134,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, memoize: bool = True, ) -> None: self.class_name, self.module_name = state["__class__"], state["__module__"] @@ -180,22 +172,19 @@ def _construct(self): @staticmethod def _get_trusted( - trusted: bool | Sequence[Union[str, Type]], default: Sequence[Union[str, Type]] - ) -> Literal[True] | list[str]: + trusted: Optional[Sequence[Union[str, Type]]], + default: Sequence[Union[str, Type]], + ) -> list[str]: """Return a trusted list, or True. - If ``trusted`` is ``False``, we return the ``default``. If a list of + If ``trusted`` is ``None``, we return the ``default``. If a list of types are being passed, those types, as well as default trusted types, are returned. This is a convenience method called by child classes. """ - if trusted is True: - # if trusted is True, we trust the node - return True - - if trusted is False: + if trusted is None: # if trusted is False, we only trust the defaults return get_type_paths(default) @@ -289,12 +278,12 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool = False, + trusted: Optional[List[str]] = None, ): # we pass memoize as False because we don't want to memoize the cached # node. super().__init__(state, load_context, trusted, memoize=False) - self.trusted = True + self.trusted = self._get_trusted(trusted, default=[]) # TODO: deal with case that __id__ is unknown or prevent it from # happening self.cached = load_context.get_object(state.get("__id__")) # type: ignore @@ -313,7 +302,7 @@ def _construct(self): def get_tree( state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str], + trusted: Optional[Sequence[str]], ) -> Node: """Get the tree of nodes. diff --git a/skops/io/_general.py b/skops/io/_general.py index 662252bc..46a3b1c8 100644 --- a/skops/io/_general.py +++ b/skops/io/_general.py @@ -7,7 +7,7 @@ from functools import partial from reprlib import Repr from types import FunctionType, MethodType -from typing import Any, Sequence +from typing import Any, Optional, Sequence import numpy as np @@ -60,7 +60,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, [dict]) @@ -97,7 +97,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, [list]) @@ -129,7 +129,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, [set]) @@ -161,7 +161,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, [tuple]) @@ -208,7 +208,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) # TODO: what do we trust? @@ -252,7 +252,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) # TODO: should we trust anything? @@ -293,7 +293,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) # TODO: what do we trust? @@ -327,7 +327,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, [slice]) @@ -390,7 +390,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) @@ -449,7 +449,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.children = { @@ -474,7 +474,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.content = state["content"] @@ -527,7 +527,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, [bytes]) @@ -548,7 +548,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, [bytearray]) @@ -578,7 +578,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, []) diff --git a/skops/io/_numpy.py b/skops/io/_numpy.py index fb0dd80d..f3887ba6 100644 --- a/skops/io/_numpy.py +++ b/skops/io/_numpy.py @@ -1,7 +1,7 @@ from __future__ import annotations import io -from typing import Any, Sequence +from typing import Any, Optional, Sequence import numpy as np @@ -61,7 +61,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.type = state["type"] @@ -128,7 +128,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, [np.ma.MaskedArray]) @@ -159,7 +159,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) # TODO @@ -190,7 +190,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.children = { @@ -231,7 +231,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.children = { diff --git a/skops/io/_persist.py b/skops/io/_persist.py index d69b6123..711ec884 100644 --- a/skops/io/_persist.py +++ b/skops/io/_persist.py @@ -4,7 +4,7 @@ import io import json from pathlib import Path -from typing import Any, BinaryIO, Sequence +from typing import Any, BinaryIO, Optional, Sequence from zipfile import ZIP_STORED, ZipFile import skops @@ -113,30 +113,21 @@ def dumps( return buffer.getbuffer().tobytes() -def load(file: str | Path, trusted: bool | Sequence[str] = False) -> Any: +def load(file: str | Path, trusted: Optional[Sequence[str]] = None) -> Any: """Load an object saved with the skops persistence format. Skops aims at providing a secure persistence feature that does not rely on :mod:`pickle`, which is inherently insecure. For more information, please visit the :ref:`persistence` documentation. - .. warning:: - - This feature is heavily under development, which means the API is - unstable and there might be security issues at the moment. Therefore, - use caution when loading files from sources you don't trust. - Parameters ---------- file: str or pathlib.Path The file name of the object to be loaded. - trusted: bool, or list of str, default=False - If ``True``, the object will be loaded without any security checks. If - ``False``, the object will be loaded only if there are only trusted - objects in the dumped file. If a list of strings, the object will be - loaded only if there are only trusted objects and objects of types - listed in ``trusted`` in the dumped file. + trusted: list of str, default=None + The object will be loaded only if there are only trusted objects and + objects of types listed in ``trusted`` in the dumped file. Returns ------- @@ -144,6 +135,15 @@ def load(file: str | Path, trusted: bool | Sequence[str] = False) -> Any: The loaded object. """ + if trusted is True: + raise TypeError( + "trusted must be a list of strings. Before version 0.10 trusted could " + "be a boolean, but this is no longer supported, due to a reported " + "CVE-2024-37065. You can pass the output of `get_untrusted_types` as " + "trusted to load the data. Be sure to review the output of the function " + "before passing it as trusted." + ) + with ZipFile(file, "r") as input_zip: schema = json.loads(input_zip.read("schema.json")) load_context = LoadContext(src=input_zip, protocol=schema["protocol"]) @@ -154,27 +154,18 @@ def load(file: str | Path, trusted: bool | Sequence[str] = False) -> Any: return instance -def loads(data: bytes, trusted: bool | Sequence[str] = False) -> Any: +def loads(data: bytes, trusted: Optional[Sequence[str]] = None) -> Any: """Load an object saved with the skops persistence format from a bytes object. - .. warning:: - - This feature is heavily under development, which means the API is - unstable and there might be security issues at the moment. Therefore, - use caution when loading files from sources you don't trust. - Parameters ---------- data: bytes The dumped data to be loaded in bytes format. trusted: bool, or list of str, default=False - If ``True``, the object will be loaded without any security checks. If - ``False``, the object will be loaded only if there are only trusted - objects in the dumped file. If a list of strings, the object will be - loaded only if there are only trusted objects and objects of types - listed in ``trusted`` in the dumped file. + The object will be loaded only if there are only trusted objects and + objects of types listed in ``trusted`` in the dumped file. Returns ------- @@ -184,6 +175,15 @@ def loads(data: bytes, trusted: bool | Sequence[str] = False) -> Any: if isinstance(data, str): raise TypeError("Can't load skops format from string, pass bytes") + if trusted is True: + raise TypeError( + "trusted must be a list of strings. Before version 0.10 trusted could " + "be a boolean, but this is no longer supported, due to a reported " + "CVE-2024-37065. You can pass the output of `get_untrusted_types` as " + "trusted to load the data. Be sure to review the output of the function " + "before passing it as trusted." + ) + with ZipFile(io.BytesIO(data), "r") as zip_file: schema = json.loads(zip_file.read("schema.json")) load_context = LoadContext(src=zip_file, protocol=schema["protocol"]) @@ -231,7 +231,7 @@ def get_untrusted_types( with ZipFile(content, "r") as zip_file: schema = json.loads(zip_file.read("schema.json")) load_context = LoadContext(src=zip_file, protocol=schema["protocol"]) - tree = get_tree(schema, load_context=load_context, trusted=False) + tree = get_tree(schema, load_context=load_context, trusted=None) untrusted_types = tree.get_unsafe_set() return sorted(untrusted_types) diff --git a/skops/io/_quantile_forest.py b/skops/io/_quantile_forest.py index a9b14d47..dbe745a7 100644 --- a/skops/io/_quantile_forest.py +++ b/skops/io/_quantile_forest.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Sequence +from typing import Any, Optional, Sequence from ._protocol import PROTOCOL from ._sklearn import ReduceNode, reduce_get_state @@ -26,7 +26,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: if QuantileForest is None: raise ImportError( diff --git a/skops/io/_scipy.py b/skops/io/_scipy.py index adc32a6d..e482a1af 100644 --- a/skops/io/_scipy.py +++ b/skops/io/_scipy.py @@ -1,7 +1,7 @@ from __future__ import annotations import io -from typing import Any, Sequence +from typing import Any, Optional, Sequence from scipy.sparse import load_npz, save_npz, spmatrix @@ -38,7 +38,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.type = state["type"] diff --git a/skops/io/_sklearn.py b/skops/io/_sklearn.py index 3f4c8b35..9f6058a1 100644 --- a/skops/io/_sklearn.py +++ b/skops/io/_sklearn.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Sequence, Type +from typing import Any, Optional, Sequence, Type from sklearn.cluster import Birch @@ -98,7 +98,7 @@ def __init__( state: dict[str, Any], load_context: LoadContext, constructor: Type[Any], - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) reduce = state["__reduce__"] @@ -157,7 +157,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: self.trusted = self._get_trusted(trusted, [get_module(Tree) + ".Tree"]) super().__init__(state, load_context, constructor=Tree, trusted=self.trusted) @@ -174,7 +174,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: # TODO: make sure trusted here makes sense and used. self.trusted = self._get_trusted( @@ -215,7 +215,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = [ diff --git a/skops/io/_visualize.py b/skops/io/_visualize.py index 2c094f05..7c4965d3 100644 --- a/skops/io/_visualize.py +++ b/skops/io/_visualize.py @@ -4,7 +4,7 @@ import json from dataclasses import dataclass from pathlib import Path -from typing import Any, Callable, Iterator, Literal, Sequence +from typing import Any, Callable, Iterator, Literal, Optional, Sequence from zipfile import ZipFile from ._audit import VALID_NODE_CHILD_TYPES, Node, get_tree @@ -290,7 +290,7 @@ def visualize( file: Path | str | bytes, *, show: Literal["all", "untrusted", "trusted"] = "all", - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, sink: Callable[..., None] = pretty_print_tree, **kwargs: Any, ) -> None: @@ -317,10 +317,8 @@ def visualize( Whether to print all nodes, only untrusted nodes, or only trusted nodes. trusted: bool, or list of str, default=False - If ``True``, all nodes will be treated as trusted. If ``False``, only - default types are trusted. If a list of strings, where those strongs - describe the trusted types, these types are trusted on top of the - default trusted types. + The object will be loaded only if there are only trusted objects and + objects of types listed in ``trusted`` in the dumped file. sink: function (default=:func:`~pretty_print_tree`) This function should take at least two arguments, an iterator of diff --git a/skops/io/old/_general_v0.py b/skops/io/old/_general_v0.py index 5580e63e..4078227c 100644 --- a/skops/io/old/_general_v0.py +++ b/skops/io/old/_general_v0.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Sequence +from typing import Any, Optional, Sequence from skops.io._audit import Node from skops.io._trusted_types import SCIPY_UFUNC_TYPE_NAMES @@ -14,12 +14,12 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) # TODO: what do we trust? self.trusted = self._get_trusted(trusted, default=SCIPY_UFUNC_TYPE_NAMES) - self.children = {"content": state["content"]} + self.children: dict = {"content": state["content"]} def _construct(self): return _import_obj( diff --git a/skops/io/old/_numpy_v0.py b/skops/io/old/_numpy_v0.py index 04e258ee..356f67d2 100644 --- a/skops/io/old/_numpy_v0.py +++ b/skops/io/old/_numpy_v0.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Sequence +from typing import Any, Optional, Sequence import numpy as np @@ -15,7 +15,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.children = {"bit_generator_state": state["content"]["bit_generator"]} diff --git a/skops/io/tests/test_audit.py b/skops/io/tests/test_audit.py index 756e9988..0e6514c3 100644 --- a/skops/io/tests/test_audit.py +++ b/skops/io/tests/test_audit.py @@ -25,11 +25,10 @@ def __init__(self, value): [ ("sklearn", "Pipeline", ["sklearn.Pipeline"], True), ("sklearn", "Pipeline", ["sklearn.preprocessing.StandardScaler"], False), - ("sklearn", "Pipeline", True, True), ("builtins", "int", ["builtins.int"], True), ("builtins", "int", [], False), ], - ids=["list-True", "list-False", "True", "int-True", "int-False"], + ids=["list-True", "list-False", "int-True", "int-False"], ) def test_check_type(module_name, type_name, trusted, expected): assert check_type(module_name, type_name, trusted) == expected @@ -40,7 +39,7 @@ def test_audit_tree_untrusted(): state = dict_get_state(var, SaveContext(None, 0, {})) load_context = LoadContext(None, -1) - node = DictNode(state, load_context, trusted=False) + node = DictNode(state, load_context, trusted=None) with pytest.raises( TypeError, match=re.escape( @@ -49,8 +48,8 @@ def test_audit_tree_untrusted(): ): audit_tree(node) - # there shouldn't be an error with trusted=True - node = DictNode(state, LoadContext(None, -1), trusted=True) + # there shouldn't be an error with trusted=everything + node = DictNode(state, LoadContext(None, -1), trusted=["test_audit.CustomType"]) audit_tree(node) untrusted_list = get_untrusted_types(data=dumps(var)) @@ -65,18 +64,17 @@ def test_audit_tree_defaults(): # test that the default types are trusted var = {"a": 1, 2: "b"} state = dict_get_state(var, SaveContext(None, 0, {})) - node = DictNode(state, LoadContext(None, -1), trusted=False) + node = DictNode(state, LoadContext(None, -1), trusted=None) audit_tree(node) @pytest.mark.parametrize( "trusted, defaults, expected", [ - (True, None, True), - (False, int, ["builtins.int"]), + (None, int, ["builtins.int"]), ([int], None, ["builtins.int"]), ], - ids=["trusted", "untrusted", "untrusted_list"], + ids=["untrusted", "untrusted_list"], ) def test_Node_get_trusted(trusted, defaults, expected): assert Node._get_trusted(trusted, defaults) == expected diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py index 25ebf37d..9c0b7a57 100644 --- a/skops/io/tests/test_persist.py +++ b/skops/io/tests/test_persist.py @@ -851,7 +851,7 @@ def test_dump_to_and_load_from_disk(tmp_path): json.loads(ZipFile(f_name).read("schema.json")) # load and compare the actual estimator - loaded = load(f_name, trusted=True) + loaded = load(f_name, trusted=get_untrusted_types(file=f_name)) assert_params_equal(loaded.__dict__, estimator.__dict__) @@ -878,8 +878,10 @@ def test_disk_and_memory_are_identical(tmp_path): f_name = tmp_path / "estimator.skops" dump(estimator, f_name) - loaded_disk = load(f_name, trusted=True) - loaded_memory = loads(dumps(estimator), trusted=True) + loaded_disk = load(f_name, trusted=get_untrusted_types(file=f_name)) + loaded_memory = loads( + dumps(estimator), trusted=get_untrusted_types(data=dumps(estimator)) + ) assert joblib.hash(loaded_disk) == joblib.hash(loaded_memory) @@ -894,7 +896,7 @@ def test_dump_and_load_with_file_wrapper(tmp_path): with open(f_name, "wb") as f: dump(estimator, f) with open(f_name, "rb") as f: - loaded = load(f, trusted=True) + loaded = load(f, trusted=get_untrusted_types(file=f_name)) assert_params_equal(loaded.__dict__, estimator.__dict__) @@ -1017,7 +1019,7 @@ def test_persist_operator_raises_untrusted(op): name, func = op est = FunctionTransformer(func) with pytest.raises(UntrustedTypesFoundException, match=name): - loads(dumps(est), trusted=False) + loads(dumps(est), trusted=None) def dummy_func(X): @@ -1065,3 +1067,14 @@ def test_sparse_matrix(call_has_canonical_format): y = loads(dumped, trusted=untrusted_types) assert_params_equal(x.__dict__, y.__dict__) + + +def test_trusted_bool_raises(tmp_path): + """Make sure trusted=True is no longer accepted.""" + f_name = tmp_path / "file.skops" + dump(10, f_name) + with pytest.raises(TypeError, match="trusted must be a list of strings"): + load(f_name, trusted=True) # type: ignore + + with pytest.raises(TypeError, match="trusted must be a list of strings"): + loads(dumps(10), trusted=True) # type: ignore diff --git a/skops/io/tests/test_persist_old.py b/skops/io/tests/test_persist_old.py index 504b16c0..1dcff103 100644 --- a/skops/io/tests/test_persist_old.py +++ b/skops/io/tests/test_persist_old.py @@ -7,7 +7,7 @@ from scipy import special from sklearn.preprocessing import FunctionTransformer -from skops.io import dumps, loads +from skops.io import dumps, get_untrusted_types, loads from skops.io._utils import get_module from skops.io.tests._utils import ( assert_method_outputs_equal, @@ -58,7 +58,7 @@ def old_function_get_state(obj, save_context): old_state=old_function_get_state(func, None), protocol=0, ) - loaded = loads(downgraded, trusted=True) + loaded = loads(downgraded, trusted=get_untrusted_types(data=downgraded)) # sanity check: ensure that the old get_state function was really called assert call_count == 1 @@ -106,11 +106,6 @@ def old_random_generator_get_state(obj, save_context): ) # old loader only worked with trusted=True, see #329 - loaded = loads(downgraded, trusted=True) - - # sanity check: ensure that the old get_state function was really called - assert call_count == 1 - - rand_floats_expected = rng.random(100) - rand_floats_loaded = loaded.random(100) - np.testing.assert_equal(rand_floats_loaded, rand_floats_expected) + # update: we have removed trusted=True, so this doesn't work anymore. + with pytest.raises(AttributeError): + loads(downgraded, trusted=[]) diff --git a/skops/io/tests/test_visualize.py b/skops/io/tests/test_visualize.py index d3825175..85ee55dd 100644 --- a/skops/io/tests/test_visualize.py +++ b/skops/io/tests/test_visualize.py @@ -16,6 +16,7 @@ from sklearn.utils.fixes import parse_version import skops.io as sio +from skops.io import get_untrusted_types class TestVisualizeTree: @@ -112,12 +113,11 @@ def sink(nodes_iter, *args, **kwargs): assert len(nodes_unsafe) > 2 assert any("FunctionTransformer" in node.val for node in nodes_unsafe) - @pytest.mark.parametrize("trusted", [True, ["test_visualize.unsafe_function"]]) - def test_all_nodes_trusted(self, pipeline, trusted, capsys): - # The pipeline contains untrusted type(s), but if we pass trusted=True, - # it is not considered untrusted anymore + def test_all_nodes_trusted(self, pipeline, capsys): + # The pipeline contains untrusted type(s), so we have to pass extra + # trusted types file = sio.dumps(pipeline) - sio.visualize(file, show="untrusted", trusted=trusted) + sio.visualize(file, show="untrusted", trusted=get_untrusted_types(data=file)) expected = "root: sklearn.pipeline.Pipeline" stdout, _ = capsys.readouterr() assert stdout.strip() == expected diff --git a/spaces/skops_model_card_creator/start.py b/spaces/skops_model_card_creator/start.py index 523591d9..2e76d05d 100644 --- a/spaces/skops_model_card_creator/start.py +++ b/spaces/skops_model_card_creator/start.py @@ -28,6 +28,7 @@ import skops.io as sio from skops import card, hub_utils +from skops.io import get_untrusted_types tmp_path = Path(mkdtemp(prefix="skops-")) # temporary files description = """Create a Hugging Face model repository for scikit learn models @@ -46,7 +47,7 @@ def load_model() -> None: bytes_data = st.session_state.model_file.getvalue() if st.session_state.model_file.name.endswith("skops"): - model = sio.loads(bytes_data, trusted=True) + model = sio.loads(bytes_data, trusted=get_untrusted_types(data=bytes_data)) else: model = pickle.loads(bytes_data) assert isinstance(model, BaseEstimator), "model must be an sklearn model"