From 79148da8e430082b6f26c8add05882cb4df21764 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 6 Jun 2024 12:19:19 +0200 Subject: [PATCH 1/8] SEC remove trusted=True --- docs/persistence.rst | 36 ++++++++------------- skops/card/_model_card.py | 4 +-- skops/card/tests/test_card.py | 14 +++++--- skops/cli/_update.py | 4 +-- skops/cli/tests/test_convert.py | 4 +-- skops/io/_audit.py | 21 ++++++------ skops/io/_general.py | 30 ++++++++--------- skops/io/_numpy.py | 12 +++---- skops/io/_persist.py | 41 ++++++++---------------- skops/io/_quantile_forest.py | 4 +-- skops/io/_scipy.py | 4 +-- skops/io/_sklearn.py | 10 +++--- skops/io/_visualize.py | 10 +++--- skops/io/old/_general_v0.py | 6 ++-- skops/io/old/_numpy_v0.py | 4 +-- skops/io/tests/test_audit.py | 4 +-- skops/io/tests/test_persist.py | 10 +++--- skops/io/tests/test_persist_old.py | 6 ++-- skops/io/tests/test_visualize.py | 7 ++-- spaces/skops_model_card_creator/start.py | 3 +- 20 files changed, 108 insertions(+), 126 deletions(-) diff --git a/docs/persistence.rst b/docs/persistence.rst index bf74e38a..47bc6b6f 100644 --- a/docs/persistence.rst +++ b/docs/persistence.rst @@ -54,7 +54,7 @@ The code snippet below illustrates how to use :func:`skops.io.dump` and from xgboost.sklearn import XGBClassifier from sklearn.model_selection import GridSearchCV, train_test_split from sklearn.datasets import load_iris - from skops.io import dump, load + from skops.io import dump, load, get_untrusted_types X, y = load_iris(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) @@ -64,26 +64,21 @@ The code snippet below illustrates how to use :func:`skops.io.dump` and 0.9666666666666667 dump(clf, "my-model.skops") # ... - loaded = load("my-model.skops", trusted=True) + unknown_types = get_untrusted_types(file="my-model.skops") + print(unknown_types) + ['sklearn.metrics._scorer._passthrough_scorer', + 'xgboost.core.Booster', 'xgboost.sklearn.XGBClassifier'] + loaded = load("my-model.skops", trusted=unknown_types) print(loaded.score(X_test, y_test)) 0.9666666666666667 # in memory from skops.io import dumps, loads serialized = dumps(clf) - loaded = loads(serialized, trusted=True) - -Note that you should only load files with ``trusted=True`` if you trust the -source. Otherwise you can get a list of untrusted types present in the dump -using :func:`skops.io.get_untrusted_types`: - -.. code:: python + loaded = loads(serialized, trusted=unknown_types) - from skops.io import get_untrusted_types - unknown_types = get_untrusted_types(file="my-model.skops") - print(unknown_types) - ['sklearn.metrics._scorer._passthrough_scorer', - 'xgboost.core.Booster', 'xgboost.sklearn.XGBClassifier'] +Note that the ``get_untrusted_types`` function is used to check which types are +not trusted by default. The user can then decide whether to trust them or not. Note that everything in the above list is safe to load. We already have many types included as trusted by default, and some of the above values might be @@ -92,10 +87,6 @@ added to that list in the future. Once you check the list and you validate that everything in the list is safe, you can load the file with ``trusted=unknown_types``: -.. code:: python - - loaded = load("my-model.skops", trusted=unknown_types) - At the moment, we support the vast majority of sklearn estimators. This includes complex use cases such as :class:`sklearn.pipeline.Pipeline`, :class:`sklearn.model_selection.GridSearchCV`, classes using objects defined in @@ -226,10 +217,11 @@ green to cyan. The ``rich`` docs list the `supported standard colors Note that the visualization feature is intended to help understand the structure of the object, e.g. what attributes are identified as untrusted. It is not a -replacement for a proper security check. In particular, just because an object's -visualization looks innocent does *not* mean you can just call `sio.load(, -trusted=True)` on this object -- only pass the types you really trust to the -``trusted`` argument. +replacement for a proper security check of the included types in the file. In +particular, just because an object's visualization looks innocent does *not* +mean you can just call `sio.load(, +trusted=get_untrusted_types(file=))` on this object -- only pass the +types you really trust to the ``trusted`` argument. Supported libraries ------------------- diff --git a/skops/card/_model_card.py b/skops/card/_model_card.py index 02cd8000..e945c0d6 100644 --- a/skops/card/_model_card.py +++ b/skops/card/_model_card.py @@ -12,7 +12,7 @@ from hashlib import sha256 from pathlib import Path from reprlib import Repr -from typing import Any, Iterator, Literal, Sequence, Union +from typing import Any, Iterator, List, Literal, Optional, Sequence, Union import joblib from huggingface_hub import ModelCardData @@ -489,7 +489,7 @@ def __init__( model_diagram: bool | Literal["auto"] | str = "auto", metadata: ModelCardData | None = None, template: Literal["skops"] | dict[str, str] | None = "skops", - trusted: bool = False, + trusted: Optional[List[str]] = None, ) -> None: self.model = model self.metadata = metadata or ModelCardData() diff --git a/skops/card/tests/test_card.py b/skops/card/tests/test_card.py index 97862d2a..9eb6aea1 100644 --- a/skops/card/tests/test_card.py +++ b/skops/card/tests/test_card.py @@ -26,7 +26,7 @@ TableSection, _load_model, ) -from skops.io import dump, load +from skops.io import dump, get_untrusted_types, load from skops.utils.importutils import import_or_raise @@ -51,10 +51,14 @@ def save_model_to_file(model_instance, suffix): def test_load_model(suffix): model0 = LinearRegression(n_jobs=123) _, save_file = save_model_to_file(model0, suffix) - loaded_model_str = _load_model(save_file, trusted=True) + if suffix == ".skops": + untrusted_types = get_untrusted_types(file=save_file) + else: + untrusted_types = None + loaded_model_str = _load_model(save_file, trusted=untrusted_types) save_file_path = Path(save_file) - loaded_model_path = _load_model(save_file_path, trusted=True) - loaded_model_instance = _load_model(model0, trusted=True) + loaded_model_path = _load_model(save_file_path, trusted=untrusted_types) + loaded_model_instance = _load_model(model0, trusted=untrusted_types) assert loaded_model_str.n_jobs == 123 assert loaded_model_path.n_jobs == 123 @@ -1384,7 +1388,7 @@ def test_with_metadata(self, card: Card, meth, expected_lines): class TestCardModelAttributeIsPath: def path_to_card(self, path): - card = Card(model=path, trusted=True) + card = Card(model=path, trusted=get_untrusted_types(file=path)) return card @pytest.mark.parametrize("meth", [repr, str]) diff --git a/skops/cli/_update.py b/skops/cli/_update.py index c648e37b..e3edecc0 100644 --- a/skops/cli/_update.py +++ b/skops/cli/_update.py @@ -9,7 +9,7 @@ from pathlib import Path from skops.cli._utils import get_log_level -from skops.io import dump, load +from skops.io import dump, get_untrusted_types, load from skops.io._protocol import PROTOCOL @@ -48,7 +48,7 @@ def _update_file( " file." ) - input_model = load(input_file, trusted=True) + input_model = load(input_file, trusted=get_untrusted_types(file=input_file)) with zipfile.ZipFile(input_file, "r") as zip_file: input_file_schema = json.loads(zip_file.read("schema.json")) diff --git a/skops/cli/tests/test_convert.py b/skops/cli/tests/test_convert.py index 6395dd3d..72a76b00 100644 --- a/skops/cli/tests/test_convert.py +++ b/skops/cli/tests/test_convert.py @@ -7,7 +7,7 @@ import pytest from skops.cli import _convert -from skops.io import load +from skops.io import get_untrusted_types, load class MockUnsafeType: @@ -61,7 +61,7 @@ def test_unsafe_case_works_as_expected( ): caplog.set_level(logging.WARNING) _convert._convert_file(pkl_path, skops_path) - persisted_obj = load(skops_path, trusted=True) + persisted_obj = load(skops_path, trusted=get_untrusted_types(file=skops_path)) assert isinstance(persisted_obj, MockUnsafeType) diff --git a/skops/io/_audit.py b/skops/io/_audit.py index 5ae9fc28..a3ae1b26 100644 --- a/skops/io/_audit.py +++ b/skops/io/_audit.py @@ -142,7 +142,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, memoize: bool = True, ) -> None: self.class_name, self.module_name = state["__class__"], state["__module__"] @@ -180,22 +180,19 @@ def _construct(self): @staticmethod def _get_trusted( - trusted: bool | Sequence[Union[str, Type]], default: Sequence[Union[str, Type]] - ) -> Literal[True] | list[str]: + trusted: Optional[Sequence[Union[str, Type]]], + default: Sequence[Union[str, Type]], + ) -> list[str]: """Return a trusted list, or True. - If ``trusted`` is ``False``, we return the ``default``. If a list of + If ``trusted`` is ``None``, we return the ``default``. If a list of types are being passed, those types, as well as default trusted types, are returned. This is a convenience method called by child classes. """ - if trusted is True: - # if trusted is True, we trust the node - return True - - if trusted is False: + if trusted is None: # if trusted is False, we only trust the defaults return get_type_paths(default) @@ -289,12 +286,12 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool = False, + trusted: Optional[List[str]] = None, ): # we pass memoize as False because we don't want to memoize the cached # node. super().__init__(state, load_context, trusted, memoize=False) - self.trusted = True + self.trusted = self._get_trusted(trusted, default=[]) # TODO: deal with case that __id__ is unknown or prevent it from # happening self.cached = load_context.get_object(state.get("__id__")) # type: ignore @@ -313,7 +310,7 @@ def _construct(self): def get_tree( state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str], + trusted: Optional[Sequence[str]], ) -> Node: """Get the tree of nodes. diff --git a/skops/io/_general.py b/skops/io/_general.py index 662252bc..46a3b1c8 100644 --- a/skops/io/_general.py +++ b/skops/io/_general.py @@ -7,7 +7,7 @@ from functools import partial from reprlib import Repr from types import FunctionType, MethodType -from typing import Any, Sequence +from typing import Any, Optional, Sequence import numpy as np @@ -60,7 +60,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, [dict]) @@ -97,7 +97,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, [list]) @@ -129,7 +129,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, [set]) @@ -161,7 +161,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, [tuple]) @@ -208,7 +208,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) # TODO: what do we trust? @@ -252,7 +252,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) # TODO: should we trust anything? @@ -293,7 +293,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) # TODO: what do we trust? @@ -327,7 +327,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, [slice]) @@ -390,7 +390,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) @@ -449,7 +449,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.children = { @@ -474,7 +474,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.content = state["content"] @@ -527,7 +527,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, [bytes]) @@ -548,7 +548,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, [bytearray]) @@ -578,7 +578,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, []) diff --git a/skops/io/_numpy.py b/skops/io/_numpy.py index fb0dd80d..f3887ba6 100644 --- a/skops/io/_numpy.py +++ b/skops/io/_numpy.py @@ -1,7 +1,7 @@ from __future__ import annotations import io -from typing import Any, Sequence +from typing import Any, Optional, Sequence import numpy as np @@ -61,7 +61,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.type = state["type"] @@ -128,7 +128,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, [np.ma.MaskedArray]) @@ -159,7 +159,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) # TODO @@ -190,7 +190,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.children = { @@ -231,7 +231,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.children = { diff --git a/skops/io/_persist.py b/skops/io/_persist.py index d69b6123..bdc185d9 100644 --- a/skops/io/_persist.py +++ b/skops/io/_persist.py @@ -4,7 +4,7 @@ import io import json from pathlib import Path -from typing import Any, BinaryIO, Sequence +from typing import Any, BinaryIO, Optional, Sequence from zipfile import ZIP_STORED, ZipFile import skops @@ -113,30 +113,21 @@ def dumps( return buffer.getbuffer().tobytes() -def load(file: str | Path, trusted: bool | Sequence[str] = False) -> Any: +def load(file: str | Path, trusted: Optional[Sequence[str]] = None) -> Any: """Load an object saved with the skops persistence format. Skops aims at providing a secure persistence feature that does not rely on :mod:`pickle`, which is inherently insecure. For more information, please visit the :ref:`persistence` documentation. - .. warning:: - - This feature is heavily under development, which means the API is - unstable and there might be security issues at the moment. Therefore, - use caution when loading files from sources you don't trust. - Parameters ---------- file: str or pathlib.Path The file name of the object to be loaded. - trusted: bool, or list of str, default=False - If ``True``, the object will be loaded without any security checks. If - ``False``, the object will be loaded only if there are only trusted - objects in the dumped file. If a list of strings, the object will be - loaded only if there are only trusted objects and objects of types - listed in ``trusted`` in the dumped file. + trusted: list of str, default=None + The object will be loaded only if there are only trusted objects and + objects of types listed in ``trusted`` in the dumped file. Returns ------- @@ -144,6 +135,8 @@ def load(file: str | Path, trusted: bool | Sequence[str] = False) -> Any: The loaded object. """ + if trusted and not isinstance(trusted, list): + raise TypeError("trusted must be a list of strings") with ZipFile(file, "r") as input_zip: schema = json.loads(input_zip.read("schema.json")) load_context = LoadContext(src=input_zip, protocol=schema["protocol"]) @@ -154,27 +147,18 @@ def load(file: str | Path, trusted: bool | Sequence[str] = False) -> Any: return instance -def loads(data: bytes, trusted: bool | Sequence[str] = False) -> Any: +def loads(data: bytes, trusted: Optional[Sequence[str]] = None) -> Any: """Load an object saved with the skops persistence format from a bytes object. - .. warning:: - - This feature is heavily under development, which means the API is - unstable and there might be security issues at the moment. Therefore, - use caution when loading files from sources you don't trust. - Parameters ---------- data: bytes The dumped data to be loaded in bytes format. trusted: bool, or list of str, default=False - If ``True``, the object will be loaded without any security checks. If - ``False``, the object will be loaded only if there are only trusted - objects in the dumped file. If a list of strings, the object will be - loaded only if there are only trusted objects and objects of types - listed in ``trusted`` in the dumped file. + The object will be loaded only if there are only trusted objects and + objects of types listed in ``trusted`` in the dumped file. Returns ------- @@ -184,6 +168,9 @@ def loads(data: bytes, trusted: bool | Sequence[str] = False) -> Any: if isinstance(data, str): raise TypeError("Can't load skops format from string, pass bytes") + if trusted and not isinstance(trusted, list): + raise TypeError("trusted must be a list of strings") + with ZipFile(io.BytesIO(data), "r") as zip_file: schema = json.loads(zip_file.read("schema.json")) load_context = LoadContext(src=zip_file, protocol=schema["protocol"]) @@ -231,7 +218,7 @@ def get_untrusted_types( with ZipFile(content, "r") as zip_file: schema = json.loads(zip_file.read("schema.json")) load_context = LoadContext(src=zip_file, protocol=schema["protocol"]) - tree = get_tree(schema, load_context=load_context, trusted=False) + tree = get_tree(schema, load_context=load_context, trusted=None) untrusted_types = tree.get_unsafe_set() return sorted(untrusted_types) diff --git a/skops/io/_quantile_forest.py b/skops/io/_quantile_forest.py index a9b14d47..dbe745a7 100644 --- a/skops/io/_quantile_forest.py +++ b/skops/io/_quantile_forest.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Sequence +from typing import Any, Optional, Sequence from ._protocol import PROTOCOL from ._sklearn import ReduceNode, reduce_get_state @@ -26,7 +26,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: if QuantileForest is None: raise ImportError( diff --git a/skops/io/_scipy.py b/skops/io/_scipy.py index adc32a6d..e482a1af 100644 --- a/skops/io/_scipy.py +++ b/skops/io/_scipy.py @@ -1,7 +1,7 @@ from __future__ import annotations import io -from typing import Any, Sequence +from typing import Any, Optional, Sequence from scipy.sparse import load_npz, save_npz, spmatrix @@ -38,7 +38,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.type = state["type"] diff --git a/skops/io/_sklearn.py b/skops/io/_sklearn.py index 3f4c8b35..9f6058a1 100644 --- a/skops/io/_sklearn.py +++ b/skops/io/_sklearn.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Sequence, Type +from typing import Any, Optional, Sequence, Type from sklearn.cluster import Birch @@ -98,7 +98,7 @@ def __init__( state: dict[str, Any], load_context: LoadContext, constructor: Type[Any], - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) reduce = state["__reduce__"] @@ -157,7 +157,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: self.trusted = self._get_trusted(trusted, [get_module(Tree) + ".Tree"]) super().__init__(state, load_context, constructor=Tree, trusted=self.trusted) @@ -174,7 +174,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: # TODO: make sure trusted here makes sense and used. self.trusted = self._get_trusted( @@ -215,7 +215,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.trusted = [ diff --git a/skops/io/_visualize.py b/skops/io/_visualize.py index 2c094f05..7c4965d3 100644 --- a/skops/io/_visualize.py +++ b/skops/io/_visualize.py @@ -4,7 +4,7 @@ import json from dataclasses import dataclass from pathlib import Path -from typing import Any, Callable, Iterator, Literal, Sequence +from typing import Any, Callable, Iterator, Literal, Optional, Sequence from zipfile import ZipFile from ._audit import VALID_NODE_CHILD_TYPES, Node, get_tree @@ -290,7 +290,7 @@ def visualize( file: Path | str | bytes, *, show: Literal["all", "untrusted", "trusted"] = "all", - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, sink: Callable[..., None] = pretty_print_tree, **kwargs: Any, ) -> None: @@ -317,10 +317,8 @@ def visualize( Whether to print all nodes, only untrusted nodes, or only trusted nodes. trusted: bool, or list of str, default=False - If ``True``, all nodes will be treated as trusted. If ``False``, only - default types are trusted. If a list of strings, where those strongs - describe the trusted types, these types are trusted on top of the - default trusted types. + The object will be loaded only if there are only trusted objects and + objects of types listed in ``trusted`` in the dumped file. sink: function (default=:func:`~pretty_print_tree`) This function should take at least two arguments, an iterator of diff --git a/skops/io/old/_general_v0.py b/skops/io/old/_general_v0.py index 5580e63e..4078227c 100644 --- a/skops/io/old/_general_v0.py +++ b/skops/io/old/_general_v0.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Sequence +from typing import Any, Optional, Sequence from skops.io._audit import Node from skops.io._trusted_types import SCIPY_UFUNC_TYPE_NAMES @@ -14,12 +14,12 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) # TODO: what do we trust? self.trusted = self._get_trusted(trusted, default=SCIPY_UFUNC_TYPE_NAMES) - self.children = {"content": state["content"]} + self.children: dict = {"content": state["content"]} def _construct(self): return _import_obj( diff --git a/skops/io/old/_numpy_v0.py b/skops/io/old/_numpy_v0.py index 04e258ee..356f67d2 100644 --- a/skops/io/old/_numpy_v0.py +++ b/skops/io/old/_numpy_v0.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Sequence +from typing import Any, Optional, Sequence import numpy as np @@ -15,7 +15,7 @@ def __init__( self, state: dict[str, Any], load_context: LoadContext, - trusted: bool | Sequence[str] = False, + trusted: Optional[Sequence[str]] = None, ) -> None: super().__init__(state, load_context, trusted) self.children = {"bit_generator_state": state["content"]["bit_generator"]} diff --git a/skops/io/tests/test_audit.py b/skops/io/tests/test_audit.py index 756e9988..a5e23e6e 100644 --- a/skops/io/tests/test_audit.py +++ b/skops/io/tests/test_audit.py @@ -49,8 +49,8 @@ def test_audit_tree_untrusted(): ): audit_tree(node) - # there shouldn't be an error with trusted=True - node = DictNode(state, LoadContext(None, -1), trusted=True) + # there shouldn't be an error with trusted=everything + node = DictNode(state, LoadContext(None, -1), trusted=[]) audit_tree(node) untrusted_list = get_untrusted_types(data=dumps(var)) diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py index 25ebf37d..eb34c608 100644 --- a/skops/io/tests/test_persist.py +++ b/skops/io/tests/test_persist.py @@ -851,7 +851,7 @@ def test_dump_to_and_load_from_disk(tmp_path): json.loads(ZipFile(f_name).read("schema.json")) # load and compare the actual estimator - loaded = load(f_name, trusted=True) + loaded = load(f_name, trusted=get_untrusted_types(file=f_name)) assert_params_equal(loaded.__dict__, estimator.__dict__) @@ -878,8 +878,10 @@ def test_disk_and_memory_are_identical(tmp_path): f_name = tmp_path / "estimator.skops" dump(estimator, f_name) - loaded_disk = load(f_name, trusted=True) - loaded_memory = loads(dumps(estimator), trusted=True) + loaded_disk = load(f_name, trusted=get_untrusted_types(file=f_name)) + loaded_memory = loads( + dumps(estimator), trusted=get_untrusted_types(data=dumps(estimator)) + ) assert joblib.hash(loaded_disk) == joblib.hash(loaded_memory) @@ -894,7 +896,7 @@ def test_dump_and_load_with_file_wrapper(tmp_path): with open(f_name, "wb") as f: dump(estimator, f) with open(f_name, "rb") as f: - loaded = load(f, trusted=True) + loaded = load(f, trusted=get_untrusted_types(file=f_name)) assert_params_equal(loaded.__dict__, estimator.__dict__) diff --git a/skops/io/tests/test_persist_old.py b/skops/io/tests/test_persist_old.py index 504b16c0..aafe9c3d 100644 --- a/skops/io/tests/test_persist_old.py +++ b/skops/io/tests/test_persist_old.py @@ -7,7 +7,7 @@ from scipy import special from sklearn.preprocessing import FunctionTransformer -from skops.io import dumps, loads +from skops.io import dumps, get_untrusted_types, loads from skops.io._utils import get_module from skops.io.tests._utils import ( assert_method_outputs_equal, @@ -58,7 +58,7 @@ def old_function_get_state(obj, save_context): old_state=old_function_get_state(func, None), protocol=0, ) - loaded = loads(downgraded, trusted=True) + loaded = loads(downgraded, trusted=get_untrusted_types(data=downgraded)) # sanity check: ensure that the old get_state function was really called assert call_count == 1 @@ -106,7 +106,7 @@ def old_random_generator_get_state(obj, save_context): ) # old loader only worked with trusted=True, see #329 - loaded = loads(downgraded, trusted=True) + loaded = loads(downgraded, trusted=get_untrusted_types(data=downgraded)) # sanity check: ensure that the old get_state function was really called assert call_count == 1 diff --git a/skops/io/tests/test_visualize.py b/skops/io/tests/test_visualize.py index d3825175..b17d71e6 100644 --- a/skops/io/tests/test_visualize.py +++ b/skops/io/tests/test_visualize.py @@ -16,6 +16,7 @@ from sklearn.utils.fixes import parse_version import skops.io as sio +from skops.io import get_untrusted_types class TestVisualizeTree: @@ -114,10 +115,10 @@ def sink(nodes_iter, *args, **kwargs): @pytest.mark.parametrize("trusted", [True, ["test_visualize.unsafe_function"]]) def test_all_nodes_trusted(self, pipeline, trusted, capsys): - # The pipeline contains untrusted type(s), but if we pass trusted=True, - # it is not considered untrusted anymore + # The pipeline contains untrusted type(s), so we have to pass extra + # trusted types file = sio.dumps(pipeline) - sio.visualize(file, show="untrusted", trusted=trusted) + sio.visualize(file, show="untrusted", trusted=get_untrusted_types(data=file)) expected = "root: sklearn.pipeline.Pipeline" stdout, _ = capsys.readouterr() assert stdout.strip() == expected diff --git a/spaces/skops_model_card_creator/start.py b/spaces/skops_model_card_creator/start.py index 523591d9..2e76d05d 100644 --- a/spaces/skops_model_card_creator/start.py +++ b/spaces/skops_model_card_creator/start.py @@ -28,6 +28,7 @@ import skops.io as sio from skops import card, hub_utils +from skops.io import get_untrusted_types tmp_path = Path(mkdtemp(prefix="skops-")) # temporary files description = """Create a Hugging Face model repository for scikit learn models @@ -46,7 +47,7 @@ def load_model() -> None: bytes_data = st.session_state.model_file.getvalue() if st.session_state.model_file.name.endswith("skops"): - model = sio.loads(bytes_data, trusted=True) + model = sio.loads(bytes_data, trusted=get_untrusted_types(data=bytes_data)) else: model = pickle.loads(bytes_data) assert isinstance(model, BaseEstimator), "model must be an sklearn model" From 0b870378af9da32b6f52d389f051831e3f613df0 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 6 Jun 2024 13:43:48 +0200 Subject: [PATCH 2/8] FIX make the rest of the fixes and make sure tests pass --- skops/card/tests/test_card.py | 9 ++++++--- skops/io/_audit.py | 3 --- skops/io/tests/test_audit.py | 11 +++++------ skops/io/tests/test_persist_old.py | 11 +++-------- 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/skops/card/tests/test_card.py b/skops/card/tests/test_card.py index 9eb6aea1..9af02530 100644 --- a/skops/card/tests/test_card.py +++ b/skops/card/tests/test_card.py @@ -1387,8 +1387,11 @@ def test_with_metadata(self, card: Card, meth, expected_lines): class TestCardModelAttributeIsPath: - def path_to_card(self, path): - card = Card(model=path, trusted=get_untrusted_types(file=path)) + def path_to_card(self, path, suffix): + if suffix == ".skops": + card = Card(model=path, trusted=get_untrusted_types(file=path)) + else: + card = Card(model=path) return card @pytest.mark.parametrize("meth", [repr, str]) @@ -1401,7 +1404,7 @@ def test_model_card_repr(self, meth, suffix): model = LinearRegression(fit_intercept=False) file_handle, file_name = save_model_to_file(model, suffix) os.close(file_handle) - card_from_path = self.path_to_card(file_name) + card_from_path = self.path_to_card(file_name, suffix=suffix) result0 = meth(card_from_path) expected = "Card(\n model=LinearRegression(fit_intercept=False)," diff --git a/skops/io/_audit.py b/skops/io/_audit.py index a3ae1b26..f14dab56 100644 --- a/skops/io/_audit.py +++ b/skops/io/_audit.py @@ -59,9 +59,6 @@ def audit_tree(tree: Node) -> None: UntrustedTypesFoundException If the tree contains an untrusted type. """ - if tree.trusted is True: - return - unsafe = tree.get_unsafe_set() if unsafe: raise UntrustedTypesFoundException(unsafe) diff --git a/skops/io/tests/test_audit.py b/skops/io/tests/test_audit.py index a5e23e6e..4e7289d4 100644 --- a/skops/io/tests/test_audit.py +++ b/skops/io/tests/test_audit.py @@ -40,7 +40,7 @@ def test_audit_tree_untrusted(): state = dict_get_state(var, SaveContext(None, 0, {})) load_context = LoadContext(None, -1) - node = DictNode(state, load_context, trusted=False) + node = DictNode(state, load_context, trusted=None) with pytest.raises( TypeError, match=re.escape( @@ -50,7 +50,7 @@ def test_audit_tree_untrusted(): audit_tree(node) # there shouldn't be an error with trusted=everything - node = DictNode(state, LoadContext(None, -1), trusted=[]) + node = DictNode(state, LoadContext(None, -1), trusted=["test_audit.CustomType"]) audit_tree(node) untrusted_list = get_untrusted_types(data=dumps(var)) @@ -65,18 +65,17 @@ def test_audit_tree_defaults(): # test that the default types are trusted var = {"a": 1, 2: "b"} state = dict_get_state(var, SaveContext(None, 0, {})) - node = DictNode(state, LoadContext(None, -1), trusted=False) + node = DictNode(state, LoadContext(None, -1), trusted=None) audit_tree(node) @pytest.mark.parametrize( "trusted, defaults, expected", [ - (True, None, True), - (False, int, ["builtins.int"]), + (None, int, ["builtins.int"]), ([int], None, ["builtins.int"]), ], - ids=["trusted", "untrusted", "untrusted_list"], + ids=["untrusted", "untrusted_list"], ) def test_Node_get_trusted(trusted, defaults, expected): assert Node._get_trusted(trusted, defaults) == expected diff --git a/skops/io/tests/test_persist_old.py b/skops/io/tests/test_persist_old.py index aafe9c3d..1dcff103 100644 --- a/skops/io/tests/test_persist_old.py +++ b/skops/io/tests/test_persist_old.py @@ -106,11 +106,6 @@ def old_random_generator_get_state(obj, save_context): ) # old loader only worked with trusted=True, see #329 - loaded = loads(downgraded, trusted=get_untrusted_types(data=downgraded)) - - # sanity check: ensure that the old get_state function was really called - assert call_count == 1 - - rand_floats_expected = rng.random(100) - rand_floats_loaded = loaded.random(100) - np.testing.assert_equal(rand_floats_loaded, rand_floats_expected) + # update: we have removed trusted=True, so this doesn't work anymore. + with pytest.raises(AttributeError): + loads(downgraded, trusted=[]) From 2c3d7094260f473fc0c4fd67bade21563e1d3386 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 6 Jun 2024 13:52:51 +0200 Subject: [PATCH 3/8] TST add test for trusted=True --- skops/io/tests/test_persist.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py index eb34c608..9c0b7a57 100644 --- a/skops/io/tests/test_persist.py +++ b/skops/io/tests/test_persist.py @@ -1019,7 +1019,7 @@ def test_persist_operator_raises_untrusted(op): name, func = op est = FunctionTransformer(func) with pytest.raises(UntrustedTypesFoundException, match=name): - loads(dumps(est), trusted=False) + loads(dumps(est), trusted=None) def dummy_func(X): @@ -1067,3 +1067,14 @@ def test_sparse_matrix(call_has_canonical_format): y = loads(dumped, trusted=untrusted_types) assert_params_equal(x.__dict__, y.__dict__) + + +def test_trusted_bool_raises(tmp_path): + """Make sure trusted=True is no longer accepted.""" + f_name = tmp_path / "file.skops" + dump(10, f_name) + with pytest.raises(TypeError, match="trusted must be a list of strings"): + load(f_name, trusted=True) # type: ignore + + with pytest.raises(TypeError, match="trusted must be a list of strings"): + loads(dumps(10), trusted=True) # type: ignore From 9bff44a8ca83c0b04359aa1b3938627af32be5d5 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 6 Jun 2024 13:56:13 +0200 Subject: [PATCH 4/8] DOC add changelog --- docs/changes.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/changes.rst b/docs/changes.rst index 61bec37e..a36939b7 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -14,6 +14,8 @@ v0.10 - Removes Pythn 3.8 support and adds Python 3.12 Support :pr:`418` by :user:`Thomas Lazarus `. - Removes a shortcut to add `sklearn-intelex` as a not dependency. :pr:`420` by :user:`Thomas Lazarus < lazarust > `. +- ``trusted=True`` is now removed from ``skops.io.load`` and ``skops.io.loads``. + :pr:`XXX` by `Adrin Jalali`_. v0.9 ---- From 4efe41c81c6861c688808fe311e731a3bf57eb77 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 6 Jun 2024 13:59:28 +0200 Subject: [PATCH 5/8] DOC update PR number --- docs/changes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/changes.rst b/docs/changes.rst index a36939b7..4c8bbedb 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -15,7 +15,7 @@ v0.10 - Removes a shortcut to add `sklearn-intelex` as a not dependency. :pr:`420` by :user:`Thomas Lazarus < lazarust > `. - ``trusted=True`` is now removed from ``skops.io.load`` and ``skops.io.loads``. - :pr:`XXX` by `Adrin Jalali`_. + :pr:`422` by `Adrin Jalali`_. v0.9 ---- From b1de09fe4b417de528fcdac7076d185e92ee6d5b Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 6 Jun 2024 14:03:42 +0200 Subject: [PATCH 6/8] ENH make the error message more helpful --- skops/io/_persist.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/skops/io/_persist.py b/skops/io/_persist.py index bdc185d9..3f988201 100644 --- a/skops/io/_persist.py +++ b/skops/io/_persist.py @@ -136,7 +136,14 @@ def load(file: str | Path, trusted: Optional[Sequence[str]] = None) -> Any: """ if trusted and not isinstance(trusted, list): - raise TypeError("trusted must be a list of strings") + raise TypeError( + "trusted must be a list of strings. Before version 0.10 trusted could " + "be a boolean, but this is no longer supported, due to a reported " + "CVE-2024-37065. You can pass the output of `get_untrusted_types` as " + "trusted to load the data. Be sure to review the output of the function " + "before passing it as trusted." + ) + with ZipFile(file, "r") as input_zip: schema = json.loads(input_zip.read("schema.json")) load_context = LoadContext(src=input_zip, protocol=schema["protocol"]) @@ -169,7 +176,13 @@ def loads(data: bytes, trusted: Optional[Sequence[str]] = None) -> Any: raise TypeError("Can't load skops format from string, pass bytes") if trusted and not isinstance(trusted, list): - raise TypeError("trusted must be a list of strings") + raise TypeError( + "trusted must be a list of strings. Before version 0.10 trusted could " + "be a boolean, but this is no longer supported, due to a reported " + "CVE-2024-37065. You can pass the output of `get_untrusted_types` as " + "trusted to load the data. Be sure to review the output of the function " + "before passing it as trusted." + ) with ZipFile(io.BytesIO(data), "r") as zip_file: schema = json.loads(zip_file.read("schema.json")) From ed32c8d0c3f5605e1d830167b1bb798c62a007dc Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 10 Jun 2024 16:11:12 +0200 Subject: [PATCH 7/8] Address Benjamin's comments --- docs/changes.rst | 3 +++ docs/persistence.rst | 3 +++ skops/io/_audit.py | 11 +++-------- skops/io/tests/test_audit.py | 3 +-- skops/io/tests/test_visualize.py | 3 +-- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/docs/changes.rst b/docs/changes.rst index 4c8bbedb..9e040ae8 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -15,6 +15,9 @@ v0.10 - Removes a shortcut to add `sklearn-intelex` as a not dependency. :pr:`420` by :user:`Thomas Lazarus < lazarust > `. - ``trusted=True`` is now removed from ``skops.io.load`` and ``skops.io.loads``. + This is to further encourage users to inspect the input data before loading + it. :func:`skops.io.get_untrusted_types` can be used to get the untrusted types + present in the input. :pr:`422` by `Adrin Jalali`_. v0.9 diff --git a/docs/persistence.rst b/docs/persistence.rst index 47bc6b6f..255d019b 100644 --- a/docs/persistence.rst +++ b/docs/persistence.rst @@ -79,6 +79,9 @@ The code snippet below illustrates how to use :func:`skops.io.dump` and Note that the ``get_untrusted_types`` function is used to check which types are not trusted by default. The user can then decide whether to trust them or not. +In previous before version 0.10, users could pass ``trusted=True`` to skip the +audit phase, which is now removed to encourage users to validate the input +before loading. Note that everything in the above list is safe to load. We already have many types included as trusted by default, and some of the above values might be diff --git a/skops/io/_audit.py b/skops/io/_audit.py index f14dab56..8d1352a1 100644 --- a/skops/io/_audit.py +++ b/skops/io/_audit.py @@ -2,7 +2,7 @@ import io from contextlib import contextmanager -from typing import Any, Dict, Generator, List, Literal, Optional, Sequence, Type, Union +from typing import Any, Dict, Generator, List, Optional, Sequence, Type, Union from ._protocol import PROTOCOL from ._utils import LoadContext, get_module, get_type_paths @@ -14,9 +14,7 @@ ] -def check_type( - module_name: str, type_name: str, trusted: Literal[True] | Sequence[str] -) -> bool: +def check_type(module_name: str, type_name: str, trusted: Sequence[str]) -> bool: """Check if a type is safe to load. A type is safe to load only if it's present in the trusted list. @@ -38,16 +36,13 @@ def check_type( is_safe : bool True if the type is safe, False otherwise. """ - if trusted is True: - return True return module_name + "." + type_name in trusted def audit_tree(tree: Node) -> None: """Audit a tree of nodes. - A tree is safe if it only contains trusted types. Audit is skipped if - trusted is ``True``. + A tree is safe if it only contains trusted types. Parameters ---------- diff --git a/skops/io/tests/test_audit.py b/skops/io/tests/test_audit.py index 4e7289d4..0e6514c3 100644 --- a/skops/io/tests/test_audit.py +++ b/skops/io/tests/test_audit.py @@ -25,11 +25,10 @@ def __init__(self, value): [ ("sklearn", "Pipeline", ["sklearn.Pipeline"], True), ("sklearn", "Pipeline", ["sklearn.preprocessing.StandardScaler"], False), - ("sklearn", "Pipeline", True, True), ("builtins", "int", ["builtins.int"], True), ("builtins", "int", [], False), ], - ids=["list-True", "list-False", "True", "int-True", "int-False"], + ids=["list-True", "list-False", "int-True", "int-False"], ) def test_check_type(module_name, type_name, trusted, expected): assert check_type(module_name, type_name, trusted) == expected diff --git a/skops/io/tests/test_visualize.py b/skops/io/tests/test_visualize.py index b17d71e6..85ee55dd 100644 --- a/skops/io/tests/test_visualize.py +++ b/skops/io/tests/test_visualize.py @@ -113,8 +113,7 @@ def sink(nodes_iter, *args, **kwargs): assert len(nodes_unsafe) > 2 assert any("FunctionTransformer" in node.val for node in nodes_unsafe) - @pytest.mark.parametrize("trusted", [True, ["test_visualize.unsafe_function"]]) - def test_all_nodes_trusted(self, pipeline, trusted, capsys): + def test_all_nodes_trusted(self, pipeline, capsys): # The pipeline contains untrusted type(s), so we have to pass extra # trusted types file = sio.dumps(pipeline) From 005f8bbf2b6f3c05a00a324afd76c36510aad151 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 13 Jun 2024 12:18:05 +0200 Subject: [PATCH 8/8] CLN fix a typecheck to what we really want --- skops/io/_persist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skops/io/_persist.py b/skops/io/_persist.py index 3f988201..711ec884 100644 --- a/skops/io/_persist.py +++ b/skops/io/_persist.py @@ -135,7 +135,7 @@ def load(file: str | Path, trusted: Optional[Sequence[str]] = None) -> Any: The loaded object. """ - if trusted and not isinstance(trusted, list): + if trusted is True: raise TypeError( "trusted must be a list of strings. Before version 0.10 trusted could " "be a boolean, but this is no longer supported, due to a reported " @@ -175,7 +175,7 @@ def loads(data: bytes, trusted: Optional[Sequence[str]] = None) -> Any: if isinstance(data, str): raise TypeError("Can't load skops format from string, pass bytes") - if trusted and not isinstance(trusted, list): + if trusted is True: raise TypeError( "trusted must be a list of strings. Before version 0.10 trusted could " "be a boolean, but this is no longer supported, due to a reported "